In [2]:
import pandas as pd

print("Processing books data...")

# load books data
books_data = pd.read_csv('/Users/hanakaraibrahimovic/Documents/VU/Text Mining for AI/books_data.csv')
print(f"Total books: {len(books_data)} rows")

# extract books with descriptions 
book_descriptions = books_data['description'].dropna()
print(f" Books with descriptions: {len(book_descriptions)} samples")

# sample taken since there is so many - 5000 to balance with other data
book_sample = book_descriptions.sample(n=min(5000, len(book_descriptions)), random_state=42)
book_labels = ['book'] * len(book_sample)
print(f"Using {len(book_sample)} book samples for training")

# load existing combined data
existing_data = pd.read_csv('/Users/hanakaraibrahimovic/Documents/VU/Text Mining for AI/combined_training_data.csv')
print(f"Existing data: {len(existing_data)} samples")

# combine everything
all_texts = list(existing_data['text']) + list(book_sample)
all_labels = list(existing_data['label']) + book_labels

# final training dataframe
final_training_data = pd.DataFrame({
    'text': all_texts,
    'label': all_labels
})

# clean and shuffle
final_training_data['text'] = final_training_data['text'].astype(str).str.replace('\n', ' ').str.strip()
final_training_data = final_training_data[final_training_data['text'].str.len() > 10]
final_training_data = final_training_data.sample(frac=1, random_state=42).reset_index(drop=True)

# save the complete dataset
final_training_data.to_csv('/Users/hanakaraibrahimovic/Documents/VU/Text Mining for AI/final_training_data.csv', index=False)

print(f"\nSuccess! Final training data created!")
print(f"Total samples: {len(final_training_data)}")
print(f"Label distribution:")
print(final_training_data['label'].value_counts())

# show examples from each category
print(f"\nExamples from each category:")
for label in ['sports', 'movie', 'book']:
    example = final_training_data[final_training_data['label'] == label].iloc[0]
    text_preview = example['text'][:150] + "..." if len(example['text']) > 150 else example['text']
    print(f"\n {label.upper()}: {text_preview}")
    print("-" * 80)

Processing books data...
Total books: 212404 rows
 Books with descriptions: 143962 samples
Using 5000 book samples for training
Existing data: 52410 samples

Success! Final training data created!
Total samples: 57402
Label distribution:
label
movie     50000
book       4992
sports     2410
Name: count, dtype: int64

Examples from each category:

 SPORTS: german growth goes into reverse  germanys economy shrank  in the last three months of  upsetting hopes of a sustained recovery  the figures confounded...
--------------------------------------------------------------------------------

 MOVIE: Christina Raines plays a lovely model in New York who seeks out a new apartment and begins to meet strange neighbors and reveal a secret about the bui...
--------------------------------------------------------------------------------

 BOOK: Each decade new readers discover the characters and curious activities aboard the U.S.S. "Caine in this classic tale of pathos, humor, and scope.
----------