In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import sys
sys.path.append('../src')
from preprocessing.text_cleaner import TweetCleaner
# Read the raw tweet dump. Column labels are supplied through `names=`,
# so pandas treats every row of the file as data.
print("Loading dataset...")
raw_csv_path = '../data/raw/training.1600000.processed.noemoticon.csv'
column_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']
df = pd.read_csv(raw_csv_path, encoding='latin-1', names=column_names)

# Recode the raw labels to 0/1 (raw 0 -> 0, raw 4 -> 1); later cells
# treat 1 as the positive class and 0 as the negative class.
label_map = {0: 0, 4: 1}
df['sentiment'] = df['sentiment'].map(label_map)

print(f"Original dataset: {len(df):,} tweets")

Loading dataset...
Original dataset: 1,600,000 tweets


In [11]:
# Draw a class-balanced subsample: half of SAMPLE_SIZE from each label.
SAMPLE_SIZE = 100000
per_class = SAMPLE_SIZE // 2

df_positive = df[df['sentiment'] == 1].sample(n=per_class, random_state=42)
df_negative = df[df['sentiment'] == 0].sample(n=per_class, random_state=42)

# Stack the two halves, shuffle all rows (frac=1) so the classes are
# interleaved, and renumber the index from zero.
df = (
    pd.concat([df_positive, df_negative])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Sampled dataset: {len(df):,} tweets")
print(f"Positive: {(df['sentiment']==1).sum():,}")
print(f"Negative: {(df['sentiment']==0).sum():,}")

Sampled dataset: 100,000 tweets
Positive: 50,000
Negative: 50,000


In [12]:
# Cleaning switches handed to TweetCleaner, gathered in one place so the
# configuration can be read at a glance.
cleaner_options = dict(
    lowercase=True,
    remove_urls=True,
    remove_mentions=True,
    remove_hashtags=False,
    remove_numbers=False,
    remove_emojis=False,
    expand_contractions=True,
)
cleaner = TweetCleaner(**cleaner_options)

# Register tqdm's pandas integration so `progress_apply` shows a progress bar,
# then clean every tweet into a new `text_clean` column.
tqdm.pandas(desc="Cleaning tweets")
df['text_clean'] = df['text'].progress_apply(cleaner.clean)

Cleaning tweets: 100%|██████████████████████████████████████████████████████| 100000/100000 [00:02<00:00, 40642.65it/s]


In [13]:
print("\nFiltering bad samples...")

# Both conditions are evaluated row-wise on the same column, so they are
# combined into a single boolean mask instead of two successive filters.
cleaned_text = df['text_clean']
is_non_empty = cleaned_text.str.strip() != ''   # drop tweets that are blank after cleaning
is_long_enough = cleaned_text.str.len() >= 3    # drop tweets shorter than 3 characters
df = df[is_non_empty & is_long_enough]

print(f"After filtering: {len(df):,} tweets")


Filtering bad samples...
After filtering: 99,772 tweets


In [14]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of the rows, stratified on the label so the class
# ratio is kept in both parts.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['sentiment']
)

# Step 2: cut the held-out part in half, giving validation and test sets,
# again stratified on the label.
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['sentiment']
)

print(f"\nTrain: {len(train_df):,}")
print(f"Val:   {len(val_df):,}")
print(f"Test:  {len(test_df):,}")


Train: 79,817
Val:   9,977
Test:  9,978


In [15]:
# Report the share of positive tweets (sentiment == 1) in each split.
# One loop replaces three copy-pasted print statements; the split name is
# left-aligned to 5 characters so the printed lines match the old layout
# ("Train - ", "Val   - ", "Test  - ").
for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    # Mean of a boolean Series is the fraction of True values.
    positive_pct = split_df['sentiment'].eq(1).mean() * 100
    print(f"{split_name:<5} - Positive: {positive_pct:.1f}%")

Train - Positive: 50.0%
Val   - Positive: 50.0%
Test  - Positive: 50.0%


In [16]:
from pathlib import Path

# Columns written to disk: the original tweet, its cleaned form, and the label.
columns_to_save = ['text', 'text_clean', 'sentiment']

# Create the output directory up front. `DataFrame.to_csv` does not create
# missing parent directories, so on a fresh checkout the first write would
# otherwise fail.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# One loop instead of three copy-pasted writes; the file names are unchanged
# (train.csv, val.csv, test.csv).
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df[columns_to_save].to_csv(processed_dir / f'{split_name}.csv', index=False)

print("Datasets saved to data/processed/")

Datasets saved to data/processed/


In [17]:

# Final recap of the split sizes.
print(f"Train samples: {len(train_df):,}")
print(f"Val samples:   {len(val_df):,}")
print(f"Test samples:  {len(test_df):,}")

# Show the first five training rows side by side (raw vs. cleaned) as a
# quick visual check of the cleaning step.
print("\nSample cleaned tweets:")
for i in range(5):
    row = train_df.iloc[i]
    if row['sentiment'] == 1:
        sentiment_label = 'Positive'
    else:
        sentiment_label = 'Negative'
    print(f"\nOriginal: {row['text']}")
    print(f"Cleaned:  {row['text_clean']}")
    print(f"Sentiment: {sentiment_label}")

Train samples: 79,817
Val samples:   9,977
Test samples:  9,978

Sample cleaned tweets:

Original: @DavidArchie i didn't have the chance to watch your concert  but i get to see all your tv guestings. hope you'll come here again.
Cleaned:  i did not have the chance to watch your concert but i get to see all your tv guestings. hope you will come here again.
Sentiment: Negative

Original: @DaveEHS got 3G on? if so, turn it off 
Cleaned:  got 3g on? if so, turn it off
Sentiment: Positive

Original: I better be off and do something productive. 
Cleaned:  i better be off and do something productive.
Sentiment: Positive

Original: &quot;But pickle jars are just pickle jars And pickles are just pickles Ingredients : water, salt, cucumber, garlic and pickling spices&quot; 
Cleaned:  "but pickle jars are just pickle jars and pickles are just pickles ingredients : water, salt, cucumber, garlic and pickling spices"
Sentiment: Positive

Original: @Queensowntalia if you cry a lot , you'll have your 