# Preprocessing of Data (train.csv)
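This notebook processes the raw comment data (`input/original_data.csv`) into a TF-IDF feature table with a single binary `cyberbullying` label, saved as `output/tfidf_dataset.csv`, which will be fed to the models.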

## Imports

In [83]:
import pandas as pd
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocessing

In [84]:
# ---- Configuration ----------------------------------------------------------
# File locations
CSV_IN = "input/original_data.csv"   # raw labelled comments read below
CSV_OUT = "output/clean_data.csv"    # output file (NOTE(review): not referenced by any later visible cell)

# Sampling parameters
MIN_POS = 100   # minimum positives per label (NOTE(review): not referenced by any later visible cell)
N_NEG = 500     # number of all-zero rows to include (NOTE(review): not referenced by any later visible cell)
RNG = 42        # random seed shared by the resampling and shuffling steps

# ---- Load -------------------------------------------------------------------
# Read the complete input CSV into memory
df = pd.read_csv(CSV_IN)

# Preview the first five rows of the raw frame
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [85]:
# The six per-category label columns that get collapsed into one binary target
cyberbullying_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# In one chain:
#   1. append `cyberbullying` = row-wise max of the label columns
#      (so it is 1 as soon as any single label is 1),
#   2. discard the individual label columns and the unneeded `id` column.
df = (
    df
    .assign(cyberbullying=df[cyberbullying_cols].max(axis=1))
    .drop(columns=[*cyberbullying_cols, 'id'])
)

# Preview the reduced frame
df.head(10)

Unnamed: 0,comment_text,cyberbullying
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
5,"""\n\nCongratulations from me as well, use the ...",0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,Your vandalism to the Matt Shirvington article...,0
8,Sorry if the word 'nonsense' was offensive to ...,0
9,alignment on this subject and which are contra...,0


### tf-idf

In [86]:
# Configure the TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    lowercase=True,        # lowercase the text before tokenizing
    stop_words='english',  # drop common English words ("the", "a", "is", ...) that carry little signal
    token_pattern=u'(?ui)\\b(?=\\w*[a-z])\\w{3,}\\b',  # tokens need >= 3 word characters and at least one letter
    max_features=5000,     # optional cap on the vocabulary size
)

# Learn the vocabulary from the comments and produce the sparse document-term matrix
X_tfidf = vectorizer.fit_transform(df['comment_text'])

# Materialise the matrix as a dense DataFrame (one column per vocabulary term),
# reusing df's index so rows can be traced back to the original comments.
# NOTE: toarray() allocates the full dense matrix, which is large for this corpus.
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    index=df.index,
    columns=vectorizer.get_feature_names_out(),
)

# Attach the binary target as the last column (positional copy of the label values)
tfidf_df['cyberbullying'] = df['cyberbullying'].values

In [87]:
# Report (rows, columns) of the feature table, then the number of rows per class
print(tfidf_df.shape, tfidf_df.value_counts("cyberbullying"), sep="\n")

(159571, 5001)
cyberbullying
0    143346
1     16225
Name: count, dtype: int64


## Undersampling
Our dataset is heavily imbalanced: about 90% of the comments are non-cyberbullying and only the remaining 10% are cyberbullying (143,346 vs. 16,225 rows). We address this by undersampling the majority class so that both classes end up the same size:

In [88]:
# Split the feature table by class
df_minority = tfidf_df.loc[tfidf_df['cyberbullying'] == 1]  # minority: cyberbullying
df_majority = tfidf_df.loc[tfidf_df['cyberbullying'] == 0]  # majority: non-cyberbullying

# Randomly keep only as many majority rows as there are minority rows.
# Sampling is without replacement and seeded, so the draw is reproducible.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=df_minority.shape[0],
    random_state=RNG,
)

# Stack both classes and shuffle the rows in one step; the original index labels
# are kept so each row can still be matched to its source comment.
df_balanced = (
    pd.concat([df_majority_downsampled, df_minority])
    .sample(frac=1, random_state=RNG)
)

# Report the class balance after undersampling
print("Class distribution after undersampling:")
print(df_balanced['cyberbullying'].value_counts())


Class distribution after undersampling:
cyberbullying
0    16225
1    16225
Name: count, dtype: int64


## Verify that tfidf worked

In [89]:
# Sanity check: for the first few rows of the balanced set, print the original
# comment next to the TF-IDF features that are non-zero for it.
#
# The comment text is looked up directly in `df` through the preserved index label,
# so no copy of the (large, dense) balanced frame is needed. `head(10)` also keeps
# the loop safe when the balanced frame has fewer than 10 rows.
for idx, (original_idx, row) in enumerate(df_balanced.head(10).iterrows()):
    original_comment = df.loc[original_idx, 'comment_text']

    # The row Series is float (features are floats), so cast the label back to int
    # to print 0/1 rather than 0.0/1.0.
    label = int(row['cyberbullying'])

    # Everything except the label is a TF-IDF weight; keep only the non-zero ones
    feature_values = row.drop('cyberbullying')
    non_zero_features = feature_values[feature_values > 0]

    print(f"Row {idx} — cyberbullying: {label}")
    print(f"Comment: {original_comment}")
    print("Non-zero TF-IDF features:")
    print(non_zero_features)
    print('-' * 80)



Row 0 — cyberbullying: 0.0
Comment: We want to keep the first sentence of the lead concise.  We can list all the CRs at the end of the lead or in a separate section.  Listing the white CRs makes it appear they are more important.  But Jamaica has almost the same population as NZ while Barbados is the oldest realm and at one time the most important.
Non-zero TF-IDF features:
appear        0.215776
concise       0.302647
end           0.197396
important     0.378566
lead          0.425690
list          0.168436
listing       0.256623
makes         0.190995
oldest        0.312897
population    0.246311
section       0.156832
sentence      0.205740
separate      0.227327
time          0.135615
want          0.145251
white         0.225797
Name: 43242, dtype: float64
--------------------------------------------------------------------------------
Row 1 — cyberbullying: 1.0
Comment: WHO CARES.... GOETHEAN IS A capuchin monkeys anyway
Non-zero TF-IDF features:
cares    1.0
Name: 44937, dtype:

## Output to file

In [90]:
# Save the final dataset for ML modeling.
# The class-balanced, shuffled frame produced by the undersampling step is what gets
# written; saving `tfidf_df` here would silently discard the undersampling and dump
# every row of the unbalanced feature table instead.
# index=False: the original row labels are not written to the file.
df_balanced.to_csv("output/tfidf_dataset.csv", index=False)
print("TF-IDF dataset saved to output/tfidf_dataset.csv")

TF-IDF dataset saved to output/tfidf_dataset.csv
