# Preprocessing of Data (train.csv)
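This notebook loads the raw comment data (`input/original_data.csv`), collapses the six toxicity labels into a single binary `cyberbullying` label, and writes the TF-IDF features together with that label to `output/tfidf_dataset.csv`, which will be fed to the models.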

## Imports

In [None]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocessing

In [43]:
# ---- Configuration ---------------------------------------------------------
# NOTE(review): only CSV_IN is referenced by the cells visible in this
# notebook; the remaining constants are kept in case other code relies on them.
CSV_IN = "input/original_data.csv"   # Path of the raw input CSV
CSV_OUT = "output/clean_data.csv"    # Path of the output file
MIN_POS = 100                        # Minimum number of positives per label
N_NEG = 500                          # How many all-zero rows to include
RNG = 42                             # Seed for random operations

# ---- Load ------------------------------------------------------------------
# Read the complete CSV into memory.
df = pd.read_csv(CSV_IN)

# Preview the first rows of the raw frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [44]:
# The six per-category label columns that are merged into one binary target.
cyberbullying_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Build the combined label and discard what is no longer needed in one chain:
#   * 'cyberbullying' is the row-wise maximum of the six flags, i.e. 1 as soon
#     as any of them is 1;
#   * the individual label columns and the 'id' column are then dropped.
# Note: this rebinds `df`, so re-running the cell without reloading the data
# raises a KeyError because the label columns no longer exist.
df = (
    df
    .assign(cyberbullying=df[cyberbullying_cols].max(axis=1))
    .drop(columns=[*cyberbullying_cols, 'id'])
)

# Preview the reduced frame.
df.head(10)

Unnamed: 0,comment_text,cyberbullying
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
5,"""\n\nCongratulations from me as well, use the ...",0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,Your vandalism to the Matt Shirvington article...,0
8,Sorry if the word 'nonsense' was offensive to ...,0
9,alignment on this subject and which are contra...,0


### tf-idf

In [45]:
# Configure the TF-IDF vectorizer.
vectorizer = TfidfVectorizer(
    lowercase=True,        # lowercase the text before it is tokenized
    stop_words='english',  # drop common English words ("the", "a", "is", ...)
    # A token is a run of 3+ word characters containing at least one a-z letter
    # (matched case-insensitively), so purely numeric tokens are skipped.
    token_pattern = u'(?ui)\\b(?=\\w*[a-z])\\w{3,}\\b',
    max_features=5000,     # cap the vocabulary size
)

# Learn the vocabulary from the comments and compute their TF-IDF matrix.
X_tfidf = vectorizer.fit_transform(df['comment_text'])

# Turn the matrix into a DataFrame with one column per vocabulary term.
# toarray() materialises the full dense rows x features array, which is
# memory-hungry for large inputs.
vocab_terms = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vocab_terms)

# Attach the binary target positionally (.values bypasses index alignment).
# NOTE(review): a vocabulary term named 'cyberbullying' would be overwritten here.
tfidf_df['cyberbullying'] = df['cyberbullying'].values

In [46]:
# Show the resultant df
print(tfidf_df.shape)

# Show non-zero TF-IDF features and original comment text for the first 10 rows
# (or for every row when the frame holds fewer than 10, so the preview never
# indexes past the end).
n_preview = min(10, len(tfidf_df))
for idx in range(n_preview):
    original_comment = df['comment_text'].iloc[idx]

    # Fetch the row once and drop the label from that single row. Dropping the
    # column from the whole frame inside the loop would copy the entire dense
    # matrix on every iteration.
    row = tfidf_df.iloc[idx]
    label = row['cyberbullying']
    feature_values = row.drop('cyberbullying')
    non_zero_features = feature_values[feature_values > 0]

    print(f"Row {idx} — cyberbullying: {label}")
    print(f"Comment: {original_comment}")
    print("Non-zero TF-IDF features:")
    print(non_zero_features)
    print('-' * 80)




(159571, 5001)
Row 0 — cyberbullying: 0.0
Comment: Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
Non-zero TF-IDF features:
don            0.125641
edits          0.161602
explanation    0.227547
fac            0.295216
fan            0.253146
gas            0.303179
hardcore       0.334273
just           0.118525
new            0.158802
page           0.109272
remove         0.181422
retired        0.303367
reverted       0.195686
talk           0.110563
template       0.206478
username       0.232906
voted          0.293415
weren          0.275802
york           0.256364
Name: 0, dtype: float64
--------------------------------------------------------------------------------
Row 1 — cyberbullying: 0.0
Comment: D'aww! He matches this background colour I'm seemingly stuck 

In [47]:
# Save final dataset for ML modeling
# The path is defined once so the file written and the message printed cannot
# drift apart.
TFIDF_OUT = "output/tfidf_dataset.csv"

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(TFIDF_OUT), exist_ok=True)

tfidf_df.to_csv(TFIDF_OUT, index=False)
print(f"TF-IDF dataset saved to {TFIDF_OUT}")

TF-IDF dataset saved to output/tfidf_dataset.csv
