In [6]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("../data/data.csv")

In [7]:
# Display basic info
print("Dataset shape:", df.shape)
df.info(verbose=True)

Dataset shape: (159571, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [8]:
df["toxic"] = df.iloc[:, 2:8].sum(axis=1) > 0
df.drop(df.columns[3:8], axis=1, inplace=True)
df.head()

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,False
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,False
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",False
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",False
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",False


In [9]:
def split_long_comments(df, max_len=1000, min_len_if_toxic=30):
    new_rows = []

    for idx, row in df.iterrows():
        text = row['comment_text']
        label = row['toxic']  # True or False

        if len(text) <= max_len:
            new_rows.append({'comment_text': text, 'toxic': label})
        else:
            for i in range(0, len(text), max_len):
                chunk = text[i:i + max_len]

                # ✅ Skip tiny chunks *only* if the original comment was toxic
                if label and len(chunk) < min_len_if_toxic:
                    continue

                new_rows.append({'comment_text': chunk, 'toxic': label})

    return pd.DataFrame(new_rows)

In [10]:
long_comments = df['comment_text'].apply(len) > 1000
count = long_comments.sum()
print("Number of comments longer than 1000 characters:", count)
df = split_long_comments(df)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

Number of comments longer than 1000 characters: 13360


In [11]:
print("Duplicate comments:", df.duplicated("comment_text").sum())
df = df.drop_duplicates(subset=["comment_text", "toxic"]).reset_index(drop=True)
print("Duplicate comments:", df.duplicated("comment_text").sum())

Duplicate comments: 505
Duplicate comments: 2


In [12]:
df = df[df['comment_text'].str.strip() != '']

In [13]:
from sklearn.model_selection import train_test_split

# Step 1: train + temp (val + test)
train_df, temp_df = train_test_split(
    df, 
    test_size=0.3, 
    stratify=df['toxic'], 
    random_state=42
)

# Step 2: val + test from temp
val_df, test_df = train_test_split(
    temp_df, 
    test_size=0.5, 
    stratify=temp_df['toxic'], 
    random_state=42
)


In [16]:
train_df.to_csv("../data/data_2_0/train.csv", index=False)
val_df.to_csv("../data/data_2_0/val.csv", index=False)
test_df.to_csv("../data/data_2_0/test.csv", index=False)