In [1]:
pip install pandas datasets scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

# Directory the notebook works in; the relative data and output paths used in
# later cells are resolved against it. The location can be overridden with the
# THESIS_DEV_DIR environment variable so the notebook is not tied to a single
# machine; when the variable is unset, the original hard-coded path is used.
new_directory = os.environ.get(
    'THESIS_DEV_DIR',
    '/Users/levan/ATENEO MASTERAL/Thesis/Development',
)

# Change the current working directory
os.chdir(new_directory)

In [3]:
import pandas as pd

# Load Datasets

In [4]:
# Read the three CSV files of the hate-speech data (paths are relative to the
# current working directory), all with pandas' 'python' parser engine:
#   dataset1 <- train.csv, dataset2 <- test.csv, dataset3 <- valid.csv
dataset1, dataset2, dataset3 = (
    pd.read_csv(csv_path, engine='python')
    for csv_path in (
        'Filipino-Text-Benchmarks/data/hatespeech/train.csv',
        'Filipino-Text-Benchmarks/data/hatespeech/test.csv',
        'Filipino-Text-Benchmarks/data/hatespeech/valid.csv',
    )
)

In [5]:
# Preview the first rows of dataset1. Only the last expression of a cell is
# rendered automatically, so the preview is passed to display() explicitly;
# a bare `dataset1.head()` on a non-final line is evaluated and then discarded.
display(dataset1.head())

# Number of rows per label value (value_counts() leaves out missing labels by default).
dataset1.label.value_counts()

label
0.0    5340
1.0    4660
Name: count, dtype: int64

In [6]:
# Preview the first rows of dataset2. Only the last expression of a cell is
# rendered automatically, so the preview is passed to display() explicitly;
# a bare `dataset2.head()` on a non-final line is evaluated and then discarded.
display(dataset2.head())

# Number of rows per label value (value_counts() leaves out missing labels by default).
dataset2.label.value_counts()

label
0.0    2225
1.0    2007
Name: count, dtype: int64

In [7]:
# Preview the first rows of dataset3. Only the last expression of a cell is
# rendered automatically, so the preview is passed to display() explicitly;
# a bare `dataset3.head()` on a non-final line is evaluated and then discarded.
display(dataset3.head())

# Number of rows per label value (value_counts() leaves out missing labels by default).
dataset3.label.value_counts()

label
0.0    2299
1.0    1933
Name: count, dtype: int64

# Data Preprocessing

In [8]:
# Clean the label column of dataset1 in one chained expression:
#   1. drop the rows whose 'label' is NaN,
#   2. cast 'label' to int.
# Chaining builds a new frame instead of assigning a column onto the frame
# returned by dropna(), which can raise SettingWithCopyWarning on pandas
# versions without copy-on-write.
dataset1 = (
    dataset1
    .dropna(subset=['label'])
    .astype({'label': int})
)

# Show the first rows and the per-label row counts after cleaning.
print(dataset1.head())
dataset1.label.value_counts()

                                                text  label
1  Inaasahan na ni Vice President Jejomar Binay n...      0
2  Mar Roxas TANG INA TUWID NA DAAN DAW .. EH SYA...      1
3  Salamat sa walang sawang suporta ng mga taga m...      0
4         @rapplerdotcom putangina mo binay TAKBO PA      1
5  Binay with selective amnesia, forgetting about...      0


label
0    5340
1    4660
Name: count, dtype: int64

In [9]:
# Clean the label column of dataset2 in one chained expression:
#   1. drop the rows whose 'label' is NaN,
#   2. cast 'label' to int.
# Chaining builds a new frame instead of assigning a column onto the frame
# returned by dropna(), which can raise SettingWithCopyWarning on pandas
# versions without copy-on-write.
dataset2 = (
    dataset2
    .dropna(subset=['label'])
    .astype({'label': int})
)

# Show the first rows and the per-label row counts after cleaning.
print(dataset2.head())
dataset2.label.value_counts()

                                                text  label
0  Unshaded votes and votes for Mayor Duterte goe...      1
2              #NoMoreChance https://t.co/msaaUGv0bS      1
3  @itsmanj well there's other good choices like ...      0
4  Nognog. Pandak. Laki sa hirap. Pero corrupt. Y...      1
5  Ex-Binay aide turns tables on Mercado | https:...      0


label
0    2225
1    2007
Name: count, dtype: int64

In [10]:
# Clean the label column of dataset3 in one chained expression:
#   1. drop the rows whose 'label' is NaN,
#   2. cast 'label' to int.
# Chaining builds a new frame instead of assigning a column onto the frame
# returned by dropna(), which can raise SettingWithCopyWarning on pandas
# versions without copy-on-write.
dataset3 = (
    dataset3
    .dropna(subset=['label'])
    .astype({'label': int})
)

# Show the first rows and the per-label row counts after cleaning.
print(dataset3.head())
dataset3.label.value_counts()

                                                text  label
0  Escudero denies betraying Poe after meeting wi...      0
1  Hndi ko makita yung sa one more chance saka ka...      1
2  Mar Roxas is now addressing the crowd gathered...      0
3  @ImYourBaeMax perfect! Para makaharap ni Duter...      0
4  #OnlyBinayPriority4Ps Wag nating hayaan na mal...      0


label
0    2299
1    1933
Name: count, dtype: int64

# Combine Datasets

In [11]:
# Stack dataset1, dataset2 and dataset3 (in that order) into a single frame
# and renumber the rows 0..n-1, discarding the per-file indices.
combined_df = pd.concat([dataset1, dataset2, dataset3], ignore_index=True)

In [12]:
# Preview the first five rows of the combined frame.
combined_df.head(n=5)

Unnamed: 0,text,label
0,Inaasahan na ni Vice President Jejomar Binay n...,0
1,Mar Roxas TANG INA TUWID NA DAAN DAW .. EH SYA...,1
2,Salamat sa walang sawang suporta ng mga taga m...,0
3,@rapplerdotcom putangina mo binay TAKBO PA,1
4,"Binay with selective amnesia, forgetting about...",0


In [13]:
# Number of rows per label value in the combined frame.
combined_df['label'].value_counts()

label
0    9864
1    8600
Name: count, dtype: int64

# Partition Data into 3 Subsets

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Partition combined_df into three subsets (data_a, data_b, data_c), stratifying
# on 'label' so each subset keeps the class balance of the combined data.

# Step 1: hold out 10% of the rows as data_c; the remaining 90% become temp_data.
temp_data, data_c = train_test_split(
    combined_df,
    test_size=0.1,
    random_state=42,
    stratify=combined_df['label'],
)

# Step 2: split temp_data in half, so data_a and data_b each receive about
# 45% of the original rows.
data_a, data_b = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data['label'],
)

In [16]:
# Number of rows per label value in data_a.
data_a['label'].value_counts()

label
0    4438
1    3870
Name: count, dtype: int64

In [17]:
# Number of rows per label value in data_b.
data_b['label'].value_counts()

label
0    4439
1    3870
Name: count, dtype: int64

In [18]:
# Number of rows per label value in data_c.
data_c['label'].value_counts()

label
0    987
1    860
Name: count, dtype: int64

# Save Partitions as CSV

In [19]:
# Output folder for the three partitions, relative to the current working
# directory. The trailing slash is required because the file names below are
# appended to it directly.
folder_path = 'Corpus/'

# Create the folder if it is missing; to_csv() does not create parent
# directories and fails when the target folder does not exist.
os.makedirs(folder_path, exist_ok=True)

# Save DataFrames as CSV files (index=False keeps the row index out of the files)
data_a.to_csv(f'{folder_path}data_a.csv', index=False)
data_b.to_csv(f'{folder_path}data_b.csv', index=False)
data_c.to_csv(f'{folder_path}data_c.csv', index=False)