In [1]:
pip install pandas datasets scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

# Project root that the notebook's relative paths (e.g. the CSV output folder)
# resolve against. Set the FIRECS_PROJECT_DIR environment variable to run the
# notebook on another machine; otherwise the original local path is used.
new_directory = os.environ.get(
    'FIRECS_PROJECT_DIR',
    '/Users/levan/ATENEO MASTERAL/Thesis/Development',
)

# Change the current working directory (raises if the directory does not exist)
os.chdir(new_directory)

In [3]:
import pandas as pd

# Load Dataset

In [4]:
from datasets import load_dataset

# Load the FiReCS corpus through the `datasets` library.
dataset = load_dataset("ccosme/FiReCS")

# Pull out the two splits shipped with the corpus.
train_data, test_data = dataset['train'], dataset['test']

# Materialise each split as a pandas DataFrame
# (dataset1 <- train split, dataset2 <- test split).
dataset1, dataset2 = (pd.DataFrame(split) for split in (train_data, test_data))

In [5]:
# Preview the first five rows of the train split.
dataset1.head(n=5)

Unnamed: 0,review,label
0,im very disappointed kasi di gumana ang dalawa...,0.0
1,10 10 it has minor damage when it was delivere...,1.0
2,super ganda then tama ung size and color sa uu...,2.0
3,we like it po madali naming na receive and goo...,2.0
4,walang picture and video nung product wala kas...,2.0


In [6]:
# Number of reviews per label in the train split.
dataset1['label'].value_counts()

label
1.0    2549
2.0    2410
0.0    2381
Name: count, dtype: int64

In [7]:
# Preview the first five rows of the test split.
dataset2.head(n=5)

Unnamed: 0,review,label
0,okay naman kaso maliit size e adjust nyona lan...,1.0
1,good seller sends out photo before shipping ma...,2.0
2,di cya gano kabilis sa net loading and loading...,1.0
3,green ang order ko black ang dumating pero gre...,0.0
4,no scam po di deliver sa akin yunq parcel ko t...,0.0


In [8]:
# Number of reviews per label in the test split.
dataset2['label'].value_counts()

label
1.0    1087
2.0    1033
0.0    1027
Name: count, dtype: int64

# Data Preprocessing

In [9]:
# Keep only rows that have a label, then store the label as an integer
# instead of a float.
dataset1 = dataset1.dropna(subset=['label']).astype({'label': int})

# Show a sample of the cleaned frame followed by its per-label counts.
print(dataset1.head())
dataset1['label'].value_counts()

                                              review  label
0  im very disappointed kasi di gumana ang dalawa...      0
1  10 10 it has minor damage when it was delivere...      1
2  super ganda then tama ung size and color sa uu...      2
3  we like it po madali naming na receive and goo...      2
4  walang picture and video nung product wala kas...      2


label
1    2549
2    2410
0    2381
Name: count, dtype: int64

In [10]:
# Keep only rows that have a label, then store the label as an integer
# instead of a float.
dataset2 = dataset2.dropna(subset=['label']).astype({'label': int})

# Show a sample of the cleaned frame followed by its per-label counts.
print(dataset2.head())
dataset2['label'].value_counts()

                                              review  label
0  okay naman kaso maliit size e adjust nyona lan...      1
1  good seller sends out photo before shipping ma...      2
2  di cya gano kabilis sa net loading and loading...      1
3  green ang order ko black ang dumating pero gre...      0
4  no scam po di deliver sa akin yunq parcel ko t...      0


label
1    1087
2    1033
0    1027
Name: count, dtype: int64

# Combine Datasets

In [11]:
# Stack the two cleaned frames and renumber the rows 0..n-1.
combined_df = pd.concat([dataset1, dataset2], ignore_index=True)

In [12]:
# Preview the first five rows of the combined data.
combined_df.head(n=5)

Unnamed: 0,review,label
0,im very disappointed kasi di gumana ang dalawa...,0
1,10 10 it has minor damage when it was delivere...,1
2,super ganda then tama ung size and color sa uu...,2
3,we like it po madali naming na receive and goo...,2
4,walang picture and video nung product wala kas...,2


In [13]:
# Number of reviews per label in the combined data.
combined_df['label'].value_counts()

label
1    3636
2    3443
0    3408
Name: count, dtype: int64

# Partition Data into 3 Subsets

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Split the combined data into three non-overlapping partitions, stratified on
# 'label' so each keeps the class balance:
#   data_a (45%), data_b (45%) and data_c (10%).
RANDOM_STATE = 42        # fixed seed so the partitions are reproducible
HOLDOUT_FRACTION = 0.1   # share of the combined data reserved for data_c

# Step 1: Split the combined dataset into temp_data (90%) and data_c (10%)
temp_data, data_c = train_test_split(
    combined_df,
    test_size=HOLDOUT_FRACTION,
    stratify=combined_df['label'],
    random_state=RANDOM_STATE,
)

# Step 2: Halve temp_data into data_a and data_b
# (each is 50% of temp_data, i.e. 45% of the combined dataset)
data_a, data_b = train_test_split(
    temp_data,
    test_size=0.5,
    stratify=temp_data['label'],
    random_state=RANDOM_STATE,
)

# Sanity checks: every row lands in exactly one partition. The uniqueness check
# relies on combined_df having a unique (freshly reset) index.
assert len(data_a) + len(data_b) + len(data_c) == len(combined_df)
assert pd.concat([data_a, data_b, data_c]).index.is_unique

In [16]:
# Number of reviews per label in partition A.
data_a['label'].value_counts()

label
1    1636
2    1550
0    1533
Name: count, dtype: int64

In [17]:
# Number of reviews per label in partition B.
data_b['label'].value_counts()

label
1    1636
2    1549
0    1534
Name: count, dtype: int64

In [18]:
# Number of reviews per label in partition C.
data_c['label'].value_counts()

label
1    364
2    344
0    341
Name: count, dtype: int64

# Save Partitions as CSV

In [19]:
# Output folder, relative to the current working directory.
folder_path = 'Corpus/FiReCS/'

# Create the folder first: to_csv does not create missing directories and
# would otherwise fail on a fresh checkout.
os.makedirs(folder_path, exist_ok=True)

# Save each partition as FiReCS_data_<a|b|c>.csv, without the index column
for suffix, partition in (('a', data_a), ('b', data_b), ('c', data_c)):
    partition.to_csv(f'{folder_path}FiReCS_data_{suffix}.csv', index=False)