# Final Project

## TRAC 2 - Oversampling training data

In this notebook we generate two datasets (one per sub-task) that oversample the minority classes in the training data, to mitigate the class imbalance.

## Package imports

In [1]:
import pandas as pd
import numpy as np

## Load training data

In [2]:
# Read the English TRAC-2 training set into a DataFrame
train_csv_path = '../../../data/release-files/eng/trac2_eng_train.csv'
train_data = pd.read_csv(train_csv_path)

## Oversampling

In [3]:
# Preview the first five rows to check the columns and the label format
train_data.head(n=5)

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C45.451,Next part,NAG,NGEN
1,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN
2,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN
3,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN
4,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN


In [4]:
# Share of each Sub-task A label in the original (imbalanced) training set
train_data.loc[:, 'Sub-task A'].value_counts(normalize=True)

NAG    0.791696
CAG    0.106263
OAG    0.102041
Name: Sub-task A, dtype: float64

In [5]:
# Share of each Sub-task B label in the original (imbalanced) training set
train_data.loc[:, 'Sub-task B'].value_counts(normalize=True)

NGEN    0.927516
GEN     0.072484
Name: Sub-task B, dtype: float64

### Oversampling for Task-A

In [6]:
# Absolute row count per Sub-task A label; the largest count is the oversampling target
train_data.loc[:, 'Sub-task A'].value_counts()

NAG    3375
CAG     453
OAG     435
Name: Sub-task A, dtype: int64

In [7]:
# Split the training set into one DataFrame per Sub-task A label
task_a_labels = train_data['Sub-task A']
train_0 = train_data[task_a_labels == 'NAG']  # majority class
train_1 = train_data[task_a_labels == 'CAG']
train_2 = train_data[task_a_labels == 'OAG']

In [8]:
# Oversample each minority class (CAG, OAG) up to the size of the majority
# class (NAG). Sampling must be done with replacement because the target size
# exceeds the number of rows available in each minority class.
df0 = train_0
# Derive the target size from the data instead of hard-coding it (3375 for the
# current file), so the classes stay balanced if the input data changes.
n_majority = len(df0)
# Fixed random_state keeps the generated dataset reproducible across runs.
df1 = train_1.sample(n_majority, replace=True, random_state=12345)
df2 = train_2.sample(n_majority, replace=True, random_state=12345)

In [9]:
# Stack the majority rows and the resampled minority rows into a single frame.
# Rows keep their original index labels, so labels repeat for resampled rows.
task_a_frames = [df0, df1, df2]
train_data_oversampled_taskA = pd.concat(task_a_frames, axis=0)

In [10]:
# Sanity check: the row count should be 3 classes x the majority-class size
train_data_oversampled_taskA.shape

(10125, 4)

In [11]:
# Confirm that every Sub-task A label now has an equal share of the rows
train_data_oversampled_taskA.loc[:, 'Sub-task A'].value_counts(normalize=True)

NAG    0.333333
CAG    0.333333
OAG    0.333333
Name: Sub-task A, dtype: float64

In [12]:
# Save the balanced Sub-task A training set; the DataFrame index is not written
task_a_output_path = '../../../data/release-files/eng/trac2_eng_train_oversampled_task_A.csv'
train_data_oversampled_taskA.to_csv(task_a_output_path, index=False)

### Oversampling for Task-B

In [13]:
# Absolute row count per Sub-task B label; the largest count is the oversampling target
train_data.loc[:, 'Sub-task B'].value_counts()

NGEN    3954
GEN      309
Name: Sub-task B, dtype: int64

In [14]:
# Split the training set into one DataFrame per Sub-task B label
task_b_labels = train_data['Sub-task B']
train_0 = train_data[task_b_labels == 'NGEN']  # majority class
train_1 = train_data[task_b_labels == 'GEN']

In [15]:
# Oversample the minority class (GEN) up to the size of the majority class
# (NGEN). Sampling must be done with replacement because the target size
# exceeds the number of GEN rows available.
df0 = train_0
# Derive the target size from the data instead of hard-coding it (3954 for the
# current file), so the classes stay balanced if the input data changes.
n_majority = len(df0)
# Fixed random_state keeps the generated dataset reproducible across runs.
df1 = train_1.sample(n_majority, replace=True, random_state=12345)

In [16]:
# Stack the majority rows and the resampled minority rows into a single frame.
# Rows keep their original index labels, so labels repeat for resampled rows.
task_b_frames = [df0, df1]
train_data_oversampled_taskB = pd.concat(task_b_frames, axis=0)

In [17]:
# Sanity check: the row count should be 2 classes x the majority-class size
train_data_oversampled_taskB.shape

(7908, 4)

In [18]:
# Confirm that both Sub-task B labels now have an equal share of the rows
train_data_oversampled_taskB.loc[:, 'Sub-task B'].value_counts(normalize=True)

NGEN    0.5
GEN     0.5
Name: Sub-task B, dtype: float64

In [19]:
# Save the balanced Sub-task B training set; the DataFrame index is not written
task_b_output_path = '../../../data/release-files/eng/trac2_eng_train_oversampled_task_B.csv'
train_data_oversampled_taskB.to_csv(task_b_output_path, index=False)