# Final Project

## TRAC 2 - Oversampling training data

In this notebook we generate a dataset that oversamples the minority classes in the training data to mitigate class imbalance.

## Package imports

In [1]:
import pandas as pd
import numpy as np

## Load training data

In [2]:
# Read the aggressiveness training dataset (TRAC-2 `eng` release file) into a DataFrame
train_csv_path = '../../../data/release-files/eng/trac2_eng_train.csv'
train_data = pd.read_csv(train_csv_path)

## Oversampling

In [3]:
# Preview the first five rows of the raw training data
train_data.head(n=5)

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C45.451,Next part,NAG,NGEN
1,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN
2,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN
3,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN
4,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN


In [4]:
# Relative frequency of each Sub-task A label in the original training data
train_data.loc[:, 'Sub-task A'].value_counts(normalize=True)

NAG    0.791696
CAG    0.106263
OAG    0.102041
Name: Sub-task A, dtype: float64

In [5]:
# Relative frequency of each Sub-task B label in the original training data
train_data.loc[:, 'Sub-task B'].value_counts(normalize=True)

NGEN    0.927516
GEN     0.072484
Name: Sub-task B, dtype: float64

In [6]:
## Create a column that encodes every combination of the Sub-task A and Sub-task B labels:
## 0: NAG-NGEN, 1: NAG-GEN, 2: CAG-NGEN, 3: CAG-GEN, 4: OAG-NGEN, 5: OAG-GEN

task_a = train_data['Sub-task A']
task_b = train_data['Sub-task B']

# one boolean mask per (Sub-task A, Sub-task B) pair, in the same order as `values`
conditions = [(task_a == 'NAG') & (task_b == 'NGEN'),
              (task_a == 'NAG') & (task_b == 'GEN'),
              (task_a == 'CAG') & (task_b == 'NGEN'),
              (task_a == 'CAG') & (task_b == 'GEN'),
              (task_a == 'OAG') & (task_b == 'NGEN'),
              (task_a == 'OAG') & (task_b == 'GEN')
             ]

# class id assigned to each condition
values = [0, 1, 2, 3, 4, 5]

# np.select uses `default` for rows that match none of the conditions, and its
# built-in default is 0 -- the same id as NAG-NGEN. A missing or unexpected label
# would therefore be silently counted as class 0. Use a sentinel and fail loudly.
UNMATCHED_CLASS = -1
combined = np.select(conditions, values, default=UNMATCHED_CLASS)
unmatched_rows = combined == UNMATCHED_CLASS
if unmatched_rows.any():
    raise ValueError(
        "{} row(s) have a Sub-task A / Sub-task B label pair outside the six "
        "expected combinations".format(int(unmatched_rows.sum()))
    )

# create the new column
train_data['combined'] = combined

In [7]:
# Preview the first five rows again, now including the `combined` class column
train_data.head(n=5)

Unnamed: 0,ID,Text,Sub-task A,Sub-task B,combined
0,C45.451,Next part,NAG,NGEN,0
1,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN,0
2,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN,0
3,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN,0
4,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN,0


In [8]:
# Number of training examples in each combined class (0-5)
train_data.loc[:, 'combined'].value_counts()

0    3241
2     418
4     295
5     140
1     134
3      35
Name: combined, dtype: int64

In [9]:
# Split the training data into one dataframe per combined class id (0-5)
combined_label = train_data['combined']
train_0, train_1, train_2, train_3, train_4, train_5 = (
    train_data[combined_label == class_id] for class_id in range(6)
)

In [10]:
# Oversample the minority classes by sampling with replacement.
# Every minority class (1-5) is drawn OVERSAMPLING_FACTOR times its own size; the
# majority class (0) is kept unchanged. Sample sizes are derived from the class
# frames rather than typed in by hand, so they stay correct if the training data
# changes. With the class counts 134, 418, 35, 295 and 140 this yields the same
# sizes as the earlier literals 804, 2508, 210, 1770 and 840.
OVERSAMPLING_FACTOR = 6
SAMPLING_SEED = 12345  # fixed seed so the oversampled dataset is reproducible

df0 = train_0
df1, df2, df3, df4, df5 = (
    minority.sample(n=len(minority) * OVERSAMPLING_FACTOR,
                    replace=True,
                    random_state=SAMPLING_SEED)
    for minority in (train_1, train_2, train_3, train_4, train_5)
)


# concatenate dataframes (row-wise; the original row labels are kept, so they repeat)
train_data_oversampled = pd.concat([df0, df1, df2, df3, df4, df5], axis=0)

In [11]:
# Sanity check: (rows, columns) of the oversampled dataframe
train_data_oversampled.shape

(9373, 5)

In [12]:
# Relative frequency of each Sub-task A label after oversampling
train_data_oversampled.loc[:, 'Sub-task A'].value_counts(normalize=True)

NAG    0.431559
CAG    0.289982
OAG    0.278459
Name: Sub-task A, dtype: float64

In [13]:
# Relative frequency of each Sub-task B label after oversampling
train_data_oversampled.loc[:, 'Sub-task B'].value_counts(normalize=True)

NGEN    0.802198
GEN     0.197802
Name: Sub-task B, dtype: float64

In [14]:
# Write the oversampled training set to CSV. index=False leaves out the row index;
# every column of the frame, including the helper `combined` column, is written.
oversampled_csv_path = '../../../data/release-files/eng/trac2_eng_train_oversampled.csv'
train_data_oversampled.to_csv(oversampled_csv_path, index=False)