# Constructing data for training 

In [2]:
import pandas as pd

In [3]:
# Load the group feature table, the technique feature table and the
# (group ID, technique ID, target) pair table produced by preprocessing.
g_df, t_df, G_T_df = (
    pd.read_csv (csv_path)
    for csv_path in (
        'data/preprocessed_data/g_df.csv',
        'data/preprocessed_data/t_df.csv',
        'data/preprocessed_data/G_T_df.csv',
    )
)

In [4]:
# Report (rows, columns) for the pair table and the two feature tables.
for loaded_frame in (G_T_df, g_df, t_df):
    print (loaded_frame.shape)

(82552, 3)
(136, 464)
(607, 55)


In [5]:
# Seed passed as random_state to both train_test_split calls and to the final shuffle of the training set.
rand_state = 13

# 1-Splitting into training, cross validation and test sets

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# First hold out 10% of all pairs as the test set, then hold out 10% of the
# remaining pairs as the cross validation set; what is left is the training set.
G_T_remaining, G_T_test = train_test_split (
    G_T_df,
    test_size = 0.1,
    random_state = rand_state,
)
G_T_train, G_T_cv = train_test_split (
    G_T_remaining,
    test_size = 0.1,
    random_state = rand_state,
)

In [8]:
# Report (rows, columns) of the train, cross validation and test splits.
for split_frame in (G_T_train, G_T_cv, G_T_test):
    print (split_frame.shape)

(66866, 3)
(7430, 3)
(8256, 3)


# 2-Resampling training set
- Resample the training set so that the number of 1's equals the number of 0's

In [9]:
# Count rows per target value in the training split to see how imbalanced the classes are.
G_T_train['target'].value_counts()

target
0.0    64393
1.0     2473
Name: count, dtype: int64

In [10]:
from sklearn.utils import resample

In [11]:
# Split the training pairs by class label.
majority_class = G_T_train[G_T_train['target'] == 0]
minority_class = G_T_train[G_T_train['target'] == 1]

# Number of extra minority rows needed so that both classes end up the same size.
num_instances_to_replicate = len(majority_class) - len(minority_class)

# Oversample the minority class by drawing rows with replacement.
# Seeded with rand_state, the same seed used by the splits and the shuffle,
# so every stochastic step of the notebook is controlled from one place.
replicated_instances = resample(
    minority_class,
    replace=True,
    n_samples=num_instances_to_replicate,
    random_state=rand_state,
)

# Stack the majority rows, the original minority rows and the replicated minority rows.
balanced_G_T_train = pd.concat([majority_class, minority_class, replicated_instances])

# Shuffle the rows so the classes are interleaved.
balanced_G_T_train = balanced_G_T_train.sample(frac = 1, random_state= rand_state)


In [12]:
# Re-count rows per target value after oversampling; both classes should now have the same count.
balanced_G_T_train['target'].value_counts()


target
1.0    64393
0.0    64393
Name: count, dtype: int64

In [13]:
# Summarise the columns, dtypes and non-null counts of the balanced training pairs.
balanced_G_T_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 128786 entries, 54731 to 69520
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   group ID      128786 non-null  object 
 1   technique ID  128786 non-null  object 
 2   target        128786 non-null  float64
dtypes: float64(1), object(2)
memory usage: 3.9+ MB


# 3-Constructing Group, Technique, and Target matrices for Training, Cross Validation and Test set
- Constructing one-hot encoded Group and Technique datasets whose rows line up one-to-one with the rows of the target datasets

In [14]:
# Shape of the group feature table that is merged onto the pair tables in the next cell.
g_df.shape

(136, 464)

In [15]:
def _features_for_pairs (features_df, pairs_df, key, other_key):
    """Return one row of features per row of ``pairs_df``, in the same row order.

    Parameters
    ----------
    features_df : pd.DataFrame
        Feature table containing the column ``key`` (``g_df`` or ``t_df``).
    pairs_df : pd.DataFrame
        Pair table with the columns 'group ID', 'technique ID' and 'target'.
    key : str
        Column to join on.
    other_key : str
        The ID column of ``pairs_df`` that is not joined on; it is dropped
        from the result together with 'target'.

    Returns
    -------
    pd.DataFrame
        The columns of ``features_df``, with ``len(pairs_df)`` rows.

    Raises
    ------
    pandas.errors.MergeError
        If ``key`` is not unique in ``features_df``. Duplicate keys would
        multiply rows and break the row-for-row alignment with the targets.
    """
    # A right join keeps the key order of pairs_df; 'one_to_many' asserts that
    # each key appears at most once in features_df (the left frame).
    merged = pd.merge (
        features_df,
        pairs_df,
        on = key,
        how = 'right',
        validate = 'one_to_many',
    )
    return merged.drop (columns = [other_key, 'target'])


G_train = _features_for_pairs (g_df, balanced_G_T_train, 'group ID', 'technique ID')
T_train = _features_for_pairs (t_df, balanced_G_T_train, 'technique ID', 'group ID')

G_cv = _features_for_pairs (g_df, G_T_cv, 'group ID', 'technique ID')
T_cv = _features_for_pairs (t_df, G_T_cv, 'technique ID', 'group ID')

G_test = _features_for_pairs (g_df, G_T_test, 'group ID', 'technique ID')
T_test = _features_for_pairs (t_df, G_T_test, 'technique ID', 'group ID')



In [16]:
# Print the shapes of the group-feature, technique-feature and pair/target
# frames for each split, under one heading per split.
shape_report = {
    "train dataset dimensions" : (G_train, T_train, balanced_G_T_train),
    "cross validation dataset dimensions" : (G_cv, T_cv, G_T_cv),
    "test dataset dimensions" : (G_test, T_test, G_T_test),
}
for heading, split_frames in shape_report.items ():
    print (heading)
    for split_frame in split_frames:
        print (split_frame.shape)


train dataset dimensions
(128786, 464)
(128786, 55)
(128786, 3)
cross validation dataset dimensions
(7430, 464)
(7430, 55)
(7430, 3)
test dataset dimensions
(8256, 464)
(8256, 55)
(8256, 3)


---
# Export

In [17]:
# Frames to export, keyed by the file name (without extension) each is saved under.
dfs = dict (
    G_train = G_train,
    T_train = T_train,
    balanced_G_T_train = balanced_G_T_train,
    G_cv = G_cv,
    T_cv = T_cv,
    G_T_cv = G_T_cv,
    G_test = G_test,
    T_test = T_test,
    G_T_test = G_T_test,
)

In [18]:
import attck_utils
# Hand every frame to the project helper, using its dict key as the file name.
# NOTE(review): presumably writes data/preprocessed_data/<name>.csv — confirm in attck_utils.
for name, frame in dfs.items ():
    attck_utils.save_df_to_csv (path = 'data/preprocessed_data', filename = name, df = frame)