In [38]:
%%time

# load libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import plotly.graph_objects as go
from plotly.subplots import make_subplots


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings  # For handling warnings
warnings.filterwarnings('ignore')  # Ignore all warnings

print("Done!")

Done!
CPU times: user 739 µs, sys: 885 µs, total: 1.62 ms
Wall time: 1.63 ms


In [39]:
# locate data
import os
os.listdir('../aru-hackathon-team1')

['coding_space.ipynb',
 'corona_tested_individuals_ver_006.english.csv',
 'README.md',
 '.git']

In [40]:
# load data
df = pd.read_csv("final_df.csv")
print(df.shape)
df.head(5)

(274702, 14)


Unnamed: 0,days_since_start,cough,fever,sore_throat,shortness_of_breath,head_ache,is_corona_positive,test_indication_Abroad,test_indication_Contact with confirmed,test_indication_Other,age_60_and_above_NaN,age_60_and_above_Yes,gender_NaN,gender_male
0,50,0,0,0,0,0,0,0,0,1,1,0,0,0
1,50,1,0,0,0,0,0,0,0,1,1,0,0,0
2,50,0,1,0,0,0,0,0,0,1,1,0,0,1
3,50,1,0,0,0,0,0,0,0,1,1,0,0,0
4,50,1,0,0,0,0,0,0,0,1,1,0,0,1


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN

In [42]:
# In the first step we will split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(df.drop('is_corona_positive', axis=1), df['is_corona_positive'], test_size=0.2, random_state=42)

# Now since we want the valid and test size to be equal (10% each of overall data). 
# we have to define valid_size=0.5 (that is 50% of remaining data)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

In [43]:
print("X_train shape: ", X_train.shape)
print("X_valid shape: ", X_valid.shape)
print("X_test shape: ", X_test.shape)

print("y_train shape: ", y_train.shape)
print("y_valid shape: ", y_valid.shape)
print("y_test shape: ", y_test.shape)

print(y_train.value_counts())
print(y_valid.value_counts())
print(y_test.value_counts())

X_train shape:  (219761, 13)
X_valid shape:  (27470, 13)
X_test shape:  (27471, 13)
y_train shape:  (219761,)
y_valid shape:  (27470,)
y_test shape:  (27471,)
is_corona_positive
0    207944
1     11817
Name: count, dtype: int64
is_corona_positive
0    26009
1     1461
Name: count, dtype: int64
is_corona_positive
0    26055
1     1416
Name: count, dtype: int64


In [44]:
X_smote_train, y_smote_train = SMOTEENN(random_state=42).fit_resample(X_train, y_train)

In [45]:
print("X_smote_train shape: ", X_smote_train.shape)
print("y_smote_train shape: ", y_smote_train.shape)

X_smote_train shape:  (273613, 13)
y_smote_train shape:  (273613,)


In [46]:
from collections import Counter
print(Counter(y_smote_train))

Counter({0: 194788, 1: 78825})


In [47]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_smote_train)
X_valid_scaled, X_test_scaled = scaler.transform(X_valid), scaler.transform(X_test)

In [48]:
print("X_train_scaled shape: ", X_train_scaled.shape)
print("X_valid_scaled shape: ", X_valid_scaled.shape)
print("X_test_scaled shape: ", X_test_scaled.shape)

X_train_scaled shape:  (273613, 13)
X_valid_scaled shape:  (27470, 13)
X_test_scaled shape:  (27471, 13)


In [49]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns).reset_index(drop=True)
X_valid_scaled_df = pd.DataFrame(X_valid_scaled, columns=X_valid.columns).reset_index(drop=True)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns).reset_index(drop=True)

y_smote_train_df = pd.DataFrame(y_smote_train, columns=['is_corona_positive']).reset_index(drop=True)
y_valid_df = pd.DataFrame(y_valid, columns=['is_corona_positive']).reset_index(drop=True)
y_test_df = pd.DataFrame(y_test, columns=['is_corona_positive']).reset_index(drop=True)

In [50]:
print(X_train_scaled_df.shape)
print(X_valid_scaled_df.shape)
print(X_test_scaled_df.shape)

print(y_smote_train_df.shape)
print(y_valid_df.shape)
print(y_test_df.shape)

(273613, 13)
(27470, 13)
(27471, 13)
(273613, 1)
(27470, 1)
(27471, 1)


In [51]:
print(y_smote_train_df.value_counts())
print(y_valid_df.value_counts())
print(y_test_df.value_counts())

is_corona_positive
0                     194788
1                      78825
Name: count, dtype: int64
is_corona_positive
0                     26009
1                      1461
Name: count, dtype: int64
is_corona_positive
0                     26055
1                      1416
Name: count, dtype: int64


In [52]:
train_df = pd.concat([X_train_scaled_df, y_smote_train_df], axis=1)
valid_df = pd.concat([X_valid_scaled_df, y_valid_df], axis=1)
test_df = pd.concat([X_test_scaled_df, y_test_df], axis=1)

In [53]:
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)

(273613, 14)
(27470, 14)
(27471, 14)


In [54]:
train_df.to_csv("train_df.csv", index=False)
valid_df.to_csv("valid_df.csv", index=False)
test_df.to_csv("test_df.csv", index=False)