# ARU - Hackathon - Pre-processing

### Written collaboratively by the team, with Max
### Cleaned by Max

In [7]:
%%time
%matplotlib inline

# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
from collections import Counter

import warnings  # Standard-library warning control

# Suppress every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Lift pandas' display truncation: show all columns and all rows
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Marker that the setup cell ran to completion
print("Done!")

Done!
CPU times: user 484 µs, sys: 277 µs, total: 761 µs
Wall time: 748 µs


In [3]:
# Load the cleaned dataset from disk (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="final_df.csv")

In [4]:
# Show the (rows, columns) dimensions of the loaded dataset
print(tuple(df.shape))

(274702, 14)


In [5]:
# Creating train, validation and test sets (80% / 10% / 10% of the rows).
# Step 1: hold out 20% of the data; the remaining 80% is the training set.
X_train, X_rem, y_train, y_rem = train_test_split(
    df.drop('is_corona_positive', axis=1),  # features: every column except the target
    df['is_corona_positive'],               # target
    test_size=0.2,
    random_state=42,
)
# Step 2: cut the held-out 20% in half -> validation and test.
# This split must be seeded too: without a fixed random_state the
# validation/test partition (and therefore the CSVs written at the end of the
# notebook) would change on every run, even though step 1 is deterministic.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem,
    y_rem,
    test_size=0.5,
    random_state=42,
)
# NOTE(review): the target is imbalanced; consider passing stratify=<target> to
# both splits so each set keeps the same class ratio - confirm with the team.

In [6]:
# Rebalance the training set only (the target variable is imbalanced).
# SMOTEENN combines SMOTE over-sampling with Edited Nearest Neighbours cleaning.
resampler = SMOTEENN(random_state=42)
X_smote_train, y_smote_train = resampler.fit_resample(X_train, y_train)

In [9]:
# Class distribution of the target after resampling
class_counts = Counter(y_smote_train)
print(class_counts)

Counter({0: 194788, 1: 78825})


In [10]:
# Standardise the features.
# The scaler's statistics are learned from the resampled training set only ...
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_smote_train)
# ... and then re-used, unchanged, on the validation and test sets.
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Wrap the scaled feature arrays back into labelled dataframes
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_train_scaled_df = X_train_scaled_df.reset_index(drop=True)
X_valid_scaled_df = pd.DataFrame(data=X_valid_scaled, columns=X_valid.columns)
X_valid_scaled_df = X_valid_scaled_df.reset_index(drop=True)
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, columns=X_test.columns)
X_test_scaled_df = X_test_scaled_df.reset_index(drop=True)

# Single-column target dataframes; the index is reset to 0..n-1 so the targets
# line up positionally with the feature dataframes above
y_smote_train_df = pd.DataFrame(data=y_smote_train, columns=['is_corona_positive'])
y_smote_train_df = y_smote_train_df.reset_index(drop=True)
y_valid_df = pd.DataFrame(data=y_valid, columns=['is_corona_positive'])
y_valid_df = y_valid_df.reset_index(drop=True)
y_test_df = pd.DataFrame(data=y_test, columns=['is_corona_positive'])
y_test_df = y_test_df.reset_index(drop=True)

In [13]:
# Target class distribution for the train, validation and test sets (in that order)
for target_frame in (y_smote_train_df, y_valid_df, y_test_df):
    print(target_frame.value_counts())

is_corona_positive
0                     194788
1                      78825
dtype: int64
is_corona_positive
0                     26013
1                      1457
dtype: int64
is_corona_positive
0                     26051
1                      1420
dtype: int64


In [14]:
# Join each feature dataframe with its target column, side by side,
# to get one dataframe per set
train_df = pd.concat(objs=[X_train_scaled_df, y_smote_train_df], axis="columns")
valid_df = pd.concat(objs=[X_valid_scaled_df, y_valid_df], axis="columns")
test_df = pd.concat(objs=[X_test_scaled_df, y_test_df], axis="columns")

In [54]:
# Persist the three sets as CSV files, without writing the row index
for frame, filename in (
    (train_df, "train_df.csv"),
    (valid_df, "valid_df.csv"),
    (test_df, "test_df.csv"),
):
    frame.to_csv(filename, index=False)