In [None]:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
#Load the training/validation features and targets that were saved as pickles.
#NOTE(review): unpickling can execute arbitrary code - only load files that
#come from a trusted source.
q1features_train_1, q1features_valid_1 = (
    pd.read_pickle(path)
    for path in ("q1features_train_1.pickle", "q1features_valid_1.pickle")
)
q1target_train_1, q1target_valid_1 = (
    pd.read_pickle(path)
    for path in ("q1target_train_1.pickle", "q1target_valid_1.pickle")
)

In [None]:
#Imputing validation set
#Every fill value is taken from the TRAINING set: the mode for category and
#datetime columns, the mean for numeric columns.
for col in list(q1features_valid_1.columns):
  #Turn None placeholders into NaN so that fillna() can see them.
  valid_col = q1features_valid_1[col].replace([None], np.nan)

  if valid_col.dtype in ('category', 'datetime64[ns]'):
    valid_col = valid_col.fillna(q1features_train_1[col].mode()[0])

  #Integer columns are promoted to float64 so that they also reach the
  #mean fill below.
  if valid_col.dtype in ('Int64', 'int64'):
    valid_col = valid_col.astype('float64')

  if valid_col.dtype == 'float64':
    valid_col = valid_col.fillna(q1features_train_1[col].mean())

  q1features_valid_1[col] = valid_col

In [None]:
#Imputing training set
#Category and datetime columns are filled with their own mode, numeric
#columns with their own mean.
for col in list(q1features_train_1.columns):
  #Turn None placeholders into NaN so that fillna() can see them.
  train_col = q1features_train_1[col].replace([None], np.nan)

  if train_col.dtype in ('category', 'datetime64[ns]'):
    train_col = train_col.fillna(train_col.mode()[0])

  #Integer columns are promoted to float64 so that they also reach the
  #mean fill below.
  if train_col.dtype in ('Int64', 'int64'):
    train_col = train_col.astype('float64')

  if train_col.dtype == 'float64':
    train_col = train_col.fillna(train_col.mean())

  q1features_train_1[col] = train_col

In [None]:
#Normalization of validation set
#Min-max scaling that uses the TRAINING set's minimum and range, so both sets
#share one scale. This cell reads the raw training values and therefore has to
#run before the training set itself is normalized.
for col in q1features_valid_1.columns.tolist():
  if q1features_valid_1[col].dtype in ('Int64', 'int64', 'float64'):
    #Series.min()/max() are vectorized and skip NaN, unlike the Python
    #builtins; the minimum is computed once and reused.
    train_min = q1features_train_1[col].min()
    train_range = q1features_train_1[col].max() - train_min
    #A constant training column has zero range; divide by 1 instead so the
    #result is (value - min) rather than inf/NaN.
    if train_range == 0:
      train_range = 1
    q1features_valid_1[col] = (q1features_valid_1[col] - train_min) / train_range

In [None]:
#Normalization of training set
#Min-max scaling of every numeric column using the column's own minimum and range.
for col in q1features_train_1.columns.tolist():
  if q1features_train_1[col].dtype in ('Int64', 'int64', 'float64'):
    #Series.min()/max() are vectorized and skip NaN, unlike the Python
    #builtins; the minimum is computed once and reused.
    col_min = q1features_train_1[col].min()
    col_range = q1features_train_1[col].max() - col_min
    #A constant column has zero range; divide by 1 instead so the column
    #becomes all zeros rather than all NaN (0/0).
    if col_range == 0:
      col_range = 1
    q1features_train_1[col] = (q1features_train_1[col] - col_min) / col_range

In [None]:
#The SMOTE algorithm cannot handle datetime, so INTERVIEWDATE is converted to
#its proleptic Gregorian ordinal (January 1st of year one is day 1) and stored
#as float64.
#The conversion is done on the whole column at once: assigning integers into
#the datetime column one cell at a time via .loc is very slow, mixes
#incompatible dtypes in a single column, and fails when index labels repeat.
for features in (q1features_train_1, q1features_valid_1):
  features['INTERVIEWDATE'] = (
      features['INTERVIEWDATE']
      .map(lambda stamp: stamp.toordinal())
      .astype('float64')
  )

In [None]:
#Removing highly correlated variables

#Step one: keep only the float64 columns of the training features.
numericVars = q1features_train_1.select_dtypes(include=['float64'])

In [None]:
#Variance inflation factor (VIF) of every numeric column, computed against
#all the other numeric columns.
#NOTE(review): no constant column is added to the design matrix before the
#VIF call - confirm that this is intended.
vif_matrix = numericVars.to_numpy()
vif_data = pd.DataFrame({
    "feature": numericVars.columns,
    "VIF": [
        variance_inflation_factor(vif_matrix, position)
        for position in range(numericVars.shape[1])
    ],
})

print(vif_data)

In [None]:
#Attributes were removed one at a time, starting with the highest VIF, until
#every remaining VIF value was below ten. These three columns are the result.
numericVars = numericVars.drop(columns=['_AGE80', 'INTERVIEWDATE', 'HTM4'])

In [None]:
#Remove the high-VIF columns from the training features themselves.
#NOTE(review): the validation features are not reduced in this cell - confirm
#the same columns are dropped from q1features_valid_1 before it is used.
q1features_train_1 = q1features_train_1.drop(columns=['_AGE80', 'INTERVIEWDATE', 'HTM4'])

In [None]:
#SMOTE Algorithm
#Oversample the training data with SMOTE; random_state is fixed so the
#resampled set is reproducible.
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=2)
#np.ravel() flattens the target to a 1-D array whether it is a Series, an
#ndarray or a single-column DataFrame. Series.ravel() is deprecated in
#pandas 2.2 and DataFrame has no ravel() method.
q1features_train_1_SM, q1target_train_1_SM = sm.fit_resample(
    q1features_train_1, np.ravel(q1target_train_1)
)