In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Feature Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# data mining libaries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA#, FastICA
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, learning_curve
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE

#plot libaries
import plotly
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True) # to show plots in notebook

# online plotly
#from plotly.plotly import plot, iplot
#plotly.tools.set_credentials_file(username='XXXXXXXXXXXXXXX', api_key='XXXXXXXXXXXXXXX')

# offline plotly
from plotly.offline import plot, iplot

# do not show any warnings
import warnings
warnings.filterwarnings('ignore')

SEED = 17 # specify seed for reproducable results
pd.set_option('display.max_columns', None) # prevents abbreviation (with '...') of columns in prints

In [3]:
train = pd.read_csv('files/train_vif_corr_removed.csv')
test = pd.read_csv('files/test_vif_corr_removed.csv')

X = train.drop(['Response'], axis=1)
X_test =test
y =train['Response']

In [4]:
X.shape

(50882, 12)

In [5]:
X_test.shape

(21805, 12)

In [6]:
# normalize the dataset (note: for decision tree/random forest it would not be needed)
df_X_normed = (X - X.mean()) / X.std()
df_X_normed_test =(X_test - X_test.mean()) / X_test.std()

In [7]:
# calculate the principal components
pca = PCA(random_state=SEED)
df_X_pca = pca.fit_transform(df_X_normed)

df_X_pca_test = pca.fit_transform(df_X_normed_test)


In [8]:
print('train null check:',df_X_normed.isnull().values.any(),'\nTest null check:',df_X_normed_test.isnull().values.any())

train null check: False 
Test null check: False


In [9]:
print('train inf check:',np.isinf(df_X_normed).any(),'\nTest inf check:',np.isinf(df_X_normed_test).any())

train inf check: Reco_Policy_Premium            False
City_Code_cnt                  False
Region_Code_cnt                False
Health Indicator_cnt           False
Holding_Policy_Duration_cnt    False
Reco_Policy_Cat_cnt            False
Accomodation_Type_Rented       False
Reco_Insurance_Type_Joint      False
Holding_Policy_Type_2.0        False
Holding_Policy_Type_3.0        False
Holding_Policy_Type_4.0        False
Upper_Age_Encoded_Adult        False
dtype: bool 
Test inf check: Reco_Policy_Premium            False
City_Code_cnt                  False
Region_Code_cnt                False
Health Indicator_cnt           False
Holding_Policy_Duration_cnt    False
Reco_Policy_Cat_cnt            False
Accomodation_Type_Rented       False
Reco_Insurance_Type_Joint      False
Holding_Policy_Type_2.0        False
Holding_Policy_Type_3.0        False
Holding_Policy_Type_4.0        False
Upper_Age_Encoded_Adult        False
dtype: bool


In [10]:
tot = sum(pca.explained_variance_) # total explained variance of all principal components
var_exp = [(i / tot) * 100 for i in sorted(pca.explained_variance_, reverse=True)] # individual explained variance
cum_var_exp = np.cumsum(var_exp) # cumulative explained variance

In [11]:
trace_cum_var_exp = go.Bar(
    x=list(range(1, len(cum_var_exp) + 1)), 
    y=var_exp,
    name="individual explained variance",
)
trace_ind_var_exp = go.Scatter(
    x=list(range(1, len(cum_var_exp) + 1)),
    y=cum_var_exp,
    mode='lines+markers',
    name="cumulative explained variance",
    line=dict(
        shape='hv',
    ))
data = [trace_cum_var_exp, trace_ind_var_exp]
layout = go.Layout(
    title='Individual and Cumulative Explained Variance',
    autosize=True,
    yaxis=dict(
        title='percentage of explained variance',
    ),
    xaxis=dict(
        title="principal components",
        dtick=1,
    ),
    legend=dict(
        x=0,
        y=1,
    ),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')

In [14]:
n_components = 10
df_X_reduced = np.dot(df_X_normed, pca.components_[:n_components,:].T)
df_X_reduced = pd.DataFrame(df_X_reduced, columns=["PC#%d" % (x + 1) for x in range(n_components)])

In [15]:
df_X_reduced.head()

Unnamed: 0,PC#1,PC#2,PC#3,PC#4,PC#5,PC#6,PC#7,PC#8,PC#9,PC#10
0,-0.902248,-0.117767,0.816634,0.211921,-1.079893,0.920238,-1.408015,-0.503593,0.205746,-1.218548
1,2.326013,2.882903,-0.749008,-0.806112,0.428748,1.726829,-0.811209,1.399807,0.737036,0.477093
2,-0.867831,-0.871824,-0.31738,-0.409642,-1.134175,0.379402,-0.117728,0.417464,-1.038658,1.077293
3,0.803935,0.778039,-1.354712,-0.957281,-1.145198,1.434946,0.346349,-0.522072,-1.08024,-0.856512
4,-0.477556,-1.64856,0.199145,-0.779057,0.137281,0.125674,-0.251917,0.127021,0.535004,-1.120035


In [16]:
n_components = 10
df_X_reduced_test = np.dot(df_X_normed_test, pca.components_[:n_components,:].T)
df_X_reduced_test = pd.DataFrame(df_X_reduced_test, columns=["PC#%d" % (x + 1) for x in range(n_components)])

In [17]:
print('train null check:',df_X_reduced.isnull().values.any(),'\nTest null check:',df_X_reduced_test.isnull().values.any())

train null check: False 
Test null check: False


In [18]:
print('train inf check:',np.isinf(df_X_reduced).any(),'\nTest inf check:',np.isinf(df_X_reduced_test).any())

train inf check: PC#1     False
PC#2     False
PC#3     False
PC#4     False
PC#5     False
PC#6     False
PC#7     False
PC#8     False
PC#9     False
PC#10    False
dtype: bool 
Test inf check: PC#1     False
PC#2     False
PC#3     False
PC#4     False
PC#5     False
PC#6     False
PC#7     False
PC#8     False
PC#9     False
PC#10    False
dtype: bool


In [19]:
df_X_reduced.to_csv('train_vif_corr_removed_pca.csv', index=False)
df_X_reduced_test.to_csv('test_vif_corr_removed_pca.csv', index=False)