In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import *
import string
from functools import reduce
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
## Load every raw table from the shared data directory
(
    event_type,
    log_feature,
    resource_type,
    severity_type,
    test,
    train,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/event_type.csv',
        '../Data/log_feature.csv',
        '../Data/resource_type.csv',
        '../Data/severity_type.csv',
        '../Data/test.csv',
        '../Data/train.csv',
    )
)

## Data Prep

In [None]:
## Event type denormalized: one row per id, one summed indicator column per event type
# Only the two raw columns are used, so re-running this cell after the
# 'event_number' column has been added to event_type gives the same result.
# A plain .sum() (rather than .agg(['sum'])) keeps the column labels flat:
# two-level ('column', 'sum') labels cannot be merged on 'id' with the
# single-level train/test frames.
event_type_flattened = (
    pd.get_dummies(event_type[['id', 'event_type']], columns=["event_type"])
    .groupby('id', as_index=False)
    .sum()
)

In [None]:
## Log feature denormalized: one row per id, one volume column per log feature
# Features that an id never logged come out of the pivot as NaN and are set to 0.
log_feature_flatened = (
    log_feature
    .pivot(index='id', columns='log_feature', values='volume')
    .reset_index()
    .fillna(0)
)

In [None]:
## Resource type denormalized: one row per id, one summed indicator column per resource type
# A plain .sum() (rather than .agg(['sum'])) keeps the column labels flat:
# two-level ('column', 'sum') labels cannot be merged on 'id' with the
# single-level train/test frames.
Resource_type_flattened = (
    pd.get_dummies(resource_type, columns=["resource_type"])
    .groupby('id', as_index=False)
    .sum()
)

## Feature extraction

### Ideas for feature extraction
* Number of events/features/resources that occurred
* Most used event/feature/resource --- not applicable, there is no duplication
* Volume of feature / max value of feature
* Sum of total volume
* Max event/feature/resource
* Severity number
* Location

In [None]:
## Number of events / log features / resources recorded per id
# Each count column gets an explicit, unique string name. Left unnamed, all
# three would be labelled with the integer 0, which collides when the frames
# are merged (yielding '0_x', '0_y' and 0) and leaves the merged table with a
# mix of string and integer column labels.
Event_count_feature = event_type.groupby('id').size().reset_index(name='event_count')
Feature_count_feature = log_feature.groupby('id').size().reset_index(name='log_feature_count')
Resource_count_feature = resource_type.groupby('id').size().reset_index(name='resource_count')

In [None]:
## Volume of each log feature relative to the largest volume seen for that feature
# Largest volume per log feature (columns: 'log_feature', 'max')
max_volume_feature = (
    log_feature.groupby('log_feature')['volume'].agg(['max']).reset_index()
)

# Bring the per-feature maximum next to every log row and compute the ratio
log_feature2 = pd.merge(log_feature, max_volume_feature, how='left', on='log_feature')
log_feature2['volume_ratio'] = log_feature2['volume'].div(log_feature2['max'])

# One row per id, one ratio column per log feature; missing combinations become 0
log_feature2_flatened = (
    log_feature2
    .pivot(index='id', columns='log_feature', values='volume_ratio')
    .reset_index()
    .fillna(0)
)

In [None]:
## Total volume logged per id (columns: 'id', 'sum')
volume = log_feature.groupby('id')['volume'].sum().to_frame('sum').reset_index()

In [None]:
## Highest event number per id
# Parse the complete trailing integer of the label. A fixed two-character slice
# would silently truncate any event number with more than two digits.
# Labels without a trailing number become NaN and make astype() raise.
event_type['event_number'] = (
    event_type['event_type']
    .str.extract(r'(\d+)\s*$', expand=False)
    .astype(np.int64)
)
max_event_number = event_type.groupby('id')['event_number'].agg(['max']).reset_index()

In [None]:
## Severity number
# Parse the complete trailing integer of the label. A one-character slice would
# keep only the last digit of a multi-digit severity number.
# Labels without a trailing number become NaN and make astype() raise.
severity_type['severity_type_number'] = (
    severity_type['severity_type']
    .str.extract(r'(\d+)\s*$', expand=False)
    .astype(np.int64)
)

In [None]:
## Location number (same parsing for the train and test tables)
# Parse the complete trailing integer of the label. A fixed three-character
# slice would silently truncate location numbers with four or more digits
# (e.g. '... 1126' would be read as 126).
# Labels without a trailing number become NaN and make astype() raise.
for _frame in (train, test):
    _frame['location_number'] = (
        _frame['location']
        .str.extract(r'(\d+)\s*$', expand=False)
        .astype(np.int64)
    )

## ADS Creation

In [None]:
## Frames to merge: the labelled / unlabelled base table followed by the same
## per-id feature tables, in the same order for train and test
feature_frames = [
    event_type_flattened,
    log_feature_flatened,
    Resource_type_flattened,
    Event_count_feature,
    Feature_count_feature,
    Resource_count_feature,
    log_feature2_flatened,
    volume,
    max_event_number,
    severity_type,
]

dfs = [train, *feature_frames]
dfs_test = [test, *feature_frames]

In [None]:
## Merge all feature tables onto the base table, for train and test alike
def _merge_on_id(left, right):
    """Join two frames on their shared 'id' column (pandas' default join type)."""
    return pd.merge(left, right, on='id')


df_final = reduce(_merge_on_id, dfs)
df_final_test = reduce(_merge_on_id, dfs_test)

In [None]:
## Remove the categorical (text) columns from both merged tables
categorical_columns = ['location', 'severity_type']
df_final = df_final.drop(columns=categorical_columns)
df_final_test = df_final_test.drop(columns=categorical_columns)

### Variable exploration

Ideas:
    * Remove constants --- done
    * View correlation  --- impossible with high dimension
    * Scatter plot matrix  -- impossible with high dimension
    * Normalize variables   -- done
    * PCA                  -- done
    * Check variables statistically using ANOVA

In [None]:
## Remove constant columns (max == min in the training table) from both tables
df_describtion = df_final.describe().T
df_describtion['max_min'] = df_describtion['max'].sub(df_describtion['min'])

# Columns whose value range is exactly zero carry no information
is_constant = df_describtion['max_min'].eq(0).to_numpy()
constants_col_remove = df_describtion.index[is_constant].tolist()

df_final = df_final.drop(columns=constants_col_remove)
df_final_test = df_final_test.drop(columns=constants_col_remove)

In [None]:
##Scatter plot matrix
## it won't work cause we have many dimensions
#spm = pd.plotting.scatter_matrix(df_final.iloc[:,:50], alpha=0.2, figsize=(15, 15), diagonal='hist')

In [None]:
##Correlation
#np.corrcoef(df_final.loc[:, df_final.columns != 'fault_severity'], df_final['fault_severity'])

In [None]:
# corr = df_final.corr()
# fig = plt.figure(figsize=(50, 50))
# ax = fig.add_subplot(111)
# cax = ax.matshow(corr,cmap='coolwarm', vmin=-1, vmax=1)
# fig.colorbar(cax)
# ticks = np.arange(0,len(df_final.columns),1)
# ax.set_xticks(ticks)
# plt.xticks(rotation=90)
# ax.set_yticks(ticks)
# ax.set_xticklabels(df_final.columns)
# ax.set_yticklabels(df_final.columns)
# plt.show()

In [None]:
## Normalize data
## Split features / target, hold out a validation fold, standardise with
## statistics learned on the training fold only
# Column labels are cast to str because recent scikit-learn versions reject
# frames whose labels mix string and non-string types.
X = df_final.drop(columns='fault_severity').rename(columns=str)
y = df_final['fault_severity']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

Scale_model = StandardScaler()
# The original row index and column labels are kept on the scaled frames so
# every scaled row can still be matched to its target by label.
X_train_scaled = pd.DataFrame(
    Scale_model.fit_transform(X_train), index=X_train.index, columns=X_train.columns
)
X_test_scaled = pd.DataFrame(
    Scale_model.transform(X_test), index=X_test.index, columns=X_test.columns
)

# Select the unlabelled test features by name, in training-column order, so a
# missing or reordered column fails loudly instead of being scaled with
# another column's statistics.
df_final_test_features = df_final_test.rename(columns=str)[X.columns]
df_final_test_scaled = pd.DataFrame(
    Scale_model.transform(df_final_test_features),
    index=df_final_test_features.index,
    columns=df_final_test_features.columns,
)

In [None]:
def PCA_varianceSearch (model,X_train_scaled,y_train,X_test_scaled,y_test,random_state,scoring):
    """Fit one PCA per candidate ``n_components`` and return the best-scoring one.

    For every candidate, a PCA is fitted on ``X_train_scaled``, both feature
    sets are projected, a fresh ``model()`` is fitted on the projected training
    data and evaluated with its own ``score`` method on the projected test data.

    Parameters
    ----------
    model : callable
        Zero-argument factory (for example an estimator class) returning an
        object that provides ``fit(X, y)`` and ``score(X, y)``.
    X_train_scaled, X_test_scaled : array-like
        Feature matrices used to fit / evaluate; passed to the PCA as given.
    y_train, y_test : array-like
        Targets matching the rows of the corresponding feature matrix.
    random_state
        Forwarded to every ``PCA`` instance (not to ``model``).
    scoring
        Currently unused; accepted so existing calls keep working. The
        evaluation always uses ``model().score``.

    Returns
    -------
    pca_output : dict
        ``n_components -> (fitted PCA, projected train frame,
        projected test frame, score)``.
    pca_acc : dict
        ``n_components -> score``.
    best_pca
        The fitted PCA with the highest score; on ties the candidate listed
        first in the search grid wins.
    """
    # NOTE(review): the integer 1 asks PCA for exactly one component, while the
    # floats ask for a fraction of explained variance -- confirm 1 was not
    # meant as "keep everything".
    n_comp = [1,0.95,0.90,0.85]
    pca_output = {}
    pca_acc = {}

    for i in n_comp:
        pca = PCA(i,random_state = random_state)

        # Project both sets and label the components pca0, pca1, ...
        principalDf = pd.DataFrame(pca.fit_transform(X_train_scaled)).add_prefix('pca')
        principalDf_test1 = pd.DataFrame(pca.transform(X_test_scaled)).add_prefix('pca')

        # Train a fresh model on the projection and score it on the test projection
        classifier = model().fit(principalDf,y_train)
        acc_score = classifier.score(principalDf_test1,y_test)
        pca_output[i] = (pca,principalDf,principalDf_test1,acc_score)
        pca_acc[i] = acc_score

    # max() over the dict returns the first key with the highest score, so this
    # works for plain Python floats as well as numpy scalars and cannot fail
    # when two candidates reach the same score.
    Max_Name = max(pca_acc, key=pca_acc.get)
    best_pca = pca_output[Max_Name][0]

    return pca_output , pca_acc , best_pca

        

In [None]:
## Search for the best-scoring PCA, then project the scaled test set with it
pca_output, pca_acc, best_pca = PCA_varianceSearch(
    GradientBoostingClassifier,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test,
    0,
    'accuracy',
)
# Components are labelled pca0, pca1, ...
principalDf_test2 = pd.DataFrame(best_pca.transform(df_final_test_scaled)).add_prefix('pca')

In [None]:
# Explained-variance ratio of each component of the best PCA, labelled with the
# matching projected-column name and ordered from largest to smallest ratio
varience_ratio_df = (
    pd.DataFrame(best_pca.explained_variance_ratio_)
    .assign(column_name=pca_output[best_pca.n_components][1].columns)
    .sort_values(by=0, ascending=False)
)

In [None]:
## Attach the target to the PCA-projected train / hold-out features
# The projected frames carry a fresh 0..n-1 row index, whereas y_train / y_test
# keep the row labels of the frame they were split from. Plain column
# assignment aligns on those labels and would pair rows with the wrong target
# (or NaN), so the targets are attached by position via to_numpy().
# .assign() returns new frames, leaving the feature-only frames cached in
# pca_output untouched so cells that read them can be re-run safely.
X_train_pca = pca_output[best_pca.n_components][1].assign(fault_severity=y_train.to_numpy())
X_test_pca = pca_output[best_pca.n_components][2].assign(fault_severity=y_test.to_numpy())

In [None]:
#pca_output[0.85][1]

In [None]:
#X_train_scaled.shape , y_train.shape

In [None]:
## scatter plot matrix after pca
#spm = pd.plotting.scatter_matrix(principalDf, alpha=0.2, figsize=(50, 50), diagonal='hist')

In [None]:
# Persist the merged feature tables (categorical and constant columns removed)
# with IPython's %store so another notebook can restore them with `%store -r`.
%store df_final
%store df_final_test

# Persist the PCA-projected train / hold-out frames and the projected test set.
%store X_train_pca
%store X_test_pca
%store principalDf_test2