## Procedure

1. Clean and transform data
2. Exploratory Data Analysis (EDA)
3. Handle imbalanced classes
4. Modeling & evaluation
5. **Final iteration**

### Load Libraries

In [111]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DataConversionWarning)

### Load data

In [2]:
# Read the raw device log from a path relative to the notebook's working directory.
original_data = pd.read_csv('../work/data/device_failure.csv', engine='python')

In [3]:
# Show the first two rows as a quick check of the parsed columns.
original_data.head(2)

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
0,2015-01-01,S1F01085,0,215630672,56,0,52,6,407438,0,0,7
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0


## Write classes/functions here

In [79]:
class ETL:
    """Stateless helpers for cleaning and transforming the device-failure data.

    Every helper is a static method, so it can be called directly on the
    class (``ETL.general_checks(df)``) or on an instance.
    """

    # ``datetime.weekday()`` and ``Series.dt.dayofweek`` number Monday as 0
    # and Sunday as 6, so the labels must start at Monday.
    _WEEKDAY_NAMES = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}

    def __init__(self, name='ETL'):
        """Store an optional label for this helper instance."""
        self.name = name

    @staticmethod
    def general_checks(df):
        """Print each column's name, null count and dtype, tab-separated."""
        for col in df:
            print(col, "\t", df[col].isnull().sum(), "\t", df[col].dtype, "\t",)

    @staticmethod
    def convert_and_extract_from_date(df, date_column):
        """Parse ``date_column`` to datetimes and add a ``dayofweek`` column.

        ``df`` is modified in place; nothing is returned. ``dayofweek`` uses
        Monday=0 ... Sunday=6.
        """
        df[date_column] = pd.to_datetime(df[date_column])
        # Derive the weekday from the column itself so the result keeps df's
        # index; a freshly built 0..n-1 Series would misalign (and yield NaN)
        # whenever df does not have the default RangeIndex.
        df['dayofweek'] = df[date_column].dt.dayofweek
        print("Check date: ", df[date_column].dtype)
        print("Check dayofweek: ", df['dayofweek'].dtype)

    @staticmethod
    def deskew_numeric_variables(num_variables):
        """Return a new DataFrame holding ``log(1 + x)`` of every column.

        The input frame is left untouched.
        """
        # log1p is the numerically stable form of log(1 + x) for small x.
        return np.log1p(num_variables)

    @staticmethod
    def encode_categorical_variables(cat_variables):
        """One-hot encode ``cat_variables`` and print the shape before and after."""
        # Cast to object so numeric codes are treated as categories.
        cat_variables = cat_variables.astype('object')
        print("original size: ", cat_variables.shape)
        encoded_cat_variables = pd.get_dummies(cat_variables)
        print("new size: ", encoded_cat_variables.shape)
        return encoded_cat_variables

    @staticmethod
    def merge_num_cat_other_variables(num_variables, cat_variables, other_variables):
        """Concatenate the three frames column-wise and name the weekday columns.

        Columns labelled 0..6 (the one-hot weekday codes) are renamed to
        'Mon'..'Sun'. The shape of each input is printed first.
        """
        print(num_variables.shape)
        print(cat_variables.shape)
        print(other_variables.shape)
        transformed_variables = pd.concat([num_variables, cat_variables, other_variables], axis=1)
        return transformed_variables.rename(columns=ETL._WEEKDAY_NAMES)

    @staticmethod
    def extract_features_and_target(df, target_column='failure'):
        """Split ``df`` into (features, target), where target is ``target_column``."""
        features = df.loc[:, df.columns != target_column]
        target = df[target_column]
        return features, target

    @staticmethod
    def train_test_split(features, target, test_size=.3, random_state=33):
        """Return ``X_train, X_test, y_train, y_test`` from a seeded random split.

        Defaults to a 70/30 split with a fixed seed so results are repeatable.
        """
        # Inside this body the name ``train_test_split`` resolves to the
        # module-level function imported at the top of the notebook, not to
        # this method (class attributes are not in a function's name scope).
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=test_size, random_state=random_state)
        return X_train, X_test, y_train, y_test

class Modeling:
    """Helpers for evaluating fitted classifiers."""

    def __init__(self, name='Modeling'):
        """Store an optional label for this helper instance."""
        self.name = name

    @staticmethod
    def evaluate_model(model, X_test, y_test):
        """Print accuracy and F1 on the test set and return the confusion matrix.

        ``model`` must already be fitted and expose ``predict`` and ``score``.
        The confusion matrix is returned as a DataFrame so it renders as a
        table when it is the last expression of a cell.
        """
        prediction = model.predict(X_test)
        print('Accuracy: ', model.score(X_test, y_test))
        print('F1: ', f1_score(y_test, prediction))
        return pd.DataFrame(confusion_matrix(y_test, prediction))

### Perform general checks

In [5]:
# Print null count and dtype for every column of the raw data.
ETL.general_checks(original_data)

date 	 0 	 object 	
device 	 0 	 object 	
failure 	 0 	 int64 	
attribute1 	 0 	 int64 	
attribute2 	 0 	 int64 	
attribute3 	 0 	 int64 	
attribute4 	 0 	 int64 	
attribute5 	 0 	 int64 	
attribute6 	 0 	 int64 	
attribute7 	 0 	 int64 	
attribute8 	 0 	 int64 	
attribute9 	 0 	 int64 	


### Convert and extract from `['date']` column

In [7]:
# Mutates original_data in place: parses 'date' to datetime and adds a 'dayofweek' column.
ETL.convert_and_extract_from_date(original_data, 'date')

Check date:  datetime64[ns]
Check dayofweek:  int64


### Deskew right-skewed numeric variables only

In [29]:
# Right-skewed attributes that receive the log(1 + x) transform.
skewed_columns = ['attribute2', 'attribute3', 'attribute4', 'attribute5',
                  'attribute7', 'attribute8', 'attribute9']
deskewed_num_variables = ETL.deskew_numeric_variables(original_data[skewed_columns])

### Encode categorical variables

In [21]:
# One-hot encode the weekday code; the helper prints the shape before and after.
encoded_cat_variables = ETL.encode_categorical_variables(original_data['dayofweek'])

original size:  (124494,)
new size:  (124494, 7)


### Merge transformed numeric, categorical and other variables back together

In [47]:
# attribute1 and attribute6 are carried over untransformed, together with the target column.
other_variables = original_data[['attribute1','attribute6','failure']]

In [48]:
# Column-wise concatenation of the three pieces; the helper prints each input's shape.
transformed_data = ETL.merge_num_cat_other_variables(deskewed_num_variables,encoded_cat_variables,other_variables)

(124494, 7)
(124494, 7)
(124494, 3)


### Generate training and test data

In [49]:
# Separate the 'failure' column (target) from all remaining columns (features).
features, target = ETL.extract_features_and_target(transformed_data)

In [50]:
# Seed the split so the train/test partition (and every metric below) is reproducible.
# The default split proportion is kept unchanged.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=33)

### Generate resampled data
SMOTE synthesizes new minority-class examples while ignoring the majority class: for every minority instance it finds its k nearest minority-class neighbors and creates new instances at random points along the line segments joining the instance to some of those neighbors. If, for example, you want 300% replication, then 3 neighbors are chosen per instance and one synthetic instance is created along the segment to each of them.

In [109]:
# Shapes of the training features and labels before resampling.
for split_part in (X_train, y_train):
    print(split_part.shape)

(93370, 16)
(93370,)


In [107]:
# Class balance of the training labels before resampling.
y_train.value_counts()

0    93298
1       72
Name: failure, dtype: int64

In [112]:
# Oversample the minority class with a target ratio of 1.0 (the counts printed below come out equal).
# NOTE(review): newer imbalanced-learn releases appear to rename `ratio` to `sampling_strategy`
# and `fit_sample` to `fit_resample` — confirm against the installed version before upgrading.
sm = SMOTE(random_state=33, ratio = 1.0)
X_train_resampled, y_train_resampled = sm.fit_sample(X_train, y_train)

In [113]:
# Shapes of the training features and labels after resampling.
for resampled_part in (X_train_resampled, y_train_resampled):
    print(resampled_part.shape)

(186596, 16)
(186596,)


In [122]:
# Count the samples per class label after resampling.
frequency_count = np.bincount(y_train_resampled)
frequency_count

array([93298, 93298])

## Transformation & modeling pipelines

## Logistic Regression
Draws a line (hyperplane) between different classes of points. The further a point is from the boundary line, the more its score (estimate) increases (nearing 0 or 1). Do a simple Logistic Regression on scaled and non-scaled data just to see which performs better.

In [98]:
# Scale the features, then fit logistic regression on the original (imbalanced) training set.
# A fixed seed keeps repeated runs of this cell identical.
lg_scaled = make_pipeline(StandardScaler(),
                          LogisticRegression(random_state=33))
lg_scaled.fit(X_train, y_train)
print("\n")
Modeling.evaluate_model(lg_scaled, X_test, y_test)



Accuracy:  0.998939724971
F1:  0.0571428571429


Unnamed: 0,0,1
0,31090,0
1,33,1


## Decision Tree
Recursively subdivides the instance space into finer and finer subregions until each one contains a single class (or is good enough). New instances start at the root node and take the appropriate path until they reach a leaf node, which determines the classification by checking the classes of the training instances that reached that leaf; the majority determines the class. For that leaf, the score is calculated by:

$$\frac{\text{majority instances}}{\text{majority instances} + \text{minority instances}}$$

When using scikit-learn's DecisionTreeClassifier, always set min_samples_leaf to something like 5 or 10. Its default value of 1 is useless and is guaranteed to overfit. 

In [99]:
# Scale the features, then fit a decision tree on the original (imbalanced) training set.
# A fixed seed keeps repeated runs of this cell identical.
dt_scaled = make_pipeline(StandardScaler(),
                          DecisionTreeClassifier(random_state=33))
dt_scaled.fit(X_train, y_train)
print("\n")
Modeling.evaluate_model(dt_scaled, X_test, y_test)



Accuracy:  0.998650559054
F1:  0.125


Unnamed: 0,0,1
0,31079,11
1,31,3


In [125]:
# Same decision-tree pipeline, fitted on the SMOTE-resampled training set and
# evaluated on the untouched test set. A fixed seed keeps repeated runs identical.
dt_scaled_resampled = make_pipeline(StandardScaler(),
                                    DecisionTreeClassifier(random_state=33))
dt_scaled_resampled.fit(X_train_resampled, y_train_resampled)
print("\n")
Modeling.evaluate_model(dt_scaled_resampled, X_test, y_test)



Accuracy:  0.998040097674
F1:  0.031746031746


Unnamed: 0,0,1
0,31062,28
1,33,1


## Random Forest

In [100]:
# Scale the features, then fit a random forest on the original (imbalanced) training set.
# A fixed seed keeps repeated runs of this cell identical.
rf_scaled = make_pipeline(StandardScaler(),
                          RandomForestClassifier(random_state=33))
rf_scaled.fit(X_train, y_train)
print("\n")
Modeling.evaluate_model(rf_scaled, X_test, y_test)



Accuracy:  0.998875465878
F1:  0.0


Unnamed: 0,0,1
0,31089,1
1,34,0


In [126]:
# Same random-forest pipeline, fitted on the SMOTE-resampled training set and
# evaluated on the untouched test set. Without a fixed seed the forest differs
# on every run, which makes the reported F1 unrepeatable.
rf_scaled_resampled = make_pipeline(StandardScaler(),
                                    RandomForestClassifier(random_state=33))
rf_scaled_resampled.fit(X_train_resampled, y_train_resampled)
print("\n")
Modeling.evaluate_model(rf_scaled_resampled, X_test, y_test)



Accuracy:  0.998875465878
F1:  0.146341463415


Unnamed: 0,0,1
0,31086,4
1,31,3


This score was the best, but after running it a few times it wasn't consistent so it was just a fluke.

In [127]:
# Disabled re-run of the resampled random-forest cell, kept only so the output
# recorded below (a different result from the same code) stays attached to it.
# rf_scaled_resampled = make_pipeline(StandardScaler(),
#                           RandomForestClassifier())
# rf_scaled_resampled.fit(X_train_resampled, y_train_resampled)
# print("\n")
# Modeling.evaluate_model(rf_scaled_resampled, X_test, y_test)



Accuracy:  0.998811206786
F1:  0.0512820512821


Unnamed: 0,0,1
0,31086,4
1,33,1


### K-Nearest Neighbor
If, for example, `k=5`, then for every new instance its 5 nearest neighbors (by distance in feature space) are selected and some function, such as a majority vote, is applied to those five neighbors. To assign a score, divide the number of positive instances among the neighbors by the total and return the fraction. 

In [102]:
# Scale the features, then fit k-nearest neighbors with default settings on the
# original (imbalanced) training set.
kn_scaled = make_pipeline(StandardScaler(),
                          KNeighborsClassifier())
kn_scaled.fit(X_train, y_train)
print("\n")
Modeling.evaluate_model(kn_scaled, X_test, y_test)



Accuracy:  0.998907595425
F1:  0.0


  'precision', 'predicted', average, warn_for)


Unnamed: 0,0,1
0,31090,0
1,34,0


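Warning explanation: when the model doesn't predict the positive class at all, there are no predicted positives, so precision is 0/0 and therefore undefined (recall is simply 0). scikit-learn sets precision, and hence F1, to 0 in that case and emits this warning.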

In [133]:
# Same k-nearest-neighbors pipeline, fitted on the SMOTE-resampled training set
# and evaluated on the untouched test set.
kn_scaled_resampled = make_pipeline(StandardScaler(),
                          KNeighborsClassifier())
kn_scaled_resampled.fit(X_train_resampled, y_train_resampled)
print("\n")
Modeling.evaluate_model(kn_scaled_resampled, X_test, y_test)



Accuracy:  0.996819174913
F1:  0.139130434783


Unnamed: 0,0,1
0,31017,73
1,26,8


In [137]:
# Exact repeat of the previous cell (same pipeline, same data); the recorded
# output below matches the earlier run.
kn_scaled_resampled = make_pipeline(StandardScaler(),
                          KNeighborsClassifier())
kn_scaled_resampled.fit(X_train_resampled, y_train_resampled)
print("\n")
Modeling.evaluate_model(kn_scaled_resampled, X_test, y_test)



Accuracy:  0.996819174913
F1:  0.139130434783


Unnamed: 0,0,1
0,31017,73
1,26,8


# Our best model: K-Nearest Neighbor on resampled data
With Decision Tree on non-resampled data coming in at a close second.