In [110]:
import pandas as pd 
import numpy as np
import import_ipynb
from functions import *

from sklearn import tree
from sklearn.impute import SimpleImputer


In [111]:
train= pd.read_csv("../data/raw/aps_failure_training_set.csv")
test = pd.read_csv("../data/raw/aps_failure_test_set.csv")

print('shape of training set is ',train.shape)
print('shape of test set is ',test.shape)

shape of training set is  (60000, 171)
shape of test set is  (16000, 171)


## Data Preprocessing

1. Replace 'na' with null value and convert column into numeric data type
2. Imputing missing variables with mean

In [None]:
###defining target variables(y) and predictor variables (x)
y_train = train['class']
X_train = train.drop('class',axis=1)

y_test = test['class']
X_test = test.drop('class',axis=1)

In [None]:
X_train.head()

In [None]:
## Replace 'na' with null value and convert column into numeric data type

X_train.replace('na',np.nan,inplace=True)
X_train=X_train[X_train.columns].apply(pd.to_numeric, errors='coerce')

X_test.replace('na',np.nan,inplace=True)
X_test=X_test[X_train.columns].apply(pd.to_numeric, errors='coerce')

X_train.head()

In [None]:
## imputing mising variables
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

X_train_new_adjusted= pd.DataFrame(imp.fit_transform(X_train),columns=X_train.columns)
X_test_new_adjusted= pd.DataFrame(imp.fit_transform(X_test),columns=X_test.columns)

In [None]:
X_train_new_adjusted.head()

# Preprocess Function testing

In [114]:
def preprocess(df):
    
    X = df.drop('class',axis=1)

    X.replace('na',np.nan,inplace=True)
    X = X[X.columns].apply(pd.to_numeric, errors='coerce')

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')

    X = pd.DataFrame(imp.fit_transform(X),columns=X.columns)

    return X

In [126]:
X_train = preprocess(train)
X_test = preprocess(test)

y_train = train['class']
y_test = test['class']

# The super fancy but copy-pastable machine learning

In [127]:
## Training/Fitting model using training data

clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)

In [128]:
## Using model to generate prediction using test data predictor variables

y_pred = clf.predict(X_test)
y_pred=pd.DataFrame(y_pred)

In [129]:
##Compute test dataset cost

cost(y_test,y_pred)

wrongly labelled positives = 166
wrongly labelled negatives = 100
total cost of wrongly labelling = 51660


# JW's section

## SMOTE

In [130]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
smote = SMOTE()

In [131]:
def balance_data(df,label):

    over = SMOTE(sampling_strategy=0.3)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    df, label = pipeline.fit_resample(df, label)
    return df, label

In [132]:
X_train_smote, y_train_smote = balance_data(X_train,y_train)

# Smote Logistic Regression

In [139]:
from sklearn.linear_model import LogisticRegression

In [142]:
clf = LogisticRegression()
clf.fit(X_train_smote,y_train_smote)
y_pred = clf.predict(X_test)
y_pred = pd.DataFrame(y_pred)
cost(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


wrongly labelled positives = 382
wrongly labelled negatives = 43
total cost of wrongly labelling = 25320


# Smote Decision Tree

In [133]:
## Testing
clf = tree.DecisionTreeClassifier()
clf.fit(X_train_smote,y_train_smote)
y_pred = clf.predict(X_test)
y_pred = pd.DataFrame(y_pred)
cost(y_test,y_pred)

wrongly labelled positives = 579
wrongly labelled negatives = 83
total cost of wrongly labelling = 47290


# LGBM

In [125]:
import lightgbm as lgb
from sklearn import metrics

In [136]:
clf = lgb.LGBMClassifier()
clf.fit(X_train_smote,y_train_smote)
y_pred = clf.predict(X_test)
y_pred = pd.DataFrame(y_pred)
cost(y_test,y_pred)

wrongly labelled positives = 218
wrongly labelled negatives = 45
total cost of wrongly labelling = 24680


# Gradient Boosted Decision Tree

In [143]:
from sklearn.ensemble import GradientBoostingClassifier

In [144]:
clf = GradientBoostingClassifier()
clf.fit(X_train_smote,y_train_smote)
y_pred = clf.predict(X_test)
y_pred = pd.DataFrame(y_pred)
cost(y_test,y_pred)

wrongly labelled positives = 349
wrongly labelled negatives = 30
total cost of wrongly labelling = 18490


# Random Forest Classifier

In [145]:
from sklearn.ensemble import RandomForestClassifier

In [146]:
clf = RandomForestClassifier()
clf.fit(X_train_smote,y_train_smote)
y_pred = clf.predict(X_test)
y_pred = pd.DataFrame(y_pred)
cost(y_test,y_pred)

wrongly labelled positives = 172
wrongly labelled negatives = 47
total cost of wrongly labelling = 25220
