In [1]:
# data wrangling
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor, LassoCV
from sklearn.tree import DecisionTreeRegressor

# visualizations
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import SVC

# py files
import acquire
import prepare
import explore
import model

In [2]:
df = acquire.acquire_agg_data()

In [3]:
df = prepare.prepare(df)

In [4]:
df = prepare.unique(df)

In [5]:
df = prepare.treat_nulls(df)

In [6]:
df = explore.early_failure(df)

In [7]:
df = explore.old_or_fail(df)

In [8]:
df = explore.make_binary_values(df)

In [9]:
df = explore.remove_manufacturers(df)

In [10]:
df.head()

Unnamed: 0,serial_number,manufacturer,model,capacity_terabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count,early_failure,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4.0,0,5.0,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
1,Z305KB36,Seagate,ST4000DM000,4.0,0,3.5,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3.0,0,4.8,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
3,ZA11NHSN,Seagate,ST8000DM002,8.0,0,3.0,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3.0,0,5.5,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False


In [11]:
df.shape

(83940, 17)

In [12]:
def split_my_data(df):
    X = df.drop(columns = ['serial_number', 'failure', 'model', 'early_failure', 'drive_age_in_years', 'reallocated_sectors_count', 'reported_uncorrectable_errors', 'command_timeout', 'current_pending_sector_count', 'uncorrectable_sector_count'])
    y = df[['early_failure']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .80, random_state = 123, stratify=df.early_failure)
    return X, y, X_train, X_test, y_train, y_test

In [13]:
def encode_hot(train, test, col_name):
    encoded_values = sorted(list(train[col_name].unique()))

    # Integer Encoding
    int_encoder = LabelEncoder()
    train.encoded = int_encoder.fit_transform(train[col_name])
    test.encoded = int_encoder.transform(test[col_name])

    # create 2D np arrays of the encoded variable (in train and test)
    train_array = np.array(train.encoded).reshape(len(train.encoded),1)
    test_array = np.array(test.encoded).reshape(len(test.encoded),1)

    # One Hot Encoding
    ohe = OneHotEncoder(sparse=False, categories='auto')
    train_ohe = ohe.fit_transform(train_array)
    test_ohe = ohe.transform(test_array)

    # Turn the array of new values into a data frame with columns names being the values
    # and index matching that of train/test
    # then merge the new dataframe with the existing train/test dataframe
    train_encoded = pd.DataFrame(data=train_ohe, columns=encoded_values, index=train.index)
    train = train.join(train_encoded)

    test_encoded = pd.DataFrame(data=test_ohe, columns=encoded_values, index=test.index)
    test = test.join(test_encoded)

    return train, test

In [14]:
X, y, X_train, X_test, y_train, y_test = split_my_data(df)

In [15]:
len(X_train), len(y_train), len(X_test), len(y_test), len(X), len(y)

(67152, 67152, 16788, 16788, 83940, 83940)

In [16]:
X_train, X_test = model.encode_hot(X_train, X_test, col_name = 'manufacturer')

In [17]:
X_train = X_train.drop(columns = 'manufacturer')

In [18]:
X_test = X_test.drop(columns = 'manufacturer')

In [19]:
X_train = X_train.drop(columns = 'capacity_terabytes')

In [20]:
X_test = X_test.drop(columns = 'capacity_terabytes')

In [21]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(67152, 9) (16788, 9) (67152, 1) (16788, 1)


In [22]:
X_train.head()

Unnamed: 0,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero,Hitachi,Seagate,Toshiba,Western Digital
121424,True,False,False,True,True,0.0,1.0,0.0,0.0
51821,False,False,False,False,False,0.0,1.0,0.0,0.0
39076,False,False,False,False,False,1.0,0.0,0.0,0.0
139516,False,False,False,False,False,1.0,0.0,0.0,0.0
50771,False,False,False,False,False,1.0,0.0,0.0,0.0


### Model Train

In [23]:
weights = {0: 1, 1: 75}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 75}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [24]:
y_pred_train = svclassifier.predict(X_train)
y_pred_train

array([1, 1, 0, ..., 1, 1, 0])

In [25]:
y_pred_proba_train = svclassifier.predict_proba(X_train)
y_pred_proba_train

array([[0.13551292, 0.86448708],
       [0.97495352, 0.02504648],
       [0.97495396, 0.02504604],
       ...,
       [0.97495352, 0.02504648],
       [0.97495352, 0.02504648],
       [0.97495396, 0.02504604]])

In [26]:
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))

[[27059 36904]
 [  346  2843]]
              precision    recall  f1-score   support

           0       0.99      0.42      0.59     63963
           1       0.07      0.89      0.13      3189

    accuracy                           0.45     67152
   macro avg       0.53      0.66      0.36     67152
weighted avg       0.94      0.45      0.57     67152



### Model Test

In [27]:
y_pred_test = svclassifier.predict(X_test)
y_pred_test

array([1, 1, 1, ..., 1, 0, 1])

In [28]:
y_pred_proba_test = svclassifier.predict_proba(X_test)
y_pred_proba_test

array([[0.97495352, 0.02504648],
       [0.97495352, 0.02504648],
       [0.97495352, 0.02504648],
       ...,
       [0.97495352, 0.02504648],
       [0.97495396, 0.02504604],
       [0.97495352, 0.02504648]])

In [29]:
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

[[6727 9264]
 [  95  702]]
              precision    recall  f1-score   support

           0       0.99      0.42      0.59     15991
           1       0.07      0.88      0.13       797

    accuracy                           0.44     16788
   macro avg       0.53      0.65      0.36     16788
weighted avg       0.94      0.44      0.57     16788

