In [1]:
# data wrangling
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor, LassoCV
from sklearn.tree import DecisionTreeRegressor

# visualizations
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import SVC

# py files
import acquire
import prepare
import explore
import model

In [2]:
df = acquire.acquire_agg_data()

In [3]:
df = prepare.prepare(df)

In [4]:
df = prepare.unique(df)

In [5]:
df = prepare.treat_nulls(df)

In [6]:
df = explore.early_failure(df,1.6)

In [7]:
df = explore.old_or_fail(df)

In [8]:
df = explore.make_binary_values(df)

In [9]:
df = explore.remove_manufacturers(df)

In [10]:
df.head()

Unnamed: 0,serial_number,manufacturer,model,capacity_terabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count,early_failure,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4.0,0,5.0,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
1,Z305KB36,Seagate,ST4000DM000,4.0,0,3.5,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3.0,0,4.8,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
3,ZA11NHSN,Seagate,ST8000DM002,8.0,0,3.0,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3.0,0,5.5,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False


In [11]:
def split_my_data(df):
    X = df.drop(columns = ['serial_number', 'failure', 'model', 'early_failure', 'drive_age_in_years', 'reallocated_sectors_count', 'reported_uncorrectable_errors', 'command_timeout', 'current_pending_sector_count', 'uncorrectable_sector_count'])
    y = df[['early_failure']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .80, random_state = 123, stratify=df.early_failure)
    return X, y, X_train, X_test, y_train, y_test

In [12]:
def encode_hot(train, test, col_name):
    encoded_values = sorted(list(train[col_name].unique()))

    # Integer Encoding
    int_encoder = LabelEncoder()
    train.encoded = int_encoder.fit_transform(train[col_name])
    test.encoded = int_encoder.transform(test[col_name])

    # create 2D np arrays of the encoded variable (in train and test)
    train_array = np.array(train.encoded).reshape(len(train.encoded),1)
    test_array = np.array(test.encoded).reshape(len(test.encoded),1)

    # One Hot Encoding
    ohe = OneHotEncoder(sparse=False, categories='auto')
    train_ohe = ohe.fit_transform(train_array)
    test_ohe = ohe.transform(test_array)

    # Turn the array of new values into a data frame with columns names being the values
    # and index matching that of train/test
    # then merge the new dataframe with the existing train/test dataframe
    train_encoded = pd.DataFrame(data=train_ohe, columns=encoded_values, index=train.index)
    train = train.join(train_encoded)

    test_encoded = pd.DataFrame(data=test_ohe, columns=encoded_values, index=test.index)
    test = test.join(test_encoded)

    return train, test

In [13]:
X, y, X_train, X_test, y_train, y_test = split_my_data(df)

In [14]:
len(X_train), len(y_train), len(X_test), len(y_test), len(X), len(y)

(67152, 67152, 16788, 16788, 83940, 83940)

In [15]:
X.columns

Index(['manufacturer', 'capacity_terabytes', 'smart_5_nonzero',
       'smart_187_nonzero', 'smart_188_nonzero', 'smart_197_nonzero',
       'smart_198_nonzero'],
      dtype='object')

In [16]:
X_train, X_test = model.encode_hot(X_train, X_test, col_name = 'manufacturer')

In [17]:
X_train = X_train.drop(columns = 'manufacturer')

In [18]:
X_test = X_test.drop(columns = 'manufacturer')

In [19]:
X_train.columns

Index(['capacity_terabytes', 'smart_5_nonzero', 'smart_187_nonzero',
       'smart_188_nonzero', 'smart_197_nonzero', 'smart_198_nonzero',
       'Hitachi', 'Seagate', 'Toshiba', 'Western Digital'],
      dtype='object')

In [20]:
X_train = X_train.drop(columns = 'capacity_terabytes')

In [21]:
X_test = X_test.drop(columns = 'capacity_terabytes')

In [22]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(67152, 9) (16788, 9) (67152, 1) (16788, 1)


In [23]:
X_train.head()

Unnamed: 0,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero,Hitachi,Seagate,Toshiba,Western Digital
219,False,False,False,False,False,0.0,1.0,0.0,0.0
105062,False,False,False,False,False,0.0,1.0,0.0,0.0
101380,False,False,False,False,False,0.0,1.0,0.0,0.0
14420,False,False,False,False,False,0.0,1.0,0.0,0.0
9348,False,False,False,False,False,0.0,1.0,0.0,0.0


### weights = {0: 1, 1: 50}, G = 10, C = 10

# NOT GOOD

In [None]:
weights = {0: 1, 1: 50}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 10, class_weight = weights)
svclassifier.fit(X_train, y_train)

In [None]:
y_pred = svclassifier.predict(X_train)
y_pred

In [None]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

In [None]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

### weights = {0: 1, 1: 50}, G = 10, C = 100


# NOT GREAT

In [None]:
weights = {0: 1, 1: 50}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

In [None]:
y_pred = svclassifier.predict(X_train)
y_pred

In [None]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

In [None]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

### weights = {0: 1, 1: 50}, G = 100, C = 100

In [24]:
weights = {0: 1, 1: 50}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 50}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [25]:
y_pred = svclassifier.predict(X_train)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [26]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97822289, 0.02177711],
       [0.97822289, 0.02177711],
       [0.97822289, 0.02177711],
       ...,
       [0.97822289, 0.02177711],
       [0.97822289, 0.02177711],
       [0.97822289, 0.02177711]])

In [27]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[    0 95442]
 [    0  2078]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     95442
           1       0.02      1.00      0.04      2078

    accuracy                           0.02     97520
   macro avg       0.01      0.50      0.02     97520
weighted avg       0.00      0.02      0.00     97520



### weights = {0: 1, 1: 50}, G = 10, C = 1000

In [28]:
weights = {0: 1, 1: 50}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 1000, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=1000, cache_size=200, class_weight={0: 1, 1: 50}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [29]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 1, 0, ..., 1, 1, 1])

In [30]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97835441, 0.02164559],
       [0.9783538 , 0.0216462 ],
       [0.97835441, 0.02164559],
       ...,
       [0.9783538 , 0.0216462 ],
       [0.97835379, 0.02164621],
       [0.97835379, 0.02164621]])

In [31]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[35004 60438]
 [ 1221   857]]
              precision    recall  f1-score   support

           0       0.97      0.37      0.53     95442
           1       0.01      0.41      0.03      2078

    accuracy                           0.37     97520
   macro avg       0.49      0.39      0.28     97520
weighted avg       0.95      0.37      0.52     97520



### weights = {0: 1, 1: 25}, G = 10, C = 10

In [32]:
weights = {0: 1, 1: 25}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 10, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight={0: 1, 1: 25}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [33]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [34]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97834728, 0.02165272],
       [0.97834651, 0.02165349],
       [0.97834728, 0.02165272],
       ...,
       [0.97834651, 0.02165349],
       [0.97834651, 0.02165349],
       [0.97834651, 0.02165349]])

In [35]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[85156 10286]
 [ 1979    99]]
              precision    recall  f1-score   support

           0       0.98      0.89      0.93     95442
           1       0.01      0.05      0.02      2078

    accuracy                           0.87     97520
   macro avg       0.49      0.47      0.47     97520
weighted avg       0.96      0.87      0.91     97520



### weigths = {0: 1, 1: 75}, G = 10, C = 100

In [26]:
weights = {0: 1, 1: 75}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 75}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [27]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97866251, 0.02133749],
       [0.97866251, 0.02133749],
       [0.97866251, 0.02133749],
       ...,
       [0.97866251, 0.02133749],
       [0.97866251, 0.02133749],
       [0.97866251, 0.02133749]])

In [29]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[90723  4719]
 [  884  1194]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     95442
           1       0.20      0.57      0.30      2078

    accuracy                           0.94     97520
   macro avg       0.60      0.76      0.63     97520
weighted avg       0.97      0.94      0.96     97520



### weights = {0: 1, 1: 75}, G = 100, C = 10

In [30]:
weights = {0: 1, 1: 75}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 10, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight={0: 1, 1: 75}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [31]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97872675, 0.02127325],
       [0.97872675, 0.02127325],
       [0.97872675, 0.02127325],
       ...,
       [0.97872675, 0.02127325],
       [0.97872675, 0.02127325],
       [0.97872675, 0.02127325]])

In [33]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[90723  4719]
 [  884  1194]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     95442
           1       0.20      0.57      0.30      2078

    accuracy                           0.94     97520
   macro avg       0.60      0.76      0.63     97520
weighted avg       0.97      0.94      0.96     97520



### weights = {0: 1, 1: 75}, G = 100, C = 100

In [34]:
weights = {0: 1, 1: 75}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 75}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [35]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [36]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97963833, 0.02036167],
       [0.97963833, 0.02036167],
       [0.97963833, 0.02036167],
       ...,
       [0.97963833, 0.02036167],
       [0.97963833, 0.02036167],
       [0.97963833, 0.02036167]])

In [None]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

### weights = {0: 1, 1: 25}, G = 100, C = 100

In [23]:
weights = {0: 1, 1: 25}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 25}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [24]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.98668905, 0.01331095],
       [0.98668905, 0.01331095],
       [0.98668905, 0.01331095],
       ...,
       [0.98668905, 0.01331095],
       [0.98668905, 0.01331095],
       [0.98668905, 0.01331095]])

In [26]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[91838  3604]
 [ 1338   740]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97     95442
           1       0.17      0.36      0.23      2078

    accuracy                           0.95     97520
   macro avg       0.58      0.66      0.60     97520
weighted avg       0.97      0.95      0.96     97520



### weights = {0: 1, 1: 100}, G = 10, C = 100

In [27]:
weights = {0: 1, 1: 100}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 100}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [28]:
y_pred = svclassifier.predict(X_train)
y_pred

array([1, 1, 1, ..., 1, 0, 0])

In [29]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97921826, 0.02078174],
       [0.97921826, 0.02078174],
       [0.97921826, 0.02078174],
       ...,
       [0.97921826, 0.02078174],
       [0.97921829, 0.02078171],
       [0.97921829, 0.02078171]])

In [30]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[33265 62177]
 [  219  1859]]
              precision    recall  f1-score   support

           0       0.99      0.35      0.52     95442
           1       0.03      0.89      0.06      2078

    accuracy                           0.36     97520
   macro avg       0.51      0.62      0.29     97520
weighted avg       0.97      0.36      0.51     97520



### weights = {0: 1, 1: 200}, G = 10, C = 100

In [31]:
weights = {0: 1, 1: 200}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 200}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [32]:
y_pred = svclassifier.predict(X_train)
y_pred

array([1, 1, 1, ..., 1, 0, 0])

In [33]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97942137, 0.02057863],
       [0.97942137, 0.02057863],
       [0.97942137, 0.02057863],
       ...,
       [0.97942137, 0.02057863],
       [0.97942139, 0.02057861],
       [0.97942139, 0.02057861]])

In [34]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[33265 62177]
 [  219  1859]]
              precision    recall  f1-score   support

           0       0.99      0.35      0.52     95442
           1       0.03      0.89      0.06      2078

    accuracy                           0.36     97520
   macro avg       0.51      0.62      0.29     97520
weighted avg       0.97      0.36      0.51     97520



# Small Test Data

In [24]:
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 10, class_weight = weights))
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [25]:
y_pred = svclassifier.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
y_pred_proba = svclassifier.predict_proba(X_test)
y_pred_proba

array([[0.96891514, 0.03108486],
       [0.96816201, 0.03183799],
       [0.96816293, 0.03183707],
       ...,
       [0.96816243, 0.03183757],
       [0.96816293, 0.03183707],
       [0.96816293, 0.03183707]])

In [27]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[16156   112]
 [  458    62]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     16268
           1       0.36      0.12      0.18       520

    accuracy                           0.97     16788
   macro avg       0.66      0.56      0.58     16788
weighted avg       0.95      0.97      0.96     16788



### Running Train

In [33]:
svclassifier = SVC(kernel='sigmoid', probability = True)
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [34]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [35]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.96817353, 0.03182647],
       [0.96817353, 0.03182647],
       [0.96817353, 0.03182647],
       ...,
       [0.96817353, 0.03182647],
       [0.9681729 , 0.0318271 ],
       [0.96817353, 0.03182647]])

In [36]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[64664   410]
 [ 1853   225]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     65074
           1       0.35      0.11      0.17      2078

    accuracy                           0.97     67152
   macro avg       0.66      0.55      0.57     67152
weighted avg       0.95      0.97      0.96     67152

