In [1]:
# data wrangling
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor, LassoCV
from sklearn.tree import DecisionTreeRegressor

# visualizations
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import SVC

# py files
import acquire
import prepare
import explore
import model

In [2]:
df = acquire.acquire_agg_data()

In [3]:
df = prepare.prepare(df)

In [4]:
df = prepare.unique(df)

In [5]:
df = prepare.treat_nulls(df)

In [6]:
df = explore.early_failure(df)

In [7]:
df = explore.old_or_fail(df)

In [8]:
df = explore.make_binary_values(df)

In [9]:
df = explore.remove_manufacturers(df)

In [10]:
df.head()

Unnamed: 0,serial_number,manufacturer,model,capacity_terabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count,early_failure,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4.0,0,5.0,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
1,Z305KB36,Seagate,ST4000DM000,4.0,0,3.5,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3.0,0,4.8,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
3,ZA11NHSN,Seagate,ST8000DM002,8.0,0,3.0,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3.0,0,5.5,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False


Unnamed: 0,serial_number,manufacturer,model,capacity_terabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count,early_failure,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4.0,0,5.0,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
1,Z305KB36,Seagate,ST4000DM000,4.0,0,3.5,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3.0,0,4.8,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
3,ZA11NHSN,Seagate,ST8000DM002,8.0,0,3.0,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3.0,0,5.5,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False


In [11]:
from sklearn.utils import resample

df_majority = df[df.early_failure==0]
df_minority = df[df.early_failure==1]

In [12]:
df_minority.count()

serial_number                    3986
manufacturer                     3986
model                            3986
capacity_terabytes               3986
failure                          3986
drive_age_in_years               3986
reallocated_sectors_count        3986
reported_uncorrectable_errors    3986
command_timeout                  3986
current_pending_sector_count     3986
uncorrectable_sector_count       3986
early_failure                    3986
smart_5_nonzero                  3986
smart_187_nonzero                3986
smart_188_nonzero                3986
smart_197_nonzero                3986
smart_198_nonzero                3986
dtype: int64

serial_number                    3986
manufacturer                     3986
model                            3986
capacity_terabytes               3986
failure                          3986
drive_age_in_years               3986
reallocated_sectors_count        3986
reported_uncorrectable_errors    3986
command_timeout                  3986
current_pending_sector_count     3986
uncorrectable_sector_count       3986
early_failure                    3986
smart_5_nonzero                  3986
smart_187_nonzero                3986
smart_188_nonzero                3986
smart_197_nonzero                3986
smart_198_nonzero                3986
dtype: int64

In [13]:
df_majority_downsampled = resample(df_majority, 
                                 replace=False,     # sample with replacement
                                 n_samples=3986,    # to match minority class
                                 random_state=123) # reproducible results

In [14]:
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [15]:
df_downsampled.early_failure.value_counts()

1    3986
0    3986
Name: early_failure, dtype: int64

1    3986
0    3986
Name: early_failure, dtype: int64

In [16]:
from sklearn.metrics import roc_auc_score

In [17]:
df = df_downsampled

In [18]:
# int_encoder = LabelEncoder()
# int_encoder.fit(train.embarked)
# train.embarked = int_encoder.transform(train.embarked)

In [19]:
def split_my_data(df):
    X = df.drop(columns = ['serial_number', 'failure', 'model', 'early_failure', 'drive_age_in_years', 'reallocated_sectors_count', 'reported_uncorrectable_errors', 'command_timeout', 'current_pending_sector_count', 'uncorrectable_sector_count'])
    y = df[['early_failure']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .80, random_state = 123, stratify=df.early_failure)
    return X, y, X_train, X_test, y_train, y_test

In [20]:
def encode_hot(train, test, col_name):
    encoded_values = sorted(list(train[col_name].unique()))

    # Integer Encoding
    int_encoder = LabelEncoder()
    train.encoded = int_encoder.fit_transform(train[col_name])
    test.encoded = int_encoder.transform(test[col_name])

    # create 2D np arrays of the encoded variable (in train and test)
    train_array = np.array(train.encoded).reshape(len(train.encoded),1)
    test_array = np.array(test.encoded).reshape(len(test.encoded),1)

    # One Hot Encoding
    ohe = OneHotEncoder(sparse=False, categories='auto')
    train_ohe = ohe.fit_transform(train_array)
    test_ohe = ohe.transform(test_array)

    # Turn the array of new values into a data frame with columns names being the values
    # and index matching that of train/test
    # then merge the new dataframe with the existing train/test dataframe
    train_encoded = pd.DataFrame(data=train_ohe, columns=encoded_values, index=train.index)
    train = train.join(train_encoded)

    test_encoded = pd.DataFrame(data=test_ohe, columns=encoded_values, index=test.index)
    test = test.join(test_encoded)

    return train, test

In [21]:
def encode(train, test, col_name):

    encoded_values = sorted(list(train[col_name].unique()))

    # Integer Encoding
    int_encoder = LabelEncoder()
    train.encoded = int_encoder.fit_transform(train[col_name])
    test.encoded = int_encoder.transform(test[col_name])

    # create 2D np arrays of the encoded variable (in train and test)
    train_array = np.array(train.encoded).reshape(len(train.encoded),1)
    test_array = np.array(test.encoded).reshape(len(test.encoded),1)

    # One Hot Encoding
    ohe = OneHotEncoder(sparse=False, categories='auto')
    train_ohe = ohe.fit_transform(train_array)
    test_ohe = ohe.transform(test_array)

    # Turn the array of new values into a data frame with columns names being the values
    # and index matching that of train/test
    # then merge the new dataframe with the existing train/test dataframe
    train_encoded = pd.DataFrame(data=train_ohe,
                            columns=encoded_values, index=train.index)
    train = train.join(train_encoded)

    test_encoded = pd.DataFrame(data=test_ohe,
                               columns=encoded_values, index=test.index)
    test = test.join(test_encoded)

    return train, test

In [22]:
X, y, X_train, X_test, y_train, y_test = split_my_data(df)

In [23]:
len(X_train), len(y_train), len(X_test), len(y_test), len(X), len(y)

(6377, 6377, 1595, 1595, 7972, 7972)

(6377, 6377, 1595, 1595, 7972, 7972)

In [24]:
X.columns

Index(['manufacturer', 'capacity_terabytes', 'smart_5_nonzero',
       'smart_187_nonzero', 'smart_188_nonzero', 'smart_197_nonzero',
       'smart_198_nonzero'],
      dtype='object')

Index(['manufacturer', 'capacity_terabytes', 'smart_5_nonzero',
       'smart_187_nonzero', 'smart_188_nonzero', 'smart_197_nonzero',
       'smart_198_nonzero'],
      dtype='object')

In [25]:
X_train.shape , X_test.shape

((6377, 7), (1595, 7))

((6377, 7), (1595, 7))

In [26]:
X_train, X_test = encode(X_train, X_test, col_name = 'manufacturer')

In [27]:
X_train.shape , X_test.shape

((6377, 11), (1595, 11))

((6377, 11), (1595, 11))

In [28]:
X_train = X_train.drop(columns = 'manufacturer')

In [29]:
X_test = X_test.drop(columns = 'manufacturer')

In [30]:
X_train.columns

Index(['capacity_terabytes', 'smart_5_nonzero', 'smart_187_nonzero',
       'smart_188_nonzero', 'smart_197_nonzero', 'smart_198_nonzero',
       'Hitachi', 'Seagate', 'Toshiba', 'Western Digital'],
      dtype='object')

Index(['capacity_terabytes', 'smart_5_nonzero', 'smart_187_nonzero',
       'smart_188_nonzero', 'smart_197_nonzero', 'smart_198_nonzero',
       'Hitachi', 'Seagate', 'Toshiba', 'Western Digital'],
      dtype='object')

In [31]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(6377, 10) (1595, 10) (6377, 1) (1595, 1)
(6377, 10) (1595, 10) (6377, 1) (1595, 1)


In [32]:
X_train = X_train.drop(columns = 'capacity_terabytes')

In [33]:
X_test = X_test.drop(columns = 'capacity_terabytes')

In [34]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(6377, 9) (1595, 9) (6377, 1) (1595, 1)
(6377, 9) (1595, 9) (6377, 1) (1595, 1)


In [35]:
X_train.head()

Unnamed: 0,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero,Hitachi,Seagate,Toshiba,Western Digital
17876,False,False,False,False,False,1.0,0.0,0.0,0.0
52765,False,True,False,False,False,0.0,1.0,0.0,0.0
123585,False,False,False,False,False,1.0,0.0,0.0,0.0
3494,False,False,False,False,False,1.0,0.0,0.0,0.0
15186,False,False,False,False,False,0.0,1.0,0.0,0.0


Unnamed: 0,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero,Hitachi,Seagate,Toshiba,Western Digital
17876,False,False,False,False,False,1.0,0.0,0.0,0.0
52765,False,True,False,False,False,0.0,1.0,0.0,0.0
123585,False,False,False,False,False,1.0,0.0,0.0,0.0
3494,False,False,False,False,False,1.0,0.0,0.0,0.0
15186,False,False,False,False,False,0.0,1.0,0.0,0.0


### weights = {0: 1, 1: 75}, G = 10, C = 10

In [36]:
svclassifier = SVC(kernel='sigmoid', probability = True)
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [37]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 1, 0, ..., 0, 1, 0])

array([0, 1, 0, ..., 0, 1, 0])

In [38]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.71442989, 0.28557011],
       [0.22883019, 0.77116981],
       [0.71442989, 0.28557011],
       ...,
       [0.71453677, 0.28546323],
       [0.22879833, 0.77120167],
       [0.71453677, 0.28546323]])

array([[0.71442989, 0.28557011],
       [0.22883019, 0.77116981],
       [0.71442989, 0.28557011],
       ...,
       [0.71453677, 0.28546323],
       [0.22879833, 0.77120167],
       [0.71453677, 0.28546323]])

In [39]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[3021  167]
 [1129 2060]]
              precision    recall  f1-score   support

           0       0.73      0.95      0.82      3188
           1       0.93      0.65      0.76      3189

    accuracy                           0.80      6377
   macro avg       0.83      0.80      0.79      6377
weighted avg       0.83      0.80      0.79      6377

[[3021  167]
 [1129 2060]]
              precision    recall  f1-score   support

           0       0.73      0.95      0.82      3188
           1       0.93      0.65      0.76      3189

    accuracy                           0.80      6377
   macro avg       0.83      0.80      0.79      6377
weighted avg       0.83      0.80      0.79      6377



In [40]:

# from sklearn.model_selection import GridSearchCV

# def grid_logit_model(X_train, y_train):
#     # create object
#     logit = LogisticRegression(solver = 'liblinear', class_weight='balanced', random_state = 123)
#     # set a range of hyperparameters
#     grid_values = {'C':[0.001,.009,0.01,.09,1,5,10,25]}
    
#     # grid search returns recall values
#     grid = GridSearchCV(logit, grid_values, cv=3, scoring='recall')
#     grid.fit(X_train, y_train)    
    
#     # retrieve results
#     results = grid.cv_results_
#     test_scores = results['mean_test_score']
#     params = results['params']
    
#     # zip results together for dataframe
#     for p, s in zip(params, test_scores):
#         p['score'] = s    
        
#     return pd.DataFrame(params).sort_values(by='score')