In [1]:
# data wrangling
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor, LassoCV
from sklearn.tree import DecisionTreeRegressor

# visualizations
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# py files
import acquire
import prepare
import explore
import model

In [2]:
df = acquire.acquire_agg_data()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169073 entries, 0 to 169072
Data columns (total 10 columns):
serial_number         169072 non-null object
model                 169073 non-null object
capacity_bytes        169073 non-null int64
max(failure)          169073 non-null int64
max(smart_9_raw)      161975 non-null float64
max(smart_5_raw)      161851 non-null float64
max(smart_187_raw)    104189 non-null float64
max(smart_188_raw)    104179 non-null float64
max(smart_197_raw)    161841 non-null float64
max(smart_198_raw)    161841 non-null float64
dtypes: float64(6), int64(2), object(2)
memory usage: 12.9+ MB


In [4]:
df = prepare.prepare(df)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169073 entries, 0 to 169072
Data columns (total 11 columns):
serial_number                    169072 non-null object
manufacturer                     169073 non-null object
model                            169073 non-null object
capacity_terabytes               169073 non-null float64
failure                          169073 non-null int64
drive_age_in_years               161975 non-null float64
reallocated_sectors_count        161851 non-null float64
reported_uncorrectable_errors    104189 non-null float64
command_timeout                  104179 non-null float64
current_pending_sector_count     161841 non-null float64
uncorrectable_sector_count       161841 non-null float64
dtypes: float64(7), int64(1), object(3)
memory usage: 14.2+ MB


In [6]:
df = prepare.unique(df)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162025 entries, 0 to 169067
Data columns (total 11 columns):
serial_number                    162024 non-null object
manufacturer                     162025 non-null object
model                            162025 non-null object
capacity_terabytes               162025 non-null float64
failure                          162025 non-null int64
drive_age_in_years               161965 non-null float64
reallocated_sectors_count        161841 non-null float64
reported_uncorrectable_errors    104186 non-null float64
command_timeout                  104176 non-null float64
current_pending_sector_count     161831 non-null float64
uncorrectable_sector_count       161831 non-null float64
dtypes: float64(7), int64(1), object(3)
memory usage: 14.8+ MB


In [8]:
df = prepare.treat_nulls(df)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161830 entries, 0 to 169067
Data columns (total 11 columns):
serial_number                    161830 non-null object
manufacturer                     161830 non-null object
model                            161830 non-null object
capacity_terabytes               161830 non-null float64
failure                          161830 non-null int64
drive_age_in_years               161830 non-null float64
reallocated_sectors_count        161830 non-null float64
reported_uncorrectable_errors    161830 non-null float64
command_timeout                  161830 non-null float64
current_pending_sector_count     161830 non-null float64
uncorrectable_sector_count       161830 non-null float64
dtypes: float64(7), int64(1), object(3)
memory usage: 14.8+ MB


In [10]:
#df = prepare.treat_nulls(df)

In [11]:
# agg_model = df.groupby(['model']).agg({'failure' : 'sum', 'model' : 'count', 'drive_age_in_years':'mean'})
# agg_model.rename(columns={'failure':'failures', 'model':'total_count'}, inplace=True)

# # add a failure rate column
# agg_model['failure_rate_percent'] = agg_model['failures'] / agg_model['total_count'] * 100
# agg_model.sort_values(by=['failures'], ascending = False)

# Early Failure Column ---> Cutoff = 1.6

In [12]:
df = explore.early_failure(df,1.6)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161830 entries, 0 to 169067
Data columns (total 12 columns):
serial_number                    161830 non-null object
manufacturer                     161830 non-null object
model                            161830 non-null object
capacity_terabytes               161830 non-null float64
failure                          161830 non-null int64
drive_age_in_years               161830 non-null float64
reallocated_sectors_count        161830 non-null float64
reported_uncorrectable_errors    161830 non-null float64
command_timeout                  161830 non-null float64
current_pending_sector_count     161830 non-null float64
uncorrectable_sector_count       161830 non-null float64
early_failure                    161830 non-null int64
dtypes: float64(7), int64(2), object(3)
memory usage: 16.1+ MB


# Keep Only Old & Failed

In [14]:
df = explore.old_or_fail(df)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 121902 entries, 0 to 168960
Data columns (total 12 columns):
serial_number                    121902 non-null object
manufacturer                     121902 non-null object
model                            121902 non-null object
capacity_terabytes               121902 non-null float64
failure                          121902 non-null int64
drive_age_in_years               121902 non-null float64
reallocated_sectors_count        121902 non-null float64
reported_uncorrectable_errors    121902 non-null float64
command_timeout                  121902 non-null float64
current_pending_sector_count     121902 non-null float64
uncorrectable_sector_count       121902 non-null float64
early_failure                    121902 non-null int64
dtypes: float64(7), int64(2), object(3)
memory usage: 12.1+ MB


# SMART ---> Bool

In [16]:
df = explore.make_binary_values(df)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 121902 entries, 0 to 168960
Data columns (total 17 columns):
serial_number                    121902 non-null object
manufacturer                     121902 non-null object
model                            121902 non-null object
capacity_terabytes               121902 non-null float64
failure                          121902 non-null int64
drive_age_in_years               121902 non-null float64
reallocated_sectors_count        121902 non-null float64
reported_uncorrectable_errors    121902 non-null float64
command_timeout                  121902 non-null float64
current_pending_sector_count     121902 non-null float64
uncorrectable_sector_count       121902 non-null float64
early_failure                    121902 non-null int64
smart_5_nonzero                  121902 non-null bool
smart_187_nonzero                121902 non-null bool
smart_188_nonzero                121902 non-null bool
smart_197_nonzero                121902 non-null b

# Remove Manufacturers

In [18]:
df = explore.remove_manufacturers(df)

# Split Data

In [230]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 121900 entries, 0 to 168960
Data columns (total 17 columns):
serial_number                    121900 non-null object
manufacturer                     121900 non-null object
model                            121900 non-null object
capacity_terabytes               121900 non-null float64
failure                          121900 non-null int64
drive_age_in_years               121900 non-null float64
reallocated_sectors_count        121900 non-null float64
reported_uncorrectable_errors    121900 non-null float64
command_timeout                  121900 non-null float64
current_pending_sector_count     121900 non-null float64
uncorrectable_sector_count       121900 non-null float64
early_failure                    121900 non-null int64
smart_5_nonzero                  121900 non-null bool
smart_187_nonzero                121900 non-null bool
smart_188_nonzero                121900 non-null bool
smart_197_nonzero                121900 non-null b

In [184]:
def encode_hot_X(train, col_name):
    encoded_values = sorted(list(train[col_name].unique()))

    # Integer Encoding
    int_encoder = LabelEncoder()
    train.encoded = int_encoder.fit_transform(train[col_name])
#     test.encoded = int_encoder.transform(test[col_name])

    # create 2D np arrays of the encoded variable (in train and test)
    train_array = np.array(train.encoded).reshape(len(train.encoded),1)
#     test_array = np.array(test.encoded).reshape(len(test.encoded),1)

    # One Hot Encoding
    ohe = OneHotEncoder(sparse=False, categories='auto')
    train_ohe = ohe.fit_transform(train_array)
#     test_ohe = ohe.transform(test_array)

    # Turn the array of new values into a data frame with columns names being the values
    # and index matching that of train/test
    # then merge the new dataframe with the existing train/test dataframe
    train_encoded = pd.DataFrame(data=train_ohe, columns=encoded_values, index=train.index)
    train = train.join(train_encoded)

#     test_encoded = pd.DataFrame(data=test_ohe, columns=encoded_values, index=test.index)
#     test = test.join(test_encoded)

    return train

In [186]:
df_1 = encode_hot_X(df, col_name= "model")

In [170]:
def split_my_data(df):
    X = df.drop(columns = ['serial_number', 'failure', 'model', 'early_failure', 'drive_age_in_years', 'reallocated_sectors_count', 'reported_uncorrectable_errors', 'command_timeout', 'current_pending_sector_count', 'uncorrectable_sector_count'])
    y = df[['early_failure']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .80, random_state = 123, stratify=df.early_failure)
    return X, y, X_train, X_test, y_train, y_test

In [187]:
X, y, X_train, X_test, y_train, y_test = split_my_data(df_1)

In [188]:
len(X_train), len(y_train), len(X_test), len(y_test)

(97520, 97520, 24380, 24380)

In [189]:
X.columns

Index(['manufacturer', 'model', 'capacity_terabytes', 'smart_5_nonzero',
       'smart_187_nonzero', 'smart_188_nonzero', 'smart_197_nonzero',
       'smart_198_nonzero', 'HGST HDS5C4040ALE630', 'HGST HDS724040ALE640',
       'HGST HMS5C4040ALE640', 'HGST HMS5C4040BLE640', 'HGST HUH721212ALE600',
       'HGST HUH721212ALN604', 'HGST HUH728080ALE600', 'HGST HUS726040ALE610',
       'Hitachi HDS5C3030ALA630', 'Hitachi HDS5C3030BLE630',
       'Hitachi HDS5C4040ALE630', 'Hitachi HDS722020ALA330',
       'Hitachi HDS723020BLA642', 'Hitachi HDS723030ALA640',
       'Hitachi HDS723030BLE640', 'Hitachi HDS724040ALE640',
       'Hitachi HDT725025VLA380', 'ST10000NM0086', 'ST1000LM024 HN',
       'ST12000NM0007', 'ST12000NM0117', 'ST2000VN000', 'ST250LM004 HN',
       'ST250LT007', 'ST31500341AS', 'ST31500541AS', 'ST3160316AS',
       'ST3160318AS', 'ST320LT007', 'ST33000651AS', 'ST3500320AS',
       'ST4000DM000', 'ST4000DM001', 'ST4000DM005', 'ST4000DX000',
       'ST4000DX002', 'ST500LM012 H

In [190]:
X_train.columns

Index(['manufacturer', 'model', 'capacity_terabytes', 'smart_5_nonzero',
       'smart_187_nonzero', 'smart_188_nonzero', 'smart_197_nonzero',
       'smart_198_nonzero', 'HGST HDS5C4040ALE630', 'HGST HDS724040ALE640',
       'HGST HMS5C4040ALE640', 'HGST HMS5C4040BLE640', 'HGST HUH721212ALE600',
       'HGST HUH721212ALN604', 'HGST HUH728080ALE600', 'HGST HUS726040ALE610',
       'Hitachi HDS5C3030ALA630', 'Hitachi HDS5C3030BLE630',
       'Hitachi HDS5C4040ALE630', 'Hitachi HDS722020ALA330',
       'Hitachi HDS723020BLA642', 'Hitachi HDS723030ALA640',
       'Hitachi HDS723030BLE640', 'Hitachi HDS724040ALE640',
       'Hitachi HDT725025VLA380', 'ST10000NM0086', 'ST1000LM024 HN',
       'ST12000NM0007', 'ST12000NM0117', 'ST2000VN000', 'ST250LM004 HN',
       'ST250LT007', 'ST31500341AS', 'ST31500541AS', 'ST3160316AS',
       'ST3160318AS', 'ST320LT007', 'ST33000651AS', 'ST3500320AS',
       'ST4000DM000', 'ST4000DM001', 'ST4000DM005', 'ST4000DX000',
       'ST4000DX002', 'ST500LM012 H

# Encode

In [191]:
X_train, X_test = model.encode_hot(X_train, X_test, col_name = 'manufacturer')

In [192]:
X_train = X_train.drop(columns = 'manufacturer')

In [193]:
X_test = X_test.drop(columns = 'manufacturer')

In [194]:
X_train.head()

Unnamed: 0,model,capacity_terabytes,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero,HGST HDS5C4040ALE630,HGST HDS724040ALE640,HGST HMS5C4040ALE640,...,WDC WD800AAJB,WDC WD800AAJS,WDC WD800BB,WDC WD800JB,WDC WD800JD,WDC WD800LB,Hitachi,Seagate,Toshiba,Western Digital
116194,ST8000NM0055,8.0,False,False,False,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
142942,ST4000DM000,4.0,False,False,False,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
97790,ST12000NM0007,12.0,False,False,False,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
125503,ST12000NM0007,12.0,False,False,False,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
162599,ST6000DX000,6.0,False,False,False,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [199]:
X_train.columns

Index(['capacity_terabytes', 'smart_5_nonzero', 'smart_187_nonzero',
       'smart_188_nonzero', 'smart_197_nonzero', 'smart_198_nonzero',
       'HGST HDS5C4040ALE630', 'HGST HDS724040ALE640', 'HGST HMS5C4040ALE640',
       'HGST HMS5C4040BLE640', 'HGST HUH721212ALE600', 'HGST HUH721212ALN604',
       'HGST HUH728080ALE600', 'HGST HUS726040ALE610',
       'Hitachi HDS5C3030ALA630', 'Hitachi HDS5C3030BLE630',
       'Hitachi HDS5C4040ALE630', 'Hitachi HDS722020ALA330',
       'Hitachi HDS723020BLA642', 'Hitachi HDS723030ALA640',
       'Hitachi HDS723030BLE640', 'Hitachi HDS724040ALE640',
       'Hitachi HDT725025VLA380', 'ST10000NM0086', 'ST1000LM024 HN',
       'ST12000NM0007', 'ST12000NM0117', 'ST2000VN000', 'ST250LM004 HN',
       'ST250LT007', 'ST31500341AS', 'ST31500541AS', 'ST3160316AS',
       'ST3160318AS', 'ST320LT007', 'ST33000651AS', 'ST3500320AS',
       'ST4000DM000', 'ST4000DM001', 'ST4000DM005', 'ST4000DX000',
       'ST4000DX002', 'ST500LM012 HN', 'ST500LM030', 'ST6000

In [196]:
X_train = X_train.drop(columns = 'model')

In [197]:
X_test = X_test.drop(columns = 'model')

In [207]:
X_train = X_train.drop(columns = 'capacity_terabytes')

In [208]:
X_test = X_test.drop(columns = 'capacity_terabytes')

In [209]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(97520, 99) (24380, 99) (97520, 1) (24380, 1)


# Modeling - SVM

#### Training the Algorithm

The fit method of SVC class is called to train the algorithm on the training data, which is passed as a parameter to the fit method.

In [78]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear', probability = True)
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

#### Making Predictions

To make predictions, the predict method of the SVC class is used.

In [79]:
y_pred = svclassifier.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [80]:
y_pred_proba = svclassifier.predict_proba(X_test)
y_pred_proba

array([[0.99125665, 0.00874335],
       [0.97302642, 0.02697358],
       [0.97302642, 0.02697358],
       ...,
       [0.99125665, 0.00874335],
       [0.97302642, 0.02697358],
       [0.99125665, 0.00874335]])

In [31]:
#y_train['prediction'] = y_pred

In [32]:
#y_train["pred_prob_1"] = pd.DataFrame(y_pred_proba).iloc[:,1]

In [33]:
#y_train [y_train.early_failure == 1].sort_values('pred_prob_1', ascending=False).dropna()

#### Evaluating the Algorithm

Scikit-Learn's metrics library contains the classification_report and confusion_matrix methods, which can be readily used to find out the values for these important metrics.

In [81]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[23860     0]
 [  520     0]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     23860
           1       0.00      0.00      0.00       520

    accuracy                           0.98     24380
   macro avg       0.49      0.50      0.49     24380
weighted avg       0.96      0.98      0.97     24380



# Drop Terabytes

In [42]:
#X_train = X_train.drop(columns='capacity_terabytes')

Unnamed: 0,smart_5_nonzero,smart_187_nonzero,smart_188_nonzero,smart_197_nonzero,smart_198_nonzero,Hitachi,Seagate,Toshiba,Western Digital
116194,False,False,False,False,False,0.0,1.0,0.0,0.0
142942,False,False,False,False,False,0.0,1.0,0.0,0.0
97790,False,False,False,False,False,0.0,1.0,0.0,0.0
125503,False,False,False,False,False,0.0,1.0,0.0,0.0
162599,False,False,False,False,False,0.0,1.0,0.0,0.0
138013,False,False,False,False,False,0.0,1.0,0.0,0.0
122513,False,False,False,False,False,0.0,1.0,0.0,0.0
165062,False,False,False,False,False,1.0,0.0,0.0,0.0
29086,False,False,False,False,False,0.0,1.0,0.0,0.0
51897,False,False,False,False,False,1.0,0.0,0.0,0.0


In [56]:
X_test = X_test.drop(columns='capacity_terabytes')

# Linear Support Vector Regressor

In [47]:
regr = LinearSVR(random_state=123, tol=1e-5, loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)
regr.fit(X_train, y_train)
y_pred_SVR = regr.predict(X_train)
print(mean_squared_error(y_train, y_pred_SVR)**1/2)

0.009081614104124643


In [48]:
y_pred_SVR

array([0.01144464, 0.01144464, 0.01144464, ..., 0.01144464, 0.00065013,
       0.00065013])

In [49]:
# print(confusion_matrix(y_train,y_pred_SVR))
# print(classification_report(y_train,y_pred_SVR))

In [75]:
svclassifier = SVC(kernel='poly', degree=8, probability=True)
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=8, gamma='auto_deprecated',
    kernel='poly', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [76]:
y_pred = svclassifier.predict(X_test)


In [64]:
#y_pred_proba = svclassifier.predict_proba(X_train)
#y_pred_proba

array([[0.02907984, 0.97092016],
       [0.02907984, 0.97092016],
       [0.02907984, 0.97092016],
       ...,
       [0.02907984, 0.97092016],
       [0.02914954, 0.97085046],
       [0.02914954, 0.97085046]])

In [77]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[23860     0]
 [  520     0]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     23860
           1       0.00      0.00      0.00       520

    accuracy                           0.98     24380
   macro avg       0.49      0.50      0.49     24380
weighted avg       0.96      0.98      0.97     24380



In [71]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

2078

# Gaussian Kernel

In [83]:
svclassifier = SVC(kernel='rbf')
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [84]:
y_pred = svclassifier.predict(X_test)

In [85]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[23860     0]
 [  520     0]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     23860
           1       0.00      0.00      0.00       520

    accuracy                           0.98     24380
   macro avg       0.49      0.50      0.49     24380
weighted avg       0.96      0.98      0.97     24380



# Sigmoid Kernel

# Gamma = .1

In [96]:
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = .1)
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [97]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [98]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.98280128, 0.01719872],
       [0.98280128, 0.01719872],
       [0.98280128, 0.01719872],
       ...,
       [0.98280128, 0.01719872],
       [0.98280128, 0.01719872],
       [0.98280128, 0.01719872]])

In [99]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[94947   495]
 [ 1839   239]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     95442
           1       0.33      0.12      0.17      2078

    accuracy                           0.98     97520
   macro avg       0.65      0.55      0.58     97520
weighted avg       0.97      0.98      0.97     97520



# Gamma = 1

In [100]:
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 1)
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [101]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [102]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.9781057 , 0.0218943 ],
       [0.9781057 , 0.0218943 ],
       [0.9781057 , 0.0218943 ],
       ...,
       [0.9781057 , 0.0218943 ],
       [0.97961512, 0.02038488],
       [0.97961512, 0.02038488]])

In [103]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[94297  1145]
 [ 1847   231]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     95442
           1       0.17      0.11      0.13      2078

    accuracy                           0.97     97520
   macro avg       0.57      0.55      0.56     97520
weighted avg       0.96      0.97      0.97     97520



# Gamma = 10

In [104]:
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10)
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [105]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [106]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97823105, 0.02176895],
       [0.97823105, 0.02176895],
       [0.97823105, 0.02176895],
       ...,
       [0.97823105, 0.02176895],
       [0.97845622, 0.02154378],
       [0.97845622, 0.02154378]])

In [107]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[93957  1485]
 [ 1713   365]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     95442
           1       0.20      0.18      0.19      2078

    accuracy                           0.97     97520
   macro avg       0.59      0.58      0.58     97520
weighted avg       0.97      0.97      0.97     97520



# Gamma = 100

In [108]:
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100)
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [109]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [110]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97818774, 0.02181226],
       [0.97818774, 0.02181226],
       [0.97818774, 0.02181226],
       ...,
       [0.97818774, 0.02181226],
       [0.97865518, 0.02134482],
       [0.97865518, 0.02134482]])

In [111]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[93957  1485]
 [ 1713   365]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     95442
           1       0.20      0.18      0.19      2078

    accuracy                           0.97     97520
   macro avg       0.59      0.58      0.58     97520
weighted avg       0.97      0.97      0.97     97520



# Cost = 10

In [117]:
svclassifier = SVC(kernel='sigmoid', probability = True, C= 10)
svclassifier.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [118]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [119]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.98303983, 0.01696017],
       [0.98303983, 0.01696017],
       [0.98303983, 0.01696017],
       ...,
       [0.98303983, 0.01696017],
       [0.97488422, 0.02511578],
       [0.97488422, 0.02511578]])

In [120]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[95384    58]
 [ 2071     7]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     95442
           1       0.11      0.00      0.01      2078

    accuracy                           0.98     97520
   macro avg       0.54      0.50      0.50     97520
weighted avg       0.96      0.98      0.97     97520



# Cost = 100

In [129]:
svclassifier = SVC(kernel='sigmoid', probability = True, C=100)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [130]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [131]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.98341927, 0.01658073],
       [0.98341927, 0.01658073],
       [0.98341927, 0.01658073],
       ...,
       [0.98341927, 0.01658073],
       [0.98341933, 0.01658067],
       [0.98341933, 0.01658067]])

In [132]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[94913   529]
 [ 1819   259]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     95442
           1       0.33      0.12      0.18      2078

    accuracy                           0.98     97520
   macro avg       0.65      0.56      0.58     97520
weighted avg       0.97      0.98      0.97     97520



# Cost = 1000

In [133]:
svclassifier = SVC(kernel='sigmoid', probability = True, C = 1000)
svclassifier.fit(X_train, y_train)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [134]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [135]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.98323408, 0.01676592],
       [0.98323408, 0.01676592],
       [0.98323408, 0.01676592],
       ...,
       [0.98323408, 0.01676592],
       [0.98323409, 0.01676591],
       [0.98323409, 0.01676591]])

In [136]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[94913   529]
 [ 1819   259]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     95442
           1       0.33      0.12      0.18      2078

    accuracy                           0.98     97520
   macro avg       0.65      0.56      0.58     97520
weighted avg       0.97      0.98      0.97     97520



# Combining Cost & Gamma

# C = 100 & G = 100

In [137]:
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 100)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [138]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [139]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97816809, 0.02183191],
       [0.97816809, 0.02183191],
       [0.97816809, 0.02183191],
       ...,
       [0.97816809, 0.02183191],
       [0.97870776, 0.02129224],
       [0.97870776, 0.02129224]])

In [140]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[93957  1485]
 [ 1713   365]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     95442
           1       0.20      0.18      0.19      2078

    accuracy                           0.97     97520
   macro avg       0.59      0.58      0.58     97520
weighted avg       0.97      0.97      0.97     97520



# C = 1000 & G = 100

In [142]:
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 1000)
svclassifier.fit(X_train, y_train)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [143]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [144]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97821849, 0.02178151],
       [0.97821849, 0.02178151],
       [0.97821849, 0.02178151],
       ...,
       [0.97821849, 0.02178151],
       [0.97850869, 0.02149131],
       [0.97850869, 0.02149131]])

In [145]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[93957  1485]
 [ 1713   365]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     95442
           1       0.20      0.18      0.19      2078

    accuracy                           0.97     97520
   macro avg       0.59      0.58      0.58     97520
weighted avg       0.97      0.97      0.97     97520



# Remove Terabytes & OHE Model to Train

In [211]:
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 1000)
svclassifier.fit(X_train, y_train)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [212]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [213]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.9766556 , 0.0233444 ],
       [0.9766556 , 0.0233444 ],
       [0.9766556 , 0.0233444 ],
       ...,
       [0.9766556 , 0.0233444 ],
       [0.89589671, 0.10410329],
       [0.89589671, 0.10410329]])

In [214]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[93957  1485]
 [ 1713   365]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     95442
           1       0.20      0.18      0.19      2078

    accuracy                           0.97     97520
   macro avg       0.59      0.58      0.58     97520
weighted avg       0.97      0.97      0.97     97520



# Attempting to adjust for Class Weight

### weights = {0: 1, 1: 2.5}, G = 100, C = 100

In [218]:
weights = {0: 1, 1: 2.5}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 2.5}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [219]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [220]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97814459, 0.02185541],
       [0.97814459, 0.02185541],
       [0.97814459, 0.02185541],
       ...,
       [0.97814459, 0.02185541],
       [0.97835861, 0.02164139],
       [0.97835861, 0.02164139]])

In [221]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[93280  2162]
 [ 1697   381]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     95442
           1       0.15      0.18      0.16      2078

    accuracy                           0.96     97520
   macro avg       0.57      0.58      0.57     97520
weighted avg       0.96      0.96      0.96     97520



# weights = {0: 1, 1: 50}, G = 10, C = 10

# BEST SO FAR

In [222]:
weights = {0: 1, 1: 50}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 10, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight={0: 1, 1: 50}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [223]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [224]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97885837, 0.02114163],
       [0.97885837, 0.02114163],
       [0.97885837, 0.02114163],
       ...,
       [0.97885837, 0.02114163],
       [0.97885837, 0.02114163],
       [0.97885837, 0.02114163]])

In [225]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[90737  4705]
 [  885  1193]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     95442
           1       0.20      0.57      0.30      2078

    accuracy                           0.94     97520
   macro avg       0.60      0.76      0.63     97520
weighted avg       0.97      0.94      0.96     97520



### weights = {0: 11, 1: .89}, G = 10, C = 10

In [226]:
weights = {0: .11, 1: .89}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 10, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight={0: 0.11, 1: 0.89}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [227]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [228]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97836716, 0.02163284],
       [0.97836716, 0.02163284],
       [0.97836716, 0.02163284],
       ...,
       [0.97836716, 0.02163284],
       [0.97778547, 0.02221453],
       [0.97778547, 0.02221453]])

In [229]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[94435  1007]
 [ 1979    99]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     95442
           1       0.09      0.05      0.06      2078

    accuracy                           0.97     97520
   macro avg       0.53      0.52      0.52     97520
weighted avg       0.96      0.97      0.96     97520



### weights = {0: 1, 1: 250}, G = 10, C = 100

In [231]:
weights = {0: 1, 1: 250}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 250}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [232]:
y_pred = svclassifier.predict(X_train)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [233]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97961093, 0.02038907],
       [0.97961093, 0.02038907],
       [0.97961093, 0.02038907],
       ...,
       [0.97961093, 0.02038907],
       [0.97961093, 0.02038907],
       [0.97961093, 0.02038907]])

In [234]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[ 3519 91923]
 [  127  1951]]
              precision    recall  f1-score   support

           0       0.97      0.04      0.07     95442
           1       0.02      0.94      0.04      2078

    accuracy                           0.06     97520
   macro avg       0.49      0.49      0.06     97520
weighted avg       0.95      0.06      0.07     97520



# Trying new options

## weights = {0: 1, 1: 50}, G = 10, C = 100

# Best result and increase Cost == Same Results

In [235]:
weights = {0: 1, 1: 50}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 50}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [236]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [237]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97879025, 0.02120975],
       [0.97879025, 0.02120975],
       [0.97879025, 0.02120975],
       ...,
       [0.97879025, 0.02120975],
       [0.97879025, 0.02120975],
       [0.97879025, 0.02120975]])

In [238]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[90737  4705]
 [  885  1193]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     95442
           1       0.20      0.57      0.30      2078

    accuracy                           0.94     97520
   macro avg       0.60      0.76      0.63     97520
weighted avg       0.97      0.94      0.96     97520



## weights = {0: 1, 1: 50}, G = 100, C = 100

In [239]:
weights = {0: 1, 1: 50}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 100, C = 100, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight={0: 1, 1: 50}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [240]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [241]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97898418, 0.02101582],
       [0.97898418, 0.02101582],
       [0.97898418, 0.02101582],
       ...,
       [0.97898418, 0.02101582],
       [0.97898418, 0.02101582],
       [0.97898418, 0.02101582]])

In [242]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[90737  4705]
 [  885  1193]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     95442
           1       0.20      0.57      0.30      2078

    accuracy                           0.94     97520
   macro avg       0.60      0.76      0.63     97520
weighted avg       0.97      0.94      0.96     97520



## weights = {0: 1, 1: 75}, G = 10, C = 10

In [243]:
weights = {0: 1, 1: 75}
svclassifier = SVC(kernel='sigmoid', probability = True, gamma = 10, C = 10, class_weight = weights)
svclassifier.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight={0: 1, 1: 75}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [244]:
y_pred = svclassifier.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [245]:
y_pred_proba = svclassifier.predict_proba(X_train)
y_pred_proba

array([[0.97875337, 0.02124663],
       [0.97875337, 0.02124663],
       [0.97875337, 0.02124663],
       ...,
       [0.97875337, 0.02124663],
       [0.97875337, 0.02124663],
       [0.97875337, 0.02124663]])

In [246]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[90723  4719]
 [  884  1194]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     95442
           1       0.20      0.57      0.30      2078

    accuracy                           0.94     97520
   macro avg       0.60      0.76      0.63     97520
weighted avg       0.97      0.94      0.96     97520

