In [1]:
import pandas as pd
import numpy as np
import re

import acquire
import prepare
import explore
import model

import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt 


import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.metrics import recall_score

Use this to drop normalized columns if necessary:


    columns = []
    for i in df.columns:
        columns.append(i)
    columns = str(columns)
    normalized_columns = re.findall(r'(smart_\d+_normalized)', columns)
    df.drop(columns=normalized_columns, inplace=True)

In [2]:
df = pd.read_csv('hard_drives_smart_5.csv')

In [3]:
df.shape

(169073, 11)

In [4]:
# Remove the 'Unnamed: 0' column (presumably the index saved by an earlier
# to_csv — confirm). Reassigning with errors='ignore' instead of an in-place
# drop keeps this cell safe to re-run: a second run is a no-op rather than a
# KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
df.head(10)

Unnamed: 0,serial_number,model,capacity_bytes,max(failure),max(smart_9_raw),max(smart_5_raw),max(smart_187_raw),max(smart_188_raw),max(smart_197_raw),max(smart_198_raw)
0,PL1311LAG1SJAA,Hitachi HDS5C4040ALE630,4000787030016,0,43819.0,0.0,,,0.0,0.0
1,Z305KB36,ST4000DM000,4000787030016,0,31045.0,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi HDS5C3030ALA630,3000592982016,0,41668.0,0.0,,,0.0,0.0
3,ZA11NHSN,ST8000DM002,8001563222016,0,26284.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi HDS5C3030ALA630,3000592982016,0,47994.0,0.0,,,0.0,0.0
5,Z305D5VF,ST4000DM000,4000787030016,0,31883.0,0.0,0.0,0.0,0.0,0.0
6,Z3015L2E,ST4000DM000,4000787030016,0,36768.0,0.0,0.0,0.0,0.0,0.0
7,PL1331LAHDYJYH,HGST HMS5C4040BLE640,4000787030016,0,30144.0,0.0,,,0.0,0.0
8,Z304LAHP,ST4000DM000,4000787030016,0,33885.0,0.0,0.0,0.0,0.0,0.0
9,Z302B0GK,ST4000DM000,4000787030016,0,36443.0,0.0,0.0,0.0,0.0,0.0


# Prepare

In [6]:
df = prepare.prepare(df)

In [7]:
type(df)

pandas.core.frame.DataFrame

In [8]:
# NOTE(review): in the saved run this cell raised
#   KeyError: '[68273 50408 31687 20823  4177 78104 95249] not found in axis'
# and every later cell is unexecuted. That is the message pandas emits when
# drop() is given labels that are absent — presumably prepare.unique drops rows
# by index label and those labels no longer exist in df (e.g. the cell was
# re-run on an already-reduced frame, or prepare.prepare changed the index).
# TODO: confirm and fix in prepare.py so Restart & Run All succeeds.
df = prepare.unique(df)

KeyError: '[68273 50408 31687 20823  4177 78104 95249] not found in axis'

In [None]:
df = prepare.treat_nulls(df)

In [None]:
df.failure.value_counts()

In [None]:
df = explore.old_or_fail(df)

In [None]:
df.failure.value_counts()

In [None]:
df.head()

# Explore

In [None]:
df = explore.early_failure(df)

In [None]:
df = explore.get_quartile(df)

In [None]:
df = explore.remove_manufacturers(df)

In [None]:
df = explore.make_binary_values(df)

In [None]:
df.head()

In [None]:
df.failure.value_counts()

In [None]:
df.early_failure.value_counts()

## Manufacturers

- There are 7 different manufacturers
- Seagate and Hitachi make up 97% of all hard drives
- There are 103 unique hard drive models. 43 of these models have fewer than 10 hard drives. 72 models have fewer than 100.

In [None]:
df['manufacturer'].nunique()

In [None]:
df.shape

In [None]:
# How many hard drives per manufacturer?
df['manufacturer'].value_counts()

In [None]:
# Share (%) of all drives that belong to the two largest manufacturers.
# Computed from df instead of hand-copied counts so the figure cannot go stale
# when the upstream preparation steps change the row count.
top_two_drive_count = df['manufacturer'].value_counts().nlargest(2).sum()
top_two_drive_count / len(df) * 100

In [None]:
# How many unique models?
df['model'].nunique()

In [None]:
# How many hard drives models with less than 10 hard drives?
(df['model'].value_counts()  < 10).sum()

In [None]:
# How many hard drives by individual model?
df['model'].value_counts()

In [None]:
# Average drive age by manufacturer
manu = df[['manufacturer', 'drive_age_in_years']]
manu.groupby('manufacturer',sort=True).mean()

### Failures vs. total by model number

In [None]:
agg_model = df.groupby(['model']).agg({'early_failure' : 'sum', 'model' : 'count', 'drive_age_in_years':'mean'})
agg_model.rename(columns={'early_failure':'early_failures', 'model':'total_count'}, inplace=True)

In [None]:
agg_model['early_failure_rate_percent'] = agg_model['early_failures'] / agg_model['total_count'] * 100
agg_model.sort_values(by=['early_failure_rate_percent'], ascending = False)

In [None]:
agg_model.sort_values(by=['total_count'], ascending = False)

In [None]:
agg_model [agg_model.total_count > 100].sort_values('early_failure_rate_percent')

In [None]:
# Mean drive age per model. Titled and labelled like the manufacturer charts
# further down so the figure can be read on its own.
plt.scatter(x=agg_model.index, y=agg_model.drive_age_in_years)
plt.title('Average Drive Age (Years) by Model')
plt.xlabel('model')
plt.ylabel('average drive age (years)')
# The x values are model-name strings; rotate them so the labels do not overlap.
plt.xticks(rotation=90);

In [None]:
# Early-failure rate per model. Titled and labelled like the manufacturer
# charts further down so the figure can be read on its own.
plt.scatter(x=agg_model.index, y=agg_model.early_failure_rate_percent)
plt.title('Early Failure Rate (%) by Model')
plt.xlabel('model')
plt.ylabel('early failure rate (%)')
# The x values are model-name strings; rotate them so the labels do not overlap.
plt.xticks(rotation=90);

In [None]:
#failures = df[df.failure == 1]
#non_failures = df[df.failure == 0]

In [None]:
(df.model.value_counts() > 50).sum()

In [None]:
df.model.nunique()

In [None]:
df.shape

In [None]:
train, test = train_test_split(df,train_size =.80, random_state = 123)

In [None]:
train.shape


In [None]:
test.shape

### Aggregation by Manufacturer

In [None]:
# Distinct manufacturers present in df at this point.
# NOTE(review): an earlier comment here said 6, while the "Manufacturers"
# summary above says 7 — presumably one count predates
# explore.remove_manufacturers; confirm against the output of this cell.
df.manufacturer.unique()

In [None]:
manufacturers = df.groupby(['manufacturer']).agg({'early_failure' : 'sum', 'manufacturer' : 'count', 'drive_age_in_years':'mean'})
manufacturers.rename(columns={'early_failure':'early_failures', 'manufacturer':'total_count'}, inplace=True)

In [None]:
manufacturers['early_failure_rate_percent'] = manufacturers['early_failures'] / manufacturers['total_count'] * 100
manufacturers.sort_values(by=['early_failure_rate_percent'], ascending = False)

In [None]:
plt.scatter(x=manufacturers.index, y=manufacturers.drive_age_in_years)
plt.title('Average Drive Age (Years) by Manufacturer')
plt.xticks(rotation=20)

In [None]:
plt.scatter(x=manufacturers.index, y=manufacturers.early_failure_rate_percent)
plt.title('Early Failure Rate (%) by Manufacturer')
plt.xticks(rotation=20)

# Model

In [None]:
df.head()

In [None]:
# Split data
X_train, X_test, y_train, y_test = model.split_my_data(df)

In [None]:
# Drop the columns that are not used as model features.
# NOTE(review): this is not purely a "non-numeric" drop — drive_age_in_years is
# numeric (it is averaged earlier in the notebook); presumably it is excluded
# because it would leak the target. Confirm.
# A list (not a set) keeps the label order explicit, and reassigning instead of
# inplace=True avoids mutating the frames returned by the split.
non_feature_columns = ['serial_number', 'model', 'quartile', 'drive_age_in_years']
X_train = X_train.drop(columns=non_feature_columns)
X_test = X_test.drop(columns=non_feature_columns)

In [None]:
X_train.columns

#### One-hot encode manufacturer column

In [None]:
X_train, X_test = model.encode_hot(X_train, X_test, 'manufacturer')

In [None]:
X_train.drop(columns='manufacturer', inplace = True)
X_test.drop(columns='manufacturer', inplace = True)

K-Nearest Neighbors with N = 3 neighbors

In [None]:
# Create: 3-nearest-neighbours classifier whose votes are weighted by distance
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
# Fit on the training features / labels
knn.fit(X_train, y_train)
# Predict the same rows the model was fitted on
pred = knn.predict(X_train)
# Evaluate
# NOTE(review): with distance weighting every training row is its own
# zero-distance neighbour, so training-set scores are expected to be
# near-perfect and say little about generalisation — confirm.
train_accuracy = knn.score(X_train, y_train)
print('Accuracy of k nearest neighbor on training set: {:.5f}'
     .format(train_accuracy))
# Confusion matrix (kept in `cm`; not displayed by this cell)
cm = confusion_matrix(y_train, pred)
# Classification report as plain text
cr = classification_report(y_train, pred, output_dict=False)
print("Classification Report: \n\n", cr)

In [None]:
# Run Model on test data
test_pred=knn.predict(X_test)

In [None]:
# Evaluate the fitted classifier on the held-out test rows
test_accuracy = knn.score(X_test, y_test)
print('Accuracy of k nearest neighbor on test set: {:.5f}'
     .format(test_accuracy))
# Confusion matrix (kept in `tcm`; not displayed by this cell)
tcm = confusion_matrix(y_test, test_pred)
# Classification report as plain text
tcr = classification_report(y_test, test_pred, output_dict=False)
print("Classification Report: \n\n", tcr)

In [None]:
recall_score(y_test, test_pred)

In [None]:
# Tune k on a validation split carved out of the training data. Scoring the
# candidates on X_test would let the test set choose the model, which makes any
# test accuracy reported for the chosen k optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, train_size=.80, random_state=123)

k_range = range(3, 6)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k, weights = 'distance')
    knn.fit(X_fit, y_fit)
    # scores[i] is the validation accuracy obtained with k_range[i]
    scores.append(knn.score(X_val, y_val))

In [None]:
min(k_range)

In [None]:
# Accuracy obtained for each candidate k.
plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
# Tick exactly the k values that were evaluated; the previous fixed ticks
# (0, 5, 10, 15, 20) stretched the axis far beyond the searched range.
plt.xticks(list(k_range))
plt.title('Accuracy for different K values');

In [None]:
# scores[i] belongs to k_range[i], so the position of the best score must be
# mapped back through k_range; printing the raw list index would report
# 0, 1, 2 instead of the actual k values.
best_score = max(scores)
best_k = k_range[scores.index(best_score)]
print(f'The best accuracy is at k= {best_k} with an accuracy score of {best_score}')

In [None]:
X_train.columns

In [None]:
X_train.describe()