In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


import acquire
import prepare
import explore
import model

### Acquire

In [2]:
df = acquire.acquire_agg_data()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169073 entries, 0 to 169072
Data columns (total 10 columns):
serial_number         169072 non-null object
model                 169073 non-null object
capacity_bytes        169073 non-null int64
max(failure)          169073 non-null int64
max(smart_9_raw)      161975 non-null float64
max(smart_5_raw)      161851 non-null float64
max(smart_187_raw)    104189 non-null float64
max(smart_188_raw)    104179 non-null float64
max(smart_197_raw)    161841 non-null float64
max(smart_198_raw)    161841 non-null float64
dtypes: float64(6), int64(2), object(2)
memory usage: 12.9+ MB


In [4]:
df.head()

Unnamed: 0,serial_number,model,capacity_bytes,max(failure),max(smart_9_raw),max(smart_5_raw),max(smart_187_raw),max(smart_188_raw),max(smart_197_raw),max(smart_198_raw)
0,PL1311LAG1SJAA,Hitachi HDS5C4040ALE630,4000787030016,0,43819.0,0.0,,,0.0,0.0
1,Z305KB36,ST4000DM000,4000787030016,0,31045.0,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi HDS5C3030ALA630,3000592982016,0,41668.0,0.0,,,0.0,0.0
3,ZA11NHSN,ST8000DM002,8001563222016,0,26284.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi HDS5C3030ALA630,3000592982016,0,47994.0,0.0,,,0.0,0.0


### Prepare

In [5]:
df = prepare.prepare(df)

In [6]:
df.head()

Unnamed: 0,serial_number,manufacturer,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4001.0,0,5.0,0.0,,,0.0,0.0
1,Z305KB36,Seagate,ST4000DM000,4001.0,0,3.5,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,4.8,0.0,,,0.0,0.0
3,ZA11NHSN,Seagate,ST8000DM002,8002.0,0,3.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,5.5,0.0,,,0.0,0.0


### SMART_stat_5 
- reallocated

In [7]:
#df.reallocated_sectors_count.plot()

In [8]:
#df.reallocated_sectors_count.plot()

In [9]:
#sns.scatterplot(x='capacity_gigabytes',y='drive_age_in_years',hue='failure',sizes=(20,200),data=df)


In [10]:
# drive age is 0 for 2779 observation

df['drive_age_in_years'].loc[lambda x: x== 0].index

Int64Index([   593,    748,    755,    763,    769,    798,    821,   1549,
              1617,   1618,
            ...
            168993, 169011, 169016, 169017, 169025, 169028, 169038, 169047,
            169050, 169052],
           dtype='int64', length=2779)

In [11]:
# Max age but drive has failed

df.iloc[1288]

serial_number                    WD-WCADW2290554
manufacturer                     Western Digital
model                                WDC WD800LB
capacity_gigabytes                            80
failure                                        1
drive_age_in_years                          10.3
reallocated_sectors_count                      0
reported_uncorrectable_errors                NaN
command_timeout                              NaN
current_pending_sector_count                   0
uncorrectable_sector_count                     0
Name: 1288, dtype: object

In [12]:
# Why is 0 listed in age???

df.iloc[169025]

serial_number                         ZJV3BFK8
manufacturer                           Seagate
model                            ST12000NM0007
capacity_gigabytes                       12000
failure                                      0
drive_age_in_years                           0
reallocated_sectors_count                    0
reported_uncorrectable_errors                0
command_timeout                              0
current_pending_sector_count                 0
uncorrectable_sector_count                   0
Name: 169025, dtype: object

In [13]:
(df.drive_age_in_years == 0).sum()

2779

In [14]:
top_model = [df.model=='ST12000NM0007']

In [15]:
top_model = pd.DataFrame(top_model) 

In [16]:
df

Unnamed: 0,serial_number,manufacturer,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4001.0,0,5.0,0.0,,,0.0,0.0
1,Z305KB36,Seagate,ST4000DM000,4001.0,0,3.5,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,4.8,0.0,,,0.0,0.0
3,ZA11NHSN,Seagate,ST8000DM002,8002.0,0,3.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,5.5,0.0,,,0.0,0.0
5,Z305D5VF,Seagate,ST4000DM000,4001.0,0,3.6,0.0,0.0,0.0,0.0,0.0
6,Z3015L2E,Seagate,ST4000DM000,4001.0,0,4.2,0.0,0.0,0.0,0.0,0.0
7,PL1331LAHDYJYH,Hitachi,HGST HMS5C4040BLE640,4001.0,0,3.4,0.0,,,0.0,0.0
8,Z304LAHP,Seagate,ST4000DM000,4001.0,0,3.9,0.0,0.0,0.0,0.0,0.0
9,Z302B0GK,Seagate,ST4000DM000,4001.0,0,4.2,0.0,0.0,0.0,0.0,0.0


In [17]:
df.capacity_gigabytes.value_counts()

 4001.0     64886
 12000.0    50729
 8002.0     26108
-0.0         7031
 3001.0      6789
 2000.0      4678
 500.0       2547
 6001.0      2411
 14001.0     1302
 10001.0     1255
 1000.0       388
 1500.0       267
 250.0        212
 160.0        209
 320.0        116
 480.0         60
 5001.0        46
 80.0          32
 137.0          7
Name: capacity_gigabytes, dtype: int64

In [18]:
df.head(170)

Unnamed: 0,serial_number,manufacturer,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4001.0,0,5.0,0.0,,,0.0,0.0
1,Z305KB36,Seagate,ST4000DM000,4001.0,0,3.5,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,4.8,0.0,,,0.0,0.0
3,ZA11NHSN,Seagate,ST8000DM002,8002.0,0,3.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,5.5,0.0,,,0.0,0.0
5,Z305D5VF,Seagate,ST4000DM000,4001.0,0,3.6,0.0,0.0,0.0,0.0,0.0
6,Z3015L2E,Seagate,ST4000DM000,4001.0,0,4.2,0.0,0.0,0.0,0.0,0.0
7,PL1331LAHDYJYH,Hitachi,HGST HMS5C4040BLE640,4001.0,0,3.4,0.0,,,0.0,0.0
8,Z304LAHP,Seagate,ST4000DM000,4001.0,0,3.9,0.0,0.0,0.0,0.0,0.0
9,Z302B0GK,Seagate,ST4000DM000,4001.0,0,4.2,0.0,0.0,0.0,0.0,0.0


In [19]:
df.shape

(169073, 11)

In [20]:
df = prepare.unique(df)

In [21]:
df.shape

(162025, 11)

In [22]:
df_fail = pd.DataFrame(df.failure == 1)

In [23]:
type(df_fail)

pandas.core.frame.DataFrame

In [24]:
df_fail.sum()

failure    5922
dtype: int64

In [25]:
df_fail.head()

Unnamed: 0,failure
0,False
1,False
2,False
3,False
4,False


In [26]:
#df.reported_uncorrectable_errors.plot()

In [27]:
#x = df_fail.failure

#plt.scatter(x = df.failure, y = df.reported_uncorrectable_errors, alpha=0.5)
#plt.show()

In [28]:
#for col in df.columns:
#     print(col)
#     print(df[col].value_counts())
#     print()

In [29]:
null = df[['reported_uncorrectable_errors', 'command_timeout']]


In [30]:
(df.reported_uncorrectable_errors == 0).sum()

100428

In [31]:
df.isnull().sum()

serial_number                        1
manufacturer                         0
model                                0
capacity_gigabytes                   0
failure                              0
drive_age_in_years                  60
reallocated_sectors_count          184
reported_uncorrectable_errors    57839
command_timeout                  57849
current_pending_sector_count       194
uncorrectable_sector_count         194
dtype: int64

In [32]:
def treat_nulls(df):
    df.reported_uncorrectable_errors = df.reported_uncorrectable_errors.fillna(value=0)
    df.command_timeout = df.command_timeout.fillna(value=0)
    df = df.dropna(axis=0)
    return df

In [33]:
df = treat_nulls(df)

In [34]:
df.isnull().sum()

serial_number                    0
manufacturer                     0
model                            0
capacity_gigabytes               0
failure                          0
drive_age_in_years               0
reallocated_sectors_count        0
reported_uncorrectable_errors    0
command_timeout                  0
current_pending_sector_count     0
uncorrectable_sector_count       0
dtype: int64

In [35]:
df.shape

(161830, 11)

In [36]:
df.head()

Unnamed: 0,serial_number,manufacturer,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4001.0,0,5.0,0.0,0.0,0.0,0.0,0.0
1,Z305KB36,Seagate,ST4000DM000,4001.0,0,3.5,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,4.8,0.0,0.0,0.0,0.0,0.0
3,ZA11NHSN,Seagate,ST8000DM002,8002.0,0,3.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,5.5,0.0,0.0,0.0,0.0,0.0


In [37]:
find_uniques = df[['model', 'capacity_gigabytes']]
find_uniques.head(10)

Unnamed: 0,model,capacity_gigabytes
0,Hitachi HDS5C4040ALE630,4001.0
1,ST4000DM000,4001.0
2,Hitachi HDS5C3030ALA630,3001.0
3,ST8000DM002,8002.0
4,Hitachi HDS5C3030ALA630,3001.0
5,ST4000DM000,4001.0
6,ST4000DM000,4001.0
7,HGST HMS5C4040BLE640,4001.0
8,ST4000DM000,4001.0
9,ST4000DM000,4001.0


In [38]:
find_uniques.capacity_gigabytes.unique()

array([ 4001.,  3001.,  8002.,   500.,  6001.,  2000.,  1000.,    80.,
       12000., 10001., 14001.,  1500.,   160.,   320.,   250.,  5001.])

In [39]:
find_uniques.model.unique()

array(['Hitachi HDS5C4040ALE630', 'ST4000DM000',
       'Hitachi HDS5C3030ALA630', 'ST8000DM002', 'HGST HMS5C4040BLE640',
       'ST500LM012 HN', 'ST6000DX000', 'HGST HMS5C4040ALE640',
       'WDC WD30EFRX', 'WDC WD5000BPKT', 'Hitachi HDS722020ALA330',
       'WDC WD10EADS', 'Hitachi HDS723030ALA640', 'HGST HDS5C4040ALE630',
       'WDC WD60EFRX', 'HGST HUH728080ALE600', 'TOSHIBA MD04ABA400V',
       'TOSHIBA MQ01ABF050', 'WDC WD20EFRX', 'WDC WD10EADX',
       'WDC WD800AAJS', 'ST8000NM0055', 'ST12000NM0007', 'ST4000DM001',
       'ST10000NM0086', 'ST4000DM005', 'HGST HUH721212ALN604',
       'TOSHIBA MG07ACA14TA', 'ST500LM030', 'HGST HUH721212ALE600',
       'WDC WD10EACS', 'ST31500541AS', 'ST3160318AS', 'ST4000DX000',
       'WDC WD5000LPVX', 'WDC WD1600AAJS', 'WDC WD800LB',
       'TOSHIBA MQ01ABF050M', 'TOSHIBA HDWF180', 'TOSHIBA HDWE160',
       'ST9320325AS', 'ST250LM004 HN', 'HGST HUS726040ALE610',
       'ST3160316AS', 'ST320LT007', 'HGST HDS724040ALE640', 'ST4000DX002',
      

In [40]:
find_uniques.groupby('model').count()

Unnamed: 0_level_0,capacity_gigabytes
model,Unnamed: 1_level_1
00MD00,1
HGST HDS5C4040ALE630,116
HGST HDS724040ALE640,40
HGST HMS5C4040ALE640,8637
HGST HMS5C4040BLE640,16314
HGST HMS5C4040BLE641,1
HGST HUH721010ALE600,20
HGST HUH721212ALE600,1564
HGST HUH721212ALN604,10875
HGST HUH728080ALE600,1082


In [41]:
find_uniques.groupby(['model','capacity_gigabytes']).count()

model,capacity_gigabytes
00MD00,4001.0
HGST HDS5C4040ALE630,4001.0
HGST HDS724040ALE640,4001.0
HGST HMS5C4040ALE640,4001.0
HGST HMS5C4040BLE640,4001.0
HGST HMS5C4040BLE641,4001.0
HGST HUH721010ALE600,10001.0
HGST HUH721212ALE600,12000.0
HGST HUH721212ALN604,12000.0
HGST HUH728080ALE600,8002.0


In [42]:
df['model'].value_counts()

ST12000NM0007              38271
ST4000DM000                36156
HGST HMS5C4040BLE640       16314
ST8000NM0055               14810
HGST HUH721212ALN604       10875
ST8000DM002                10160
HGST HMS5C4040ALE640        8637
Hitachi HDS5C3030ALA630     4563
Hitachi HDS722020ALA330     4503
Hitachi HDS5C4040ALE630     2639
ST6000DX000                 1912
HGST HUH721212ALE600        1564
TOSHIBA MG07ACA14TA         1302
ST10000NM0086               1235
WDC WD30EFRX                1167
HGST HUH728080ALE600        1082
Hitachi HDS723030ALA640     1000
ST500LM012 HN                804
TOSHIBA MQ01ABF050           589
WDC WD60EFRX                 471
ST4000DM001                  425
TOSHIBA MQ01ABF050M          410
WDC WD5000LPVX               343
WDC WD10EADS                 295
ST31500541AS                 264
ST500LM030                   261
ST4000DX000                  212
WDC WD20EFRX                 158
TOSHIBA MD04ABA400V          147
HGST HDS5C4040ALE630         116
          

In [43]:
df['model'].loc[lambda x: x== 'ST3500320AS'].index

Int64Index([122808], dtype='int64')

In [44]:
df.loc[122808]

serial_number                       9QM04GKZ
manufacturer                         Seagate
model                            ST3500320AS
capacity_gigabytes                       500
failure                                    1
drive_age_in_years                       4.9
reallocated_sectors_count                  6
reported_uncorrectable_errors              0
command_timeout                           11
current_pending_sector_count               0
uncorrectable_sector_count                 0
Name: 122808, dtype: object

In [45]:
df[df.model==' 00MD00']

Unnamed: 0,serial_number,manufacturer,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
33381,0F112CC45,Unknown,00MD00,4001.0,0,1.7,0.0,0.0,0.0,0.0,0.0


In [46]:
df['manufacturer'].loc[lambda x: x== 'Unknown'].index

Int64Index([33381], dtype='int64')

In [47]:
df.loc[33381]

serial_number                    0F112CC45
manufacturer                       Unknown
model                               00MD00
capacity_gigabytes                    4001
failure                                  0
drive_age_in_years                     1.7
reallocated_sectors_count                0
reported_uncorrectable_errors            0
command_timeout                          0
current_pending_sector_count             0
uncorrectable_sector_count               0
Name: 33381, dtype: object

In [48]:
df = explore.old_or_fail(df)

In [49]:
df.shape

(121902, 11)

In [50]:
df.model.nunique()

92

In [51]:
df.capacity_gigabytes.nunique()

16

In [52]:
df.groupby('manufacturer').count()

Unnamed: 0_level_0,serial_number,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Hitachi,38414,38414,38414,38414,38414,38414,38414,38414,38414,38414
Samsung,1,1,1,1,1,1,1,1,1,1
Seagate,79886,79886,79886,79886,79886,79886,79886,79886,79886,79886
Toshiba,990,990,990,990,990,990,990,990,990,990
Unknown,1,1,1,1,1,1,1,1,1,1
Western Digital,2610,2610,2610,2610,2610,2610,2610,2610,2610,2610


In [53]:
df.groupby('capacity_gigabytes').count()

Unnamed: 0_level_0,serial_number,manufacturer,model,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
capacity_gigabytes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
80.0,32,32,32,32,32,32,32,32,32,32
160.0,208,208,208,208,208,208,208,208,208,208
250.0,109,109,109,109,109,109,109,109,109,109
320.0,116,116,116,116,116,116,116,116,116,116
500.0,1911,1911,1911,1911,1911,1911,1911,1911,1911,1911
1000.0,378,378,378,378,378,378,378,378,378,378
1500.0,267,267,267,267,267,267,267,267,267,267
2000.0,4541,4541,4541,4541,4541,4541,4541,4541,4541,4541
3001.0,6703,6703,6703,6703,6703,6703,6703,6703,6703,6703
4001.0,63471,63471,63471,63471,63471,63471,63471,63471,63471,63471


In [54]:
df.capacity_gigabytes.value_counts().count()

16

In [55]:
df['capacity_gigabytes'].loc[lambda x: x== 250.0].index

Int64Index([  2012,   8427,   8455,  11150,  14354,  15920,  17802,  18336,
             21129,  21805,
            ...
            155926, 157614, 159902, 162690, 162751, 163323, 164379, 165956,
            166691, 167334],
           dtype='int64', length=109)

In [56]:
df.loc[95463]

serial_number                             VFG111R5E4HYEH
manufacturer                                     Hitachi
model                            Hitachi HDT725025VLA380
capacity_gigabytes                                   250
failure                                                0
drive_age_in_years                                   7.8
reallocated_sectors_count                              0
reported_uncorrectable_errors                          0
command_timeout                                        0
current_pending_sector_count                           0
uncorrectable_sector_count                             0
Name: 95463, dtype: object

In [57]:
df['capacity_gigabytes'].loc[lambda x: x== 250.1].index

Int64Index([], dtype='int64')

In [58]:
df.loc[2012]

serial_number                    S2T0J9ACB00427
manufacturer                            Seagate
model                             ST250LM004 HN
capacity_gigabytes                          250
failure                                       0
drive_age_in_years                          4.5
reallocated_sectors_count                     0
reported_uncorrectable_errors                 0
command_timeout                               0
current_pending_sector_count                  0
uncorrectable_sector_count                    0
Name: 2012, dtype: object

In [59]:
df.model.value_counts()

ST4000DM000                35738
HGST HMS5C4040BLE640       15844
ST12000NM0007              14881
ST8000NM0055               14499
ST8000DM002                10041
HGST HMS5C4040ALE640        8617
Hitachi HDS5C3030ALA630     4561
Hitachi HDS722020ALA330     4503
Hitachi HDS5C4040ALE630     2639
ST6000DX000                 1911
ST10000NM0086               1204
WDC WD30EFRX                1083
HGST HUH728080ALE600        1046
Hitachi HDS723030ALA640     1000
ST500LM012 HN                715
TOSHIBA MQ01ABF050           545
WDC WD60EFRX                 471
WDC WD5000LPVX               342
WDC WD10EADS                 295
ST31500541AS                 264
ST4000DX000                  212
TOSHIBA MQ01ABF050M          192
TOSHIBA MD04ABA400V          147
WDC WD1600AAJS               100
HGST HDS5C4040ALE630          94
ST320LT007                    73
WDC WD10EACS                  60
WDC WD5000LPCX                57
ST9250315AS                   56
ST3160316AS                   53
          

In [60]:
df.groupby('manufacturer').count()

Unnamed: 0_level_0,serial_number,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Hitachi,38414,38414,38414,38414,38414,38414,38414,38414,38414,38414
Samsung,1,1,1,1,1,1,1,1,1,1
Seagate,79886,79886,79886,79886,79886,79886,79886,79886,79886,79886
Toshiba,990,990,990,990,990,990,990,990,990,990
Unknown,1,1,1,1,1,1,1,1,1,1
Western Digital,2610,2610,2610,2610,2610,2610,2610,2610,2610,2610


In [61]:
df.manufacturer.value_counts()

Seagate            79886
Hitachi            38414
Western Digital     2610
Toshiba              990
Samsung                1
Unknown                1
Name: manufacturer, dtype: int64

In [62]:
df.groupby('manufacturer').count()

Unnamed: 0_level_0,serial_number,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Hitachi,38414,38414,38414,38414,38414,38414,38414,38414,38414,38414
Samsung,1,1,1,1,1,1,1,1,1,1
Seagate,79886,79886,79886,79886,79886,79886,79886,79886,79886,79886
Toshiba,990,990,990,990,990,990,990,990,990,990
Unknown,1,1,1,1,1,1,1,1,1,1
Western Digital,2610,2610,2610,2610,2610,2610,2610,2610,2610,2610


In [63]:
def remove_manufacturers(df):
    df = df[(df.manufacturer != 'Samsung') & (df.manufacturer != 'Unknown')]
    return df

In [64]:
df = remove_manufacturers(df)
df.groupby('manufacturer').count()

Unnamed: 0_level_0,serial_number,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Hitachi,38414,38414,38414,38414,38414,38414,38414,38414,38414,38414
Seagate,79886,79886,79886,79886,79886,79886,79886,79886,79886,79886
Toshiba,990,990,990,990,990,990,990,990,990,990
Western Digital,2610,2610,2610,2610,2610,2610,2610,2610,2610,2610


In [65]:
# modeling

In [66]:
df = explore.early_failure(df, cut_off = 1.6)

In [67]:
df = explore.get_quartile(df, Q1=1.6,Q2=2.6,Q3=4)

In [68]:
df.head()

Unnamed: 0,serial_number,manufacturer,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count,early_failure,quartile
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4001.0,0,5.0,0.0,0.0,0.0,0.0,0.0,0,Q4
1,Z305KB36,Seagate,ST4000DM000,4001.0,0,3.5,0.0,0.0,0.0,0.0,0.0,0,Q3
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,4.8,0.0,0.0,0.0,0.0,0.0,0,Q4
3,ZA11NHSN,Seagate,ST8000DM002,8002.0,0,3.0,0.0,0.0,0.0,0.0,0.0,0,Q3
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3001.0,0,5.5,0.0,0.0,0.0,0.0,0.0,0,Q4


In [69]:
def split_my_data(df):
    X = df.drop(columns = ['serial_number', 'model', 'quartile', 'early_failure', 'failure'])
    y = df[['early_failure']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .80, random_state = 123, stratify=df.early_failure)
    return X, y, X_train, X_test, y_train, y_test


In [70]:
X, y, X_train, X_test, y_train, y_test = split_my_data(df)

In [82]:
len(X), len(y), len(X_train), len(X_test)

(121900, 121900, 97520, 24380)

In [72]:
X_train_ohe, X_test_ohe = model.encode_hot(X_train, X_test, col_name = 'manufacturer')

In [73]:
X_train_ohe.drop(columns= 'manufacturer', inplace = True)

In [74]:
X_test_ohe.drop(columns= 'manufacturer', inplace = True)
X_test_ohe.head()

Unnamed: 0,capacity_gigabytes,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count,Hitachi,Seagate,Toshiba,Western Digital
18665,4001.0,3.4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
30406,4001.0,4.6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
89165,10001.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
108614,8002.0,2.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
151652,4001.0,4.1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [75]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(97520, 8) (24380, 8) (97520, 1) (24380, 1)


In [76]:
X_test.columns

Index(['manufacturer', 'capacity_gigabytes', 'drive_age_in_years',
       'reallocated_sectors_count', 'reported_uncorrectable_errors',
       'command_timeout', 'current_pending_sector_count',
       'uncorrectable_sector_count'],
      dtype='object')

In [77]:
y_train.shape

(97520, 1)

In [78]:
y_test.shape

(24380, 1)

In [79]:
len(df)*.8

97520.0

In [80]:
# Train Model
# Create the Random Forest Object

#rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

IndentationError: unexpected indent (<ipython-input-80-677d2e1351ec>, line 5)

In [None]:
#rf

In [None]:
X_train.info()

In [None]:
# Fit the model to the training data

#rf.fit(X_train, y_train)

In [None]:
#print(rf.feature_importances_)

In [None]:
#y_pred = rf.predict(X_train)


In [None]:
#y_pred_proba = rf.predict_proba(X_train)


In [None]:
#print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
#cm = (confusion_matrix(y_train, y_pred))
#cm

In [None]:
#labels = sorted(y_train.early_failure.unique())

#pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
'''
FP = cm.sum(axis=0) - np.diag(cm)  
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

# Sensitivity, hit rate, RECALL, or TRUE POSITIVE rate
TPR = TP/(TP+FN)
# Specificity or TRUE NEGATIVE rate
TNR = TN/(TN+FP) 
# PRECISION or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or FALSE POSITIVE rate
FPR = FP/(FP+TN)
# FALSE NEGATIVE rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall ACCURACY
ACC = (TP+TN)/(TP+FP+FN+TN)
'''

## Test Data Model

In [None]:
#y_pred_rf_test = rf.predict(X_test)

In [None]:
#y_pred_proba_rf_test = rf.predict_proba(X_test)

In [None]:
#print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

In [None]:
#cm_test = (confusion_matrix(y_test, y_pred_rf_test))
#cm_test

In [None]:
#print(classification_report(y_test, y_pred_rf_test))

## SVM

In [None]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor, LassoCV
from sklearn.tree import DecisionTreeRegressor

#### Training the Algorithm

The fit method of SVC class is called to train the algorithm on the training data, which is passed as a parameter to the fit method.

In [81]:
X_train_ohe.head()


Unnamed: 0,capacity_gigabytes,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count,Hitachi,Seagate,Toshiba,Western Digital
116194,8002.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
142942,4001.0,4.6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
97790,12000.0,1.7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
125503,12000.0,1.9,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
162599,6001.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train_ohe, y_train)

#### Making Predictions

To make predictions, the predict method of the SVC class is used.

In [None]:
y_pred = svclassifier.predict(X_test_ohe)

#### Evaluating the Algorithm

Scikit-Learn's metrics library contains the classification_report and confusion_matrix methods, which can be readily used to find out the values for these important metrics.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
sgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)
sgd.fit(X_train_ohe, y_train)
y_pred_svm = sgd.predict(X_train_ohe)
mean_squared_error(y_train, y_pred_svm)**1/2

In [None]:
regr = LinearSVR(random_state=123, tol=1e-5, loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)
regr.fit(X_train, y_train)
y_pred_lsvr = regr.predict(X_train)
print(mean_squared_error(y_train, y_pred)**1/2)

In [None]:
X_test.shape

In [None]:
df.columns

In [None]:
X_train.columns

In [None]:
X_train_ohe = encode_hot(X_train, X_test, col_name = 'model')

In [None]:
X_train