In [1]:
import pandas as pd
import numpy as np
import re
import prepare
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt 


Use this to drop normalized columns if necessary:


    columns = []
    for i in df.columns:
        columns.append(i)
    columns = str(columns)
    normalized_columns = re.findall(r'(smart_\d+_normalized)', columns)
    df.drop(columns=normalized_columns, inplace=True)"""

In [2]:
df = pd.read_csv('hard_drives_smart_5.csv')

In [3]:
df.shape

(169073, 11)

In [4]:
df.drop(columns=('Unnamed: 0'), inplace = True)

In [5]:
df.head(10)

Unnamed: 0,serial_number,model,capacity_bytes,max(failure),max(smart_9_raw),max(smart_5_raw),max(smart_187_raw),max(smart_188_raw),max(smart_197_raw),max(smart_198_raw)
0,PL1311LAG1SJAA,Hitachi HDS5C4040ALE630,4000787030016,0,43819.0,0.0,,,0.0,0.0
1,Z305KB36,ST4000DM000,4000787030016,0,31045.0,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi HDS5C3030ALA630,3000592982016,0,41668.0,0.0,,,0.0,0.0
3,ZA11NHSN,ST8000DM002,8001563222016,0,26284.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi HDS5C3030ALA630,3000592982016,0,47994.0,0.0,,,0.0,0.0
5,Z305D5VF,ST4000DM000,4000787030016,0,31883.0,0.0,0.0,0.0,0.0,0.0
6,Z3015L2E,ST4000DM000,4000787030016,0,36768.0,0.0,0.0,0.0,0.0,0.0
7,PL1331LAHDYJYH,HGST HMS5C4040BLE640,4000787030016,0,30144.0,0.0,,,0.0,0.0
8,Z304LAHP,ST4000DM000,4000787030016,0,33885.0,0.0,0.0,0.0,0.0,0.0
9,Z302B0GK,ST4000DM000,4000787030016,0,36443.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df = prepare.prepare(df)

In [7]:
df = prepare.unique(df)

In [8]:
df.head()

Unnamed: 0,serial_number,manufacturer,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4000.8,0,5.0,0.0,,,0.0,0.0
1,Z305KB36,Seagate,ST4000DM000,4000.8,0,3.5,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3000.6,0,4.8,0.0,,,0.0,0.0
3,ZA11NHSN,Seagate,ST8000DM002,8001.6,0,3.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3000.6,0,5.5,0.0,,,0.0,0.0


## Manufacturers

- There are 7 different manufacturers
- Seagate and Hitachi make up 97% of all hard drives
- There are 103 unqiue  hard drive models. 43 of these models have less than 10 hard drives. 72 models have less than 100.

In [9]:
df['manufacturer'].nunique()

7

In [10]:
df.shape

(162025, 11)

In [11]:
# How many hard drives per manufacturer?
df['manufacturer'].value_counts()

Seagate            105136
Hitachi             51417
Western Digital      2826
Toshiba              2573
Dell                   60
Samsung                11
Unknown                 2
Name: manufacturer, dtype: int64

In [12]:
(105136+51417)/162025 *100

96.62274340379571

In [13]:
# How many unique models?
df['model'].nunique()

103

In [14]:
# How many hard drives models with less than 10 hard drives?
(df['model'].value_counts()  < 10).sum()

43

In [15]:
# How many hard drives by individual model?
df['model'].value_counts()

ST12000NM0007                          38271
ST4000DM000                            36156
HGST HMS5C4040BLE640                   16314
ST8000NM0055                           14810
HGST HUH721212ALN604                   10875
ST8000DM002                            10160
HGST HMS5C4040ALE640                    8637
Hitachi HDS5C3030ALA630                 4563
Hitachi HDS722020ALA330                 4503
Hitachi HDS5C4040ALE630                 2639
ST6000DX000                             1912
HGST HUH721212ALE600                    1564
TOSHIBA MG07ACA14TA                     1302
ST10000NM0086                           1235
WDC WD30EFRX                            1167
HGST HUH728080ALE600                    1082
Hitachi HDS723030ALA640                 1000
ST500LM012 HN                            804
TOSHIBA MQ01ABF050                       589
WDC WD60EFRX                             471
ST4000DM001                              425
TOSHIBA MQ01ABF050M                      410
WDC WD5000

In [16]:
# Average drive age by manufacturer
manu = df[['manufacturer', 'drive_age_in_years']]
manu.groupby('manufacturer',sort=True).mean()

Unnamed: 0_level_0,drive_age_in_years
manufacturer,Unnamed: 1_level_1
Dell,
Hitachi,3.057432
Samsung,0.3
Seagate,2.555232
Toshiba,1.611698
Unknown,1.7
Western Digital,3.765322


### Failures vs. total by model number

In [17]:
agg = df.groupby(['model']).agg({'failure' : 'sum', 'model' : 'count', 'drive_age_in_years':'mean'})

In [18]:
agg.rename(columns={'failure':'failures', 'model':'total_count'}, inplace=True)

In [19]:
agg['failure_rate_percent'] = agg['failures'] / agg['total_count'] * 100

In [20]:
agg.sort_values(by=['failure_rate_percent'], ascending = False)

Unnamed: 0_level_0,failures,total_count,drive_age_in_years,failure_rate_percent
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WDC WD800LB,1,1,10.300000,100.000000
WDC WD3200BEKT,1,1,2.500000,100.000000
WDC WD800JD,1,1,4.800000,100.000000
WDC WD800JB,4,4,6.475000,100.000000
WDC WD800BB,5,5,9.240000,100.000000
WDC WD800AAJS,14,14,4.078571,100.000000
Samsung SSD 850 EVO 1TB,10,10,0.090000,100.000000
ST3500320AS,1,1,4.900000,100.000000
ST250LT007,4,4,2.600000,100.000000
ST320LT007,63,73,3.382192,86.301370


In [21]:
agg.sort_values(by=['failure_rate_percent'], ascending = False)

Unnamed: 0_level_0,failures,total_count,drive_age_in_years,failure_rate_percent
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WDC WD800LB,1,1,10.300000,100.000000
WDC WD3200BEKT,1,1,2.500000,100.000000
WDC WD800JD,1,1,4.800000,100.000000
WDC WD800JB,4,4,6.475000,100.000000
WDC WD800BB,5,5,9.240000,100.000000
WDC WD800AAJS,14,14,4.078571,100.000000
Samsung SSD 850 EVO 1TB,10,10,0.090000,100.000000
ST3500320AS,1,1,4.900000,100.000000
ST250LT007,4,4,2.600000,100.000000
ST320LT007,63,73,3.382192,86.301370


In [22]:
#failures = df[df.failure == 1]
#non_failures = df[df.failure == 0]

In [23]:
cap = df.groupby(['model']).agg({'capacity_gigabytes' : ['max','min']})
#cap['difference'] = cap['max'] - cap['min']

In [24]:
df.isna().sum()

serial_number                        1
manufacturer                         0
model                                0
capacity_gigabytes                   0
failure                              0
drive_age_in_years                  60
reallocated_sectors_count          184
reported_uncorrectable_errors    57839
command_timeout                  57849
current_pending_sector_count       194
uncorrectable_sector_count         194
dtype: int64

In [25]:
df.shape

(162025, 11)

In [26]:
df.command_timeout.value_counts()

0.000000e+00    99803
1.000000e+00     1404
4.295033e+09      712
2.000000e+00      482
3.000000e+00      217
8.590066e+09      134
4.000000e+00      116
5.000000e+00       75
1.288510e+10       62
6.000000e+00       58
7.000000e+00       42
6.553700e+04       39
2.147516e+10       31
8.000000e+00       30
1.718013e+10       30
4.295033e+09       27
1.100000e+01       21
1.200000e+01       20
1.310740e+05       20
6.553800e+04       20
1.000000e+01       18
9.000000e+00       17
1.900000e+01       16
1.400000e+01       15
1.300000e+01       14
1.800000e+01       14
1.600000e+01       13
1.700000e+01       13
4.295033e+09       11
6.553900e+04       11
                ...  
1.590000e+02        1
3.865530e+10        1
4.295623e+09        1
1.110000e+02        1
4.295033e+09        1
3.436184e+10        1
4.295098e+09        1
5.111888e+06        1
1.417361e+11        1
4.295819e+09        1
8.590066e+09        1
6.800000e+01        1
3.050000e+02        1
1.288516e+10        1
4.295033e+

In [27]:
df.head()

Unnamed: 0,serial_number,manufacturer,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4000.8,0,5.0,0.0,,,0.0,0.0
1,Z305KB36,Seagate,ST4000DM000,4000.8,0,3.5,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3000.6,0,4.8,0.0,,,0.0,0.0
3,ZA11NHSN,Seagate,ST8000DM002,8001.6,0,3.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3000.6,0,5.5,0.0,,,0.0,0.0


In [28]:
prepare.treat_nulls(df)

Unnamed: 0,serial_number,manufacturer,model,capacity_gigabytes,failure,drive_age_in_years,reallocated_sectors_count,reported_uncorrectable_errors,command_timeout,current_pending_sector_count,uncorrectable_sector_count
0,PL1311LAG1SJAA,Hitachi,Hitachi HDS5C4040ALE630,4000.8,0,5.0,0.0,0.0,0.0,0.0,0.0
1,Z305KB36,Seagate,ST4000DM000,4000.8,0,3.5,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi,Hitachi HDS5C3030ALA630,3000.6,0,4.8,0.0,0.0,0.0,0.0,0.0
3,ZA11NHSN,Seagate,ST8000DM002,8001.6,0,3.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi,Hitachi HDS5C3030ALA630,3000.6,0,5.5,0.0,0.0,0.0,0.0,0.0
5,Z305D5VF,Seagate,ST4000DM000,4000.8,0,3.6,0.0,0.0,0.0,0.0,0.0
6,Z3015L2E,Seagate,ST4000DM000,4000.8,0,4.2,0.0,0.0,0.0,0.0,0.0
7,PL1331LAHDYJYH,Hitachi,HGST HMS5C4040BLE640,4000.8,0,3.4,0.0,0.0,0.0,0.0,0.0
8,Z304LAHP,Seagate,ST4000DM000,4000.8,0,3.9,0.0,0.0,0.0,0.0,0.0
9,Z302B0GK,Seagate,ST4000DM000,4000.8,0,4.2,0.0,0.0,0.0,0.0,0.0


In [29]:
df.isna().sum()

serial_number                      1
manufacturer                       0
model                              0
capacity_gigabytes                 0
failure                            0
drive_age_in_years                60
reallocated_sectors_count        184
reported_uncorrectable_errors      0
command_timeout                    0
current_pending_sector_count     194
uncorrectable_sector_count       194
dtype: int64

In [30]:
df = treat_nulls(df)

NameError: name 'treat_nulls' is not defined

In [None]:
df.isna().sum()

In [None]:
df.shape