In [1]:
import pandas as pd
import numpy as np
import re

import acquire
import prepare
import explore
import model

import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt 


import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.metrics import recall_score

# Acquire

In [2]:
def drop_normalized(df):
    """Return ``df`` without its normalized SMART columns.

    A column is removed only when its *entire* name has the form
    ``max(smart_<digits>_normalized)``. Each column name is checked on its
    own, so a name that merely contains that text as a substring is left
    alone instead of producing a label that does not exist in the frame
    (which would make ``DataFrame.drop`` raise ``KeyError``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns may include ``max(smart_N_normalized)`` entries.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching columns dropped; ``df`` itself is
        not modified.
    """
    normalized_pattern = re.compile(r'max\(smart_\d+_normalized\)')
    normalized_columns = [
        column
        for column in df.columns
        # Non-string labels (e.g. integer column names) can never match.
        if isinstance(column, str) and normalized_pattern.fullmatch(column)
    ]
    return df.drop(columns=normalized_columns)


In [3]:
# Read all_smart.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv('all_smart.csv')

In [4]:
# Preview the first rows to see the raw column layout.
df.head()

Unnamed: 0.1,Unnamed: 0,serial_number,model,capacity_bytes,max(capacity_bytes),max(failure),max(smart_1_normalized),max(smart_1_raw),max(smart_2_normalized),max(smart_2_raw),...,max(smart_242_normalized),max(smart_242_raw),max(smart_250_normalized),max(smart_250_raw),max(smart_251_normalized),max(smart_251_raw),max(smart_252_normalized),max(smart_252_raw),max(smart_254_normalized),max(smart_254_raw)
0,0,PL1311LAG1SJAA,Hitachi HDS5C4040ALE630,4000787030016,4000787030016,0,100.0,1.0,135.0,99.0,...,,,,,,,,,,
1,1,Z305KB36,ST4000DM000,4000787030016,4000787030016,0,120.0,243787512.0,,,...,100.0,152202100000.0,,,,,,,,
2,2,MJ0351YNG9MZXA,Hitachi HDS5C3030ALA630,3000592982016,3000592982016,0,100.0,1.0,134.0,112.0,...,,,,,,,,,,
3,3,ZA11NHSN,ST8000DM002,8001563222016,8001563222016,0,100.0,244007456.0,,,...,100.0,146653400000.0,,,,,,,,
4,4,MJ1311YNG2ZSEA,Hitachi HDS5C3030ALA630,3000592982016,3000592982016,0,100.0,1245191.0,100.0,0.0,...,,,,,,,,,,


In [5]:
# (rows, columns) of the frame as loaded.
df.shape

(169073, 88)

In [6]:
# Remove the max(smart_N_normalized) columns; the *_raw columns are kept.
df = drop_normalized(df)

In [7]:
# Re-check (rows, columns) after dropping the normalized columns.
df.shape

(169073, 47)

In [8]:
# List the columns that remain.
df.columns

Index(['Unnamed: 0', 'serial_number', 'model', 'capacity_bytes',
       'max(capacity_bytes)', 'max(failure)', 'max(smart_1_raw)',
       'max(smart_2_raw)', 'max(smart_3_raw)', 'max(smart_4_raw)',
       'max(smart_5_raw)', 'max(smart_7_raw)', 'max(smart_8_raw)',
       'max(smart_9_raw)', 'max(smart_10_raw)', 'max(smart_11_raw)',
       'max(smart_12_raw)', 'max(smart_22_raw)', 'max(smart_183_raw)',
       'max(smart_184_raw)', 'max(smart_187_raw)', 'max(smart_188_raw)',
       'max(smart_189_raw)', 'max(smart_190_raw)', 'max(smart_191_raw)',
       'max(smart_192_raw)', 'max(smart_193_raw)', 'max(smart_194_raw)',
       'max(smart_195_raw)', 'max(smart_196_raw)', 'max(smart_197_raw)',
       'max(smart_198_raw)', 'max(smart_199_raw)', 'max(smart_200_raw)',
       'max(smart_220_raw)', 'max(smart_222_raw)', 'max(smart_223_raw)',
       'max(smart_224_raw)', 'max(smart_225_raw)', 'max(smart_226_raw)',
       'max(smart_240_raw)', 'max(smart_241_raw)', 'max(smart_242_raw)',
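       'max(smart_250_raw)', 'max(smart_251_raw)', 'max(smart_252_raw)',
       'max(smart_254_raw)'],
      dtype='object')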

In [9]:
def narrow_the_search(df):
    """Drop the SMART columns the analysis has ruled out.

    The columns are grouped below by the reason given for excluding them.
    Every listed column must be present in ``df``; ``DataFrame.drop``
    raises ``KeyError`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``max(smart_N_raw)`` columns listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame without the excluded columns.
    """
    # Vendor-specific features.
    vendor_specific = ['max(smart_1_raw)', 'max(smart_7_raw)']

    # Value is dependent on changes, so the aggregation is irrelevant.
    change_dependent = ['max(smart_2_raw)', 'max(smart_8_raw)']

    # Irrelevant because the hard drives are not routinely powered off.
    not_routinely_powered_off = [
        'max(smart_4_raw)',
        'max(smart_12_raw)',
        'max(smart_190_raw)',
        'max(smart_192_raw)',
        'max(smart_226_raw)',
        'max(smart_240_raw)',
        'max(smart_241_raw)',
        'max(smart_242_raw)',
    ]

    # Values are mostly null.
    mostly_null = [
        'max(smart_250_raw)',
        'max(smart_251_raw)',
        'max(smart_252_raw)',
        'max(smart_254_raw)',
    ]

    excluded = (
        vendor_specific
        + change_dependent
        + not_routinely_powered_off
        + mostly_null
    )
    return df.drop(columns=excluded)

In [10]:
# (rows, columns) before narrowing the feature set.
df.shape

(169073, 47)

In [11]:
# Drop the SMART columns enumerated in narrow_the_search.
df = narrow_the_search(df)

In [12]:
# (rows, columns) after narrowing the feature set.
df.shape

(169073, 31)

# Prepare

In [13]:
# Columns going into the prepare step.
df.columns

Index(['Unnamed: 0', 'serial_number', 'model', 'capacity_bytes',
       'max(capacity_bytes)', 'max(failure)', 'max(smart_3_raw)',
       'max(smart_5_raw)', 'max(smart_9_raw)', 'max(smart_10_raw)',
       'max(smart_11_raw)', 'max(smart_22_raw)', 'max(smart_183_raw)',
       'max(smart_184_raw)', 'max(smart_187_raw)', 'max(smart_188_raw)',
       'max(smart_189_raw)', 'max(smart_191_raw)', 'max(smart_193_raw)',
       'max(smart_194_raw)', 'max(smart_195_raw)', 'max(smart_196_raw)',
       'max(smart_197_raw)', 'max(smart_198_raw)', 'max(smart_199_raw)',
       'max(smart_200_raw)', 'max(smart_220_raw)', 'max(smart_222_raw)',
       'max(smart_223_raw)', 'max(smart_224_raw)', 'max(smart_225_raw)'],
      dtype='object')

In [14]:
# prepare.prepare comes from the project's prepare module (not shown here).
# NOTE(review): presumably renames/derives the readable columns seen after these steps — confirm in prepare.py.
df = prepare.prepare(df)

In [15]:
# prepare.unique is a project helper whose body is not visible here.
# NOTE(review): presumably de-duplicates rows (e.g. per serial number) — confirm in prepare.py.
df = prepare.unique(df)

In [16]:
# prepare.treat_nulls is a project helper whose body is not visible here.
# NOTE(review): presumably fills or removes missing values — confirm in prepare.py.
df = prepare.treat_nulls(df)

In [17]:
# explore.old_or_fail is a project helper; its effect on df is not visible here.
# TODO: document what it filters or adds after checking explore.py.
df = explore.old_or_fail(df)

In [18]:
# Columns after the prepare/explore steps above.
df.columns

Index(['serial_number', 'manufacturer', 'model', 'capacity_terabytes',
       'failure', 'drive_age_in_years', 'reallocated_sectors_count',
       'reported_uncorrectable_errors', 'command_timeout',
       'current_pending_sector_count', 'uncorrectable_sector_count'],
      dtype='object')