In [1]:
# spark libraries
import pyspark
from pyspark.sql.functions import max

# data wrangling
import pandas as pd
import numpy as np

## Acquire

#### Use spark! 

In [2]:
# create local spark enviroment
spark = pyspark.sql.SparkSession.builder.getOrCreate()

#### Read in 2016 data

In [40]:
df_2016 = spark.read.csv('../data/data_Q*_2016/*.csv', header=True)

In [41]:
len(df_2016.columns)

95

#### Read in 2017 data

In [42]:
df_2017 = spark.read.csv('../data/data_Q*_2017/*.csv', header=True)

In [43]:
len(df_2017.columns)

95

#### Read in 2018 data

In [44]:
# read quarters in separately since they have different column lengths
df_2018_1 = spark.read.csv('../data/data_Q1_2018/*.csv', header=True)
df_2018_2 = spark.read.csv('../data/data_Q2_2018/*.csv', header=True)
df_2018_3 = spark.read.csv('../data/data_Q3_2018/*.csv', header=True)
df_2018_4 = spark.read.csv('../data/data_Q4_2018/*.csv', header=True)

In [45]:
for df in [df_2018_1,df_2018_2,df_2018_3,df_2018_4]:
    print(len(df.columns))

105
109
109
129


#### Read in 2019 data

In [46]:
df_2019 = spark.read.csv('../data/data_Q*_2019/*.csv', header=True)

In [47]:
len(df_2019.columns)

129

#### Limit all dataframe to the original 95 columns

In [48]:
cols_2016 = df_2016.columns
cols_2019 = df_2019.columns
remove_traits_list = list(set(cols_2019).difference(cols_2016))

In [57]:
for trait in remove_traits_list:
    df_2018_1 = df_2018_1.drop(trait)
    df_2018_2 = df_2018_2.drop(trait)
    df_2018_3 = df_2018_3.drop(trait)
    df_2018_4 = df_2018_4.drop(trait)
    df_2019 = df_2019.drop(trait)

In [59]:
for df in [df_2016, df_2017, df_2018_1,df_2018_2,df_2018_3,df_2018_4, df_2019]:
    print(len(df.columns))

95
95
95
95
95
95
95


#### Combine all dataframes together

In [61]:
df_raw = df_2016.union(df_2017).union(df_2018_1).union(df_2018_2).union(df_2018_3).union(df_2018_4).union(df_2019)

In [63]:
len(df_raw.columns), df_raw.count()

(95, 121390247)

#### Extract max values from top five SMART attributes

In [64]:
df = df_raw.groupby('serial_number', 'model', 'capacity_bytes'
                   ).agg(max('failure'), max('smart_9_raw'), max('smart_5_raw'), 
                        max('smart_187_raw') , max('smart_197_raw'), max('smart_198_raw')
                        )

In [65]:
len(df.columns), df.count()

(9, 169073)

#### Transform to pandas

In [11]:
# df = df.select("*").toPandas()

#### Save dfs as csv for easy access

In [12]:
# df.to_csv(r'./hard_drives_smart_5.csv')

In [71]:
df = pd.read_csv('hard_drives_smart_5.csv').drop('Unnamed: 0',axis=1)

> Will now code everying in pandas! 

## Prepare

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169073 entries, 0 to 169072
Data columns (total 9 columns):
serial_number         169072 non-null object
model                 169073 non-null object
capacity_bytes        169073 non-null int64
max(failure)          169073 non-null int64
max(smart_9_raw)      161975 non-null float64
max(smart_5_raw)      161851 non-null float64
max(smart_187_raw)    104189 non-null float64
max(smart_197_raw)    161841 non-null float64
max(smart_198_raw)    161841 non-null float64
dtypes: float64(5), int64(2), object(2)
memory usage: 11.6+ MB


In [73]:
df.head()

Unnamed: 0,serial_number,model,capacity_bytes,max(failure),max(smart_9_raw),max(smart_5_raw),max(smart_187_raw),max(smart_197_raw),max(smart_198_raw)
0,176FT026T,TOSHIBA MQ01ABF050M,500107862016,0,9965.0,0.0,,0.0,0.0
1,17eddeea3c620010,DELLBOSS VD,480036847616,0,,,,,
2,18K0A02ZF97G,TOSHIBA MG07ACA14TA,14000519643136,1,99.0,43.0,,0.0,0.0
3,2AGHLPBY,HGST HUH721212ALN604,12000138625024,0,990.0,0.0,,0.0,0.0
4,2AGM5WEY,HGST HUH721212ALN604,12000138625024,0,978.0,0.0,,0.0,0.0
