In [1]:
# spark libraries
import pyspark
from pyspark.sql.functions import max

# data wrangling
import pandas as pd
import numpy as np

## Acquire

#### Use spark! 

In [2]:
# create the Spark session (getOrCreate reuses an already-running session if there is one)
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

#### Read in 2016 data

In [18]:
# Load every 2016 CSV matched by the glob; the first row of each file is the
# header and column types are inferred from the data.
df_2016 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/data_Q*_2016/*.csv')
)

In [19]:
# number of columns in the 2016 frame (baseline for the later years)
len(df_2016.columns)

95

#### Read in 2017 data

In [21]:
# Load every 2017 CSV matched by the glob; the first row of each file is the
# header and column types are inferred from the data.
df_2017 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/data_Q*_2017/*.csv')
)

In [22]:
# number of columns in the 2017 frame
len(df_2017.columns)

95

#### Read in 2018 data

In [23]:
# Each 2018 quarter is loaded into its own frame because the quarters do not
# all have the same number of columns. Same reader settings as the other
# years: header row present, column types inferred.
df_2018_1, df_2018_2, df_2018_3, df_2018_4 = (
    spark.read.csv(quarter_glob, header=True, inferSchema=True)
    for quarter_glob in (
        '../data/data_Q1_2018/*.csv',
        '../data/data_Q2_2018/*.csv',
        '../data/data_Q3_2018/*.csv',
        '../data/data_Q4_2018/*.csv',
    )
)

In [24]:
# Print the column count of each 2018 quarter, one per line (Q1 through Q4).
for quarter_df in (df_2018_1, df_2018_2, df_2018_3, df_2018_4):
    n_cols = len(quarter_df.columns)
    print(n_cols)

105
109
109
129


#### Read in 2019 data

In [34]:
# Load every 2019 CSV matched by the glob; the first row of each file is the
# header and column types are inferred from the data.
df_2019 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/data_Q*_2019/*.csv')
)

In [35]:
# number of columns in the 2019 frame
len(df_2019.columns)

129

#### Limit all dataframes to the original 95 columns

In [36]:
# Column names of the earliest and the latest year.
cols_2016 = df_2016.columns
cols_2019 = df_2019.columns

# Names that exist in the 2019 schema but not in the 2016 schema.
# The list comes from a set, so its order is arbitrary.
remove_traits_list = list(set(cols_2019) - set(cols_2016))

In [37]:
# Strip the columns listed in remove_traits_list from every 2018 quarter and
# from 2019. drop() accepts several names at once and ignores names a frame
# does not have, so one call per frame replaces the per-column loop.
df_2018_1 = df_2018_1.drop(*remove_traits_list)
df_2018_2 = df_2018_2.drop(*remove_traits_list)
df_2018_3 = df_2018_3.drop(*remove_traits_list)
df_2018_4 = df_2018_4.drop(*remove_traits_list)
df_2019 = df_2019.drop(*remove_traits_list)

In [38]:
# Print the column count of every frame, one per line, in chronological order.
for period_df in (df_2016, df_2017, df_2018_1, df_2018_2, df_2018_3, df_2018_4, df_2019):
    n_cols = len(period_df.columns)
    print(n_cols)

95
95
95
95
95
95
95


#### Combine all dataframes together

In [39]:
# Stack all yearly/quarterly frames into one.
#
# unionByName matches columns by *name*. Plain union() matches by *position*,
# so a frame whose columns were in a different order than df_2016's would be
# silently misaligned (values landing under the wrong header). The only check
# done beforehand compares column counts, not names or order.
# unionByName also raises if a frame's column names do not match, instead of
# producing a corrupt result. The combined frame keeps df_2016's column order.
df_raw = (
    df_2016
    .unionByName(df_2017)
    .unionByName(df_2018_1)
    .unionByName(df_2018_2)
    .unionByName(df_2018_3)
    .unionByName(df_2018_4)
    .unionByName(df_2019)
)

In [63]:
# (number of columns, total number of rows) of the combined frame;
# count() is a Spark action, so this scans the full data set
len(df_raw.columns), df_raw.count()

(95, 121390247)

In [47]:
# Show the inferred types of the identifier columns, the failure flag and the
# SMART columns that are aggregated further down.
df_raw.select([
    'serial_number', 'model', 'capacity_bytes', 'failure',
    'smart_9_raw', 'smart_5_raw', 'smart_187_raw',
    'smart_188_raw', 'smart_197_raw', 'smart_198_raw',
]).printSchema()

root
 |-- serial_number: string (nullable = true)
 |-- model: string (nullable = true)
 |-- capacity_bytes: long (nullable = true)
 |-- failure: integer (nullable = true)
 |-- smart_9_raw: integer (nullable = true)
 |-- smart_5_raw: integer (nullable = true)
 |-- smart_187_raw: integer (nullable = true)
 |-- smart_188_raw: long (nullable = true)
 |-- smart_197_raw: integer (nullable = true)
 |-- smart_198_raw: integer (nullable = true)



#### Extract max values of failure, smart_9_raw, and the top five SMART attributes (5, 187, 188, 197, 198)

In [48]:
# One row per (serial_number, model, capacity_bytes) combination, holding the
# maximum of the failure flag and of each tracked SMART column.
# `max` is pyspark.sql.functions.max (imported at the top of the notebook),
# not the Python builtin. Result columns are named like "max(failure)".
# NOTE(review): a serial number whose model or capacity_bytes varies between
# rows ends up in more than one group - confirm that is intended.
df = (
    df_raw
    .groupby('serial_number', 'model', 'capacity_bytes')
    .agg(*[
        max(col_name)
        for col_name in (
            'failure', 'smart_9_raw', 'smart_5_raw', 'smart_187_raw',
            'smart_188_raw', 'smart_197_raw', 'smart_198_raw',
        )
    ])
)

In [65]:
# (number of columns, number of rows) of the aggregated frame
# NOTE(review): the aggregation above yields 3 grouping keys + 7 aggregates,
# but the recorded output shows 9 columns; this cell was executed out of
# order, so re-run it on a fresh kernel to confirm.
len(df.columns), df.count()

(9, 169073)

#### Transform to pandas

In [49]:
# Collect the aggregated Spark result to the driver as a pandas DataFrame.
# From here on `df` is a pandas object, no longer a Spark DataFrame.
df = df.toPandas()

#### Save the dataframe as CSV for easy access

In [50]:
# Write the pandas frame to a CSV in the current working directory.
# The row index is written as well (to_csv default), which is why the reload
# snippet in the next cell drops an 'Unnamed: 0' column.
df.to_csv(path_or_buf=r'./hard_drives_smart_5.csv')

In [51]:
# Optional shortcut: uncomment to reload the saved aggregate instead of
# re-running the Spark steps; 'Unnamed: 0' is the index column written by to_csv.
# df = pd.read_csv('hard_drives_smart_5.csv').drop('Unnamed: 0',axis=1)

> Will now code everything in pandas!

In [52]:
# dtypes and non-null counts of the pandas frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169073 entries, 0 to 169072
Data columns (total 10 columns):
serial_number         169072 non-null object
model                 169073 non-null object
capacity_bytes        169073 non-null int64
max(failure)          169073 non-null int32
max(smart_9_raw)      161975 non-null float64
max(smart_5_raw)      161851 non-null float64
max(smart_187_raw)    104189 non-null float64
max(smart_188_raw)    104179 non-null float64
max(smart_197_raw)    161841 non-null float64
max(smart_198_raw)    161841 non-null float64
dtypes: float64(6), int32(1), int64(1), object(2)
memory usage: 12.3+ MB


In [53]:
# preview the first five rows
df.head()

Unnamed: 0,serial_number,model,capacity_bytes,max(failure),max(smart_9_raw),max(smart_5_raw),max(smart_187_raw),max(smart_188_raw),max(smart_197_raw),max(smart_198_raw)
0,PL1311LAG1SJAA,Hitachi HDS5C4040ALE630,4000787030016,0,43819.0,0.0,,,0.0,0.0
1,Z305KB36,ST4000DM000,4000787030016,0,31045.0,0.0,0.0,0.0,0.0,0.0
2,MJ0351YNG9MZXA,Hitachi HDS5C3030ALA630,3000592982016,0,41668.0,0.0,,,0.0,0.0
3,ZA11NHSN,ST8000DM002,8001563222016,0,26284.0,0.0,0.0,0.0,0.0,0.0
4,MJ1311YNG2ZSEA,Hitachi HDS5C3030ALA630,3000592982016,0,47994.0,0.0,,,0.0,0.0
