# Data Cleaning Test #3 (Using Scikit Learn)

In [1]:
import time
import numpy as np
import pandas as pd
from pyspark.sql.types import StructType, StructField, LongType
from pyspark.sql import SparkSession
import pyspark.sql
import pyspark.sql.functions

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
# store path to notebook
PWD = !pwd
PWD = PWD[0]

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists
app_name = "data_cleaning_notebook"
master = "local[*]"
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

Run this on the command line first (only needed if the checkpoint directory in the next cell is enabled): `hadoop fs -mkdir /RddCheckPoint`

In [5]:
#sc.setCheckpointDir("hdfs://quickstart.cloudera:8020/RddCheckPoint")

## Load the data

Load training and test data

In [6]:
trainRDD = sc.textFile('data/train1percentsample.txt')
#testRDD = sc.textFile('data/test1percentsample.txt')

In [7]:
def ConvertNumber(idx, num):
    """Parse one tab-separated token into an integer.

    Args:
        idx: Zero-based column position of the token within its row.
        num: Raw token text; an empty string marks a missing value.

    Returns:
        None for an empty token. Otherwise the token parsed as hexadecimal
        when idx is greater than 13, or with the default int() parsing for
        column positions 0 through 13.
    """
    if num == '':
        return None
    # Columns past position 13 carry hexadecimal text
    return int(num, 16) if idx > 13 else int(num)

In [8]:
trainRDD2 = trainRDD.map(lambda x: [ConvertNumber(idx, num) for idx,num in enumerate(x.split('\t'))]).cache()
#testRDD2 = testRDD.map(lambda x: [ConvertNumber(idx, num) for idx,num in enumerate(x.split('\t'))]).cache()

In [9]:
trainRDDRow = trainRDD2.take(1)
numColumns = len(trainRDDRow[0])

In [10]:
# One nullable LongType column per input position: field_0, field_1, ...
structFieldList = [
    StructField(name='field_{}'.format(num), dataType=LongType(), nullable=True)
    for num in range(numColumns)
]
schema = StructType(fields=structFieldList)

trainDF = spark.createDataFrame(trainRDD2, schema)
#testDF = spark.createDataFrame(testRDD2, schema)

## Define functions

In [11]:
def constrainRow(row):
    """Re-index the categorical fields of one row using the global valueDict.

    For every field named in valueDict, a value present in that field's
    mapping is replaced by its mapped index; any other value is replaced by
    1000, the shared "unknown / rare" marker. Fields that are not keys of
    valueDict are carried over unchanged.

    Args:
        row: A pyspark.sql.Row.

    Returns:
        A new pyspark.sql.Row built from the updated field dictionary.
    """
    updated = row.asDict()

    for name, indexByValue in valueDict.items():
        # Kept values map to their new index so that they do not collide
        # with the unknown / rare marker (1000).
        updated[name] = indexByValue.get(updated[name], 1000)

    return pyspark.sql.Row(**updated)

In [12]:
def oheDataFrame_getCategories(fieldList, df):
    """Record, per field, an index for each of its most frequent values.

    The result is stored in the global valueDict as
    {field name: {value: rank}}, where rank 0 is the most frequent value.

    Args:
        fieldList: Sequence of (field name, N) pairs; N is how many of the
            most frequent values to keep for that field.
        df: Spark DataFrame containing the named fields.
    """
    for spec in fieldList:
        name, topN = spec[0], spec[1]

        # Count how often each distinct value occurs in this column
        countsRDD = (df.rdd
                     .map(lambda r: (r[name], 1))
                     .reduceByKey(lambda a, b: a + b))

        # Keep the N highest counts and number the values in that order
        mostFrequent = countsRDD.takeOrdered(topN, key=lambda kv: -kv[1])
        valueDict[name] = {kv[0]: rank for rank, kv in enumerate(mostFrequent)}

In [13]:
def oheDataFrame_transform(df):
    """Run constrainRow over every row of df and return a new DataFrame.

    Args:
        df: Spark DataFrame whose rows are passed to constrainRow.

    Returns:
        The DataFrame produced by calling toDF() on the mapped RDD.
    """
    constrainedRDD = df.rdd.map(constrainRow)
    return constrainedRDD.toDF()

# TODO: Optimize this to do all columns at once

In [14]:
def imputeWithMean(field, df):
    """Fill the nulls of one column with the mean of its non-null values.

    Args:
        field: Name of the column to impute.
        df: Spark DataFrame containing that column.

    Returns:
        The result of ``df.fillna`` restricted to ``field``, using the
        column mean as the fill value.
    """
    # Use an identity test for None (PEP 8) instead of `!= None`.
    fieldMean = (df.rdd
                 .map(lambda row: row[field])
                 .filter(lambda value: value is not None)
                 .mean())
    # NOTE(review): the notebook declares every column as LongType, so Spark
    # presumably casts this float mean to an integer when filling -- confirm
    # that the truncation is acceptable.
    return df.fillna(fieldMean, [field])

## Clean the data

In [15]:
startTime = time.time()

In [16]:
print('Started processing the fields...')

Started processing the fields...


### Field 1

This field has about 42% null values.  Drop it.

In [17]:
trainDF = trainDF.drop('field_1')
#testDF = testDF.drop('field_1')

### Field 2

No null values, so scale this field.

### Field 3

This field has about 23% null values.  Drop it.

In [18]:
trainDF = trainDF.drop('field_3')
#testDF = testDF.drop('field_3')

### Field 4

This field has about 25% null values.  Drop it.

In [19]:
trainDF = trainDF.drop('field_4')
#testDF = testDF.drop('field_4')

### Field 5

This field has about 3% null values.  Impute with the mean and scale.

In [20]:
trainDF = imputeWithMean('field_5', trainDF)
#testDF = imputeWithMean('field_5', testDF)

### Field 6

This field has about 22% null values. Drop it.

In [21]:
trainDF = trainDF.drop('field_6')
#testDF = testDF.drop('field_6')

### Field 7

This field has about 4% null values. Impute with the mean and scale.

In [22]:
trainDF = imputeWithMean('field_7', trainDF)
#testDF = imputeWithMean('field_7', testDF)

### Field 8

This field has < 1% null values. Impute with the mean and scale.

In [23]:
trainDF = imputeWithMean('field_8', trainDF)
#testDF = imputeWithMean('field_8', testDF)

### Field 9

This field has 4% null values. Impute with the mean and scale.

In [24]:
trainDF = imputeWithMean('field_9', trainDF)
#testDF = imputeWithMean('field_9', testDF)

### Field 10

This field has 42% null values. Drop it.

In [25]:
trainDF = trainDF.drop('field_10')
#testDF = testDF.drop('field_10')

### Field 11

This field has 4% null values. Impute with the mean and scale.

In [26]:
trainDF = imputeWithMean('field_11', trainDF)
#testDF = imputeWithMean('field_11', testDF)

### Field 12

This field has 77% null values. Drop it.

In [27]:
trainDF = trainDF.drop('field_12')
#testDF = testDF.drop('field_12')

### Field 13

This field has 25% null values. Drop it.

In [28]:
trainDF = trainDF.drop('field_13')
#testDF = testDF.drop('field_13')

### Field 14

This field has 0% null values. It has a modest amount of distinct values, so OHE.

### Field 15

This field has 0% null values. It has a modest amount of distinct values, so OHE.

### Field 16

This field has 4% null values. It has a large amount of distinct values, so drop.

In [29]:
trainDF = trainDF.drop('field_16')
#testDF = testDF.drop('field_16')

### Field 17

This field has 4% null values. It has a large amount of distinct values, so drop.

In [30]:
trainDF = trainDF.drop('field_17')
#testDF = testDF.drop('field_17')

### Field 18

This field has 0% null values. It has a modest amount of distinct values, so OHE.

### Field 19

This field has 12% null values. It is categorical, so the mean cannot be imputed; delete those observations instead (done in a batch below).
It has a modest amount of distinct values, so OHE.

### Field 20

This field has 0% null values. It has a modest amount of distinct values, so OHE.

### Field 21

This field has 0% null values. It has a modest amount of distinct values, so OHE.

### Field 22

This field has 0% null values. It has a small amount of distinct values, so OHE.

### Field 23

This field has 0% null values. It has a large amount of distinct values, so drop.

In [31]:
trainDF = trainDF.drop('field_23')
#testDF = testDF.drop('field_23')

### Field 24

This field has 0% null values. It has a modest amount of distinct values, so OHE.

### Field 25

This field has 4% null values. It has a large amount of distinct values, so drop.

In [32]:
trainDF = trainDF.drop('field_25')
#testDF = testDF.drop('field_25')

### Field 26

This field has 0% null values. It has a modest amount of distinct values, so OHE.

### Field 27

This field has 0% null values. It has a small amount of distinct values, so OHE.

### Field 28

This field has 0% null values. It has a modest amount of distinct values, so OHE.

### Field 29

This field has 4% null values. It has a large amount of distinct values, so drop.

In [33]:
trainDF = trainDF.drop('field_29')
#testDF = testDF.drop('field_29')

### Field 30

This field has 0% null values. It has a small amount of distinct values, so OHE.

### Field 31

This field has 0% null values. It has a modest amount of distinct values, so OHE.

### Field 32

This field has 48% null values, so drop.

In [34]:
trainDF = trainDF.drop('field_32')
#testDF = testDF.drop('field_32')

### Field 33

This field has 48% null values, so drop.

In [35]:
trainDF = trainDF.drop('field_33')
#testDF = testDF.drop('field_33')

### Field 34

This field has 4% null values. It has a large amount of distinct values, so drop.

In [36]:
trainDF = trainDF.drop('field_34')
#testDF = testDF.drop('field_34')

### Field 35

This field has 74% null values, so drop.

In [37]:
trainDF = trainDF.drop('field_35')
#testDF = testDF.drop('field_35')

### Field 36

This field has 0% null values. It has a small amount of distinct values, so OHE.

### Field 37

This field has 4% null values. It has a large amount of distinct values, so drop.

In [38]:
trainDF = trainDF.drop('field_37')
#testDF = testDF.drop('field_37')

### Field 38

This field has 48% null values, so drop.

In [39]:
trainDF = trainDF.drop('field_38')
#testDF = testDF.drop('field_38')

### Field 39

This field has 48% null values, so drop.

In [40]:
trainDF = trainDF.drop('field_39')
#testDF = testDF.drop('field_39')

In [41]:
print('...Finished processing the fields')

...Finished processing the fields


## Delete null observations in a batch (this is for categorical values)

In [42]:
trainDF = trainDF.dropna(how='any', subset=['field_19'])
#testDF = testDF.dropna(how='any', subset=['field_19'])

## TODO: Do Imputation In a Batch (this is for numerical values)

## OHE

In [43]:
print('Started OHE...')

Started OHE...


Preprocess columns to renumber values so that OHE will create the appropriate number of categories

In [44]:
# Filled in by oheDataFrame_getCategories: {field name: {value: index}}
valueDict = {}

# (field name, number of most frequent values to keep) for each OHE column
oheFieldsAndSizes = [
    ('field_14', 10),
    ('field_15', 10),
    ('field_18', 10),
    ('field_19', 10),
    ('field_20', 10),
    ('field_21', 10),
    ('field_22', 3),
    ('field_24', 10),
    ('field_26', 10),
    ('field_27', 10),
    ('field_28', 10),
    ('field_30', 10),
    ('field_31', 10),
    ('field_36', 10),
]
oheFields = [pair[0] for pair in oheFieldsAndSizes]

In [45]:
oheDataFrame_getCategories(oheFieldsAndSizes, trainDF)

In [46]:
trainDF = oheDataFrame_transform(trainDF)

In [47]:
#trainDF.show(n=20)

Use Scikit Learn to OHE

In [48]:
# Get a Pandas DF
trainDF_pd = trainDF.toPandas()

In [49]:
#trainDF_pd.iloc[0,:]

In [50]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the re-indexed categorical columns.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(trainDF_pd[oheFields])
trainDF_ohe_sparse = enc.transform(trainDF_pd[oheFields])
# toarray() yields a plain ndarray. todense() would return the legacy
# np.matrix type, which NumPy discourages and which would carry through
# np.hstack into the combined feature matrix.
trainDF_ohe_dense = trainDF_ohe_sparse.toarray()

In [51]:
print('...Finished OHE')

...Finished OHE


## Scaling

Use Scikit Learn to Scale

In [52]:
print('Started scaling...')

Started scaling...


In [53]:
numericFields = ['field_0','field_2','field_5','field_7','field_8','field_9','field_11']

In [54]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the numeric columns and scale them in one call
scaler = MinMaxScaler()
trainDF_scaled = scaler.fit_transform(trainDF_pd[numericFields])

In [55]:
print('...Finished scaling')

...Finished scaling


## Recombine the matrices

In [56]:
trainDF_combined = np.hstack((trainDF_scaled, trainDF_ohe_dense))

Reclaim memory

In [57]:
del trainDF_scaled
del trainDF_ohe_dense

import gc
gc.collect()

28

## Convert back to a Spark DataFrame

In [58]:
print('Started converting to DataFrame...')

Started converting to DataFrame...


In [59]:
# Label rows 0..n-1 and name columns by their stringified position, then
# hand the pandas frame to Spark
indicies = list(range(trainDF_combined.shape[0]))
columnNames = [str(pos) for pos in range(trainDF_combined.shape[1])]
trainDF = spark.createDataFrame(
    pd.DataFrame(data=trainDF_combined, index=indicies, columns=columnNames)
)

In [60]:
print('...Finished converting to DataFrame')

...Finished converting to DataFrame


Reclaim memory

In [61]:
del trainDF_combined

import gc
gc.collect()

105

## Save cleaned DataFrame to a file

In [62]:
print('Started saving the DataFrames...')

Started saving the DataFrames...


In [63]:
# mode='overwrite' lets this cell be re-run after an interrupted or failed
# write; with the default save mode Spark raises an error when the target
# path already exists.
trainDF.write.csv(path='data/train1percentsample_cleaned_tmp.txt', header=False, sep='\t', mode='overwrite')
#testDF.write.csv(path='data/test1percentsample_cleaned_tmp.txt', header=False, sep='\t', mode='overwrite')

KeyboardInterrupt: 

In [None]:
!rm -rf data/train1percentsample_cleaned.txt
#!rm -rf data/test1percentsample_cleaned.txt
!mv data/train1percentsample_cleaned_tmp.txt data/train1percentsample_cleaned.txt
#!mv data/test1percentsample_cleaned_tmp.txt data/test1percentsample_cleaned.txt

In [None]:
print('...Finished saving the DataFrames')

In [None]:
print('The total time was: {} seconds'.format(time.time() - startTime))