# 2.4 Train Data Cleaning

##### Description

Basic data visualization and data formatting for train.csv

##### Notebook Steps

1. Connect Spark
1. Input Data
1. Examine Data
1. Data Cleaning
1. Output Data

## 1. Connect Spark

In [1]:
import pyspark
from pyspark.sql import SQLContext

# Start the Spark application named "train-clean", or reuse the context that
# is already running in this kernel. Constructing SparkContext directly raises
# if the cell is executed a second time, so getOrCreate keeps it re-runnable.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("train-clean"))
sc.setLogLevel("INFO")

# SQL entry point used by the following cells to read the input data.
sqlContext = SQLContext(sc)

## 2. Input Data

In [2]:
import os
import zipfile

zip_path = '../../data/1-data_acquisition/1-train.output.zip'
csv_path = '1-train.output.csv'

# Unpack every member of the archive into the current working directory.
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall()

# Load the extracted CSV, treating its first line as the column header.
df = sqlContext.read.csv(csv_path, header=True)

## 3. Examine Data

##### show()

In [3]:
# Preview the loaded data: prints the first 20 rows with long values truncated.
df.show()

+--------------------+--------+
|                msno|is_churn|
+--------------------+--------+
|ugx0CjOMzazClkFzU...|       1|
|f/NmvEzHfhINFEYZT...|       1|
|zLo9f73nGGT1p21lt...|       1|
|8iF/+8HY8lJKFrTc7...|       1|
|K6fja4+jmoZ5xG6By...|       1|
|ibIHVYBqxGwrSExE6...|       1|
|kVmM8X4iBPCOfK/m1...|       1|
|moRTKhKIDvb+C8ZHO...|       1|
|dW/tPZMDh2Oz/ksdu...|       1|
|otEcMhAX3mU4gumUS...|       1|
|t5rqTxCnG7s5VBgEf...|       1|
|dfLS2/Pom6O3iUpo+...|       1|
|a7AtvhlY8KnKZGpiV...|       1|
|F45GsXJIeLvzUJqz/...|       1|
|SJCoxreWp6Cu9WPit...|       1|
|Oo2RDQixJ0pRWqec4...|       1|
|f91n3lDipDjRtAVNg...|       1|
|/L2095JD4M/BNLTCb...|       1|
|1AzXWFlRO6EfMBzfB...|       1|
|WkF/FvlzpBLFoa+Hm...|       1|
+--------------------+--------+
only showing top 20 rows



##### count()

In [4]:
# Total number of rows in the loaded DataFrame.
df.count()

970960

##### describe()

In [5]:
# Summary statistics (count, mean, stddev, min, max) for each column.
df.describe().show()

+-------+--------------------+-------------------+
|summary|                msno|           is_churn|
+-------+--------------------+-------------------+
|  count|              970960|             970960|
|   mean|                null|0.08994191315811156|
| stddev|                null| 0.2860986712938496|
|    min|+++hVY1rZox/33Ytv...|                  0|
|    max|zzzF1KsGfHH3qI6qi...|                  1|
+-------+--------------------+-------------------+



##### printSchema()

In [6]:
# Print each column's name, type and nullability as loaded from the CSV.
df.printSchema()

root
 |-- msno: string (nullable = true)
 |-- is_churn: string (nullable = true)



##### columns

In [7]:
# Column names of the loaded DataFrame, as a list of strings.
df.columns

['msno', 'is_churn']

##### head(5)

In [8]:
# First five rows, returned as a list of Row objects.
df.head(5)

[Row(msno='ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=', is_churn='1'),
 Row(msno='f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=', is_churn='1'),
 Row(msno='zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=', is_churn='1'),
 Row(msno='8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=', is_churn='1'),
 Row(msno='K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=', is_churn='1')]

##### tail(5)

In [9]:
# Last five rows, returned as a list of Row objects.
df.tail(5)

[Row(msno='OHnZbu+EVaP+vN7Z+OfT5OMcp90MWFZonmM0o3pb8FY=', is_churn='0'),
 Row(msno='S92bDK//uI6hk3u1vuApro0qJBQOToBozZ7lL1yTC+w=', is_churn='0'),
 Row(msno='eUa3xo16vpAjr43Cjlb6Kjf1NTILYyJIkBayJQdXWnw=', is_churn='0'),
 Row(msno='iZE41tbAQ65rJq60olkJT4BJzuUAYgQdfbEemXe/TTk=', is_churn='0'),
 Row(msno='oECkzJik4wKsbOEVY6UACLbmgM8qymFdb5cJaHrodY8=', is_churn='0')]

##### Null per Column

In [10]:
from pyspark.sql.functions import isnan, when, count, col

# Build one aggregate per column that counts the rows whose value is NaN or
# null, then show all of them side by side in a single result row.
df.select(*[
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]).show()

+----+--------+
|msno|is_churn|
+----+--------+
|   0|       0|
+----+--------+



##### Value Counts

In [11]:
# Number of rows for each is_churn value, sorted by ascending row count.
(
    df.groupBy('is_churn')
    .count()
    .orderBy('count')
    .show()
)

+--------+------+
|is_churn| count|
+--------+------+
|       1| 87330|
|       0|883630|
+--------+------+



## 4. Data Cleaning

In [12]:
from pyspark.sql import types
from pyspark.sql.functions import col, to_date

##### msno
The msno column corresponds to user ids for the dataset, so the column is renamed from msno to user_id.

In [13]:
# Rename the msno column to user_id; every other column is left unchanged.
df = df.withColumnRenamed("msno","user_id")

##### is_churn
The is_churn column is cast from string to boolean.

In [14]:
# Replace the string-typed is_churn column with its boolean cast
# (the string '1' becomes true).
df = df.withColumn("is_churn", col("is_churn").cast("boolean"))

## 5. Output Data

##### Final Check

In [15]:
# Final check: print the column names and types of the DataFrame that will be written out.
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- is_churn: boolean (nullable = true)



In [16]:
# Final check: preview the first 20 rows of the DataFrame that will be written out.
df.show()

+--------------------+--------+
|             user_id|is_churn|
+--------------------+--------+
|ugx0CjOMzazClkFzU...|    true|
|f/NmvEzHfhINFEYZT...|    true|
|zLo9f73nGGT1p21lt...|    true|
|8iF/+8HY8lJKFrTc7...|    true|
|K6fja4+jmoZ5xG6By...|    true|
|ibIHVYBqxGwrSExE6...|    true|
|kVmM8X4iBPCOfK/m1...|    true|
|moRTKhKIDvb+C8ZHO...|    true|
|dW/tPZMDh2Oz/ksdu...|    true|
|otEcMhAX3mU4gumUS...|    true|
|t5rqTxCnG7s5VBgEf...|    true|
|dfLS2/Pom6O3iUpo+...|    true|
|a7AtvhlY8KnKZGpiV...|    true|
|F45GsXJIeLvzUJqz/...|    true|
|SJCoxreWp6Cu9WPit...|    true|
|Oo2RDQixJ0pRWqec4...|    true|
|f91n3lDipDjRtAVNg...|    true|
|/L2095JD4M/BNLTCb...|    true|
|1AzXWFlRO6EfMBzfB...|    true|
|WkF/FvlzpBLFoa+Hm...|    true|
+--------------------+--------+
only showing top 20 rows



##### Output to File

In [17]:
filepath = '../../data/2-data_cleaning/2-train.output.csv'

# Write the cleaned DataFrame as CSV with a header row, using Spark's built-in
# CSV writer instead of the legacy 'com.databricks.spark.csv' format name.
# mode='overwrite' replaces output left by a previous run; with the default
# save mode the cell fails as soon as the path already exists, which breaks
# Restart & Run All.
# NOTE: Spark normally writes a directory of part files at this path rather
# than a single .csv file.
df.write.csv(filepath, header=True, mode='overwrite')