# 2.1 Transactions Data Cleaning

##### Description

Basic data visualization and data formatting for transactions.csv

##### Notebook Steps

1. Connect Spark
1. Input Data
1. Examine Data
1. Data Cleaning
1. Output Data

## 1. Connect Spark

In [1]:
import pyspark
sc = pyspark.SparkContext(appName="trans-clean")
sc.setLogLevel("INFO")

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

## 2. Input Data

In [2]:
import os
import zipfile

with zipfile.ZipFile('../../data/1-data_acquisition/1-transactions.output.zip', 'r') as zip_ref:
    zip_ref.extractall()
    
df = sqlContext.read.csv('1-transactions.output.csv', header=True)

## 3. Examine Data

##### show()

In [3]:
df.show()

+--------------------+-----------------+-----------------+---------------+------------------+-------------+----------------+----------------------+---------+
|                msno|payment_method_id|payment_plan_days|plan_list_price|actual_amount_paid|is_auto_renew|transaction_date|membership_expire_date|is_cancel|
+--------------------+-----------------+-----------------+---------------+------------------+-------------+----------------+----------------------+---------+
|++6eU4LsQ3UQ20ILS...|               32|               90|            298|               298|            0|        20170131|              20170504|        0|
|++lvGPJOinuin/8es...|               41|               30|            149|               149|            1|        20150809|              20190412|        0|
|+/GXNtXWQVfKrEDqY...|               36|               30|            180|               180|            1|        20170303|              20170422|        0|
|+/w1UrZwyka4C9oNH...|               36|            

##### count()

In [4]:
df.count()

1431009

##### describe()

In [5]:
df.describe().show()

+-------+--------------------+-----------------+------------------+------------------+------------------+-------------------+--------------------+----------------------+-------------------+
|summary|                msno|payment_method_id| payment_plan_days|   plan_list_price|actual_amount_paid|      is_auto_renew|    transaction_date|membership_expire_date|          is_cancel|
+-------+--------------------+-----------------+------------------+------------------+------------------+-------------------+--------------------+----------------------+-------------------+
|  count|             1431009|          1431009|           1431009|           1431009|           1431009|            1431009|             1431009|               1431009|            1431009|
|   mean|                null|37.91835481118567| 66.01769590547649|281.78703488238017| 281.3172411913552| 0.7853025382789347|2.0168484537746444E7|   2.017110068205581E7|0.02455120827332323|
| stddev|                null|4.964804906926858|10

##### printSchema()

In [6]:
df.printSchema()

root
 |-- msno: string (nullable = true)
 |-- payment_method_id: string (nullable = true)
 |-- payment_plan_days: string (nullable = true)
 |-- plan_list_price: string (nullable = true)
 |-- actual_amount_paid: string (nullable = true)
 |-- is_auto_renew: string (nullable = true)
 |-- transaction_date: string (nullable = true)
 |-- membership_expire_date: string (nullable = true)
 |-- is_cancel: string (nullable = true)



##### columns

In [7]:
df.columns

['msno',
 'payment_method_id',
 'payment_plan_days',
 'plan_list_price',
 'actual_amount_paid',
 'is_auto_renew',
 'transaction_date',
 'membership_expire_date',
 'is_cancel']

##### head(5)

In [8]:
df.head(5)

[Row(msno='++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=', payment_method_id='32', payment_plan_days='90', plan_list_price='298', actual_amount_paid='298', is_auto_renew='0', transaction_date='20170131', membership_expire_date='20170504', is_cancel='0'),
 Row(msno='++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=', payment_method_id='41', payment_plan_days='30', plan_list_price='149', actual_amount_paid='149', is_auto_renew='1', transaction_date='20150809', membership_expire_date='20190412', is_cancel='0'),
 Row(msno='+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=', payment_method_id='36', payment_plan_days='30', plan_list_price='180', actual_amount_paid='180', is_auto_renew='1', transaction_date='20170303', membership_expire_date='20170422', is_cancel='0'),
 Row(msno='+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=', payment_method_id='36', payment_plan_days='30', plan_list_price='180', actual_amount_paid='180', is_auto_renew='1', transaction_date='20170329', membership_expire_date='20170331', 

##### tail(5)

In [9]:
df.tail(5)

[Row(msno='zwF50wwaJI2TBKWhB42HRBJ6EQK0jgSo1Xmwb9Jq3SU=', payment_method_id='32', payment_plan_days='180', plan_list_price='536', actual_amount_paid='536', is_auto_renew='0', transaction_date='20170215', membership_expire_date='20170817', is_cancel='0'),
 Row(msno='zx/h5MzQQmsSat04wSfGpHp6N8aWLLwM1+7OV7ujmPY=', payment_method_id='41', payment_plan_days='30', plan_list_price='149', actual_amount_paid='149', is_auto_renew='1', transaction_date='20170306', membership_expire_date='20170406', is_cancel='0'),
 Row(msno='zxvgjIKjy18Fm+cIWUfYKr68z09+ILBxuMW0DnbeUZ8=', payment_method_id='41', payment_plan_days='30', plan_list_price='99', actual_amount_paid='99', is_auto_renew='1', transaction_date='20170308', membership_expire_date='20170408', is_cancel='0'),
 Row(msno='zzNhkExbpzmpjp9tXefiCUBtgNLgS+vZE7fFfTRDJVc=', payment_method_id='38', payment_plan_days='30', plan_list_price='149', actual_amount_paid='149', is_auto_renew='0', transaction_date='20170318', membership_expire_date='20170417', i

##### Null per Column

In [14]:
from pyspark.sql.functions import isnan, when, count, col

df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

AnalysisException: cannot resolve 'isnan(`is_auto_renew`)' due to data type mismatch: argument 1 requires (double or float) type, however, '`is_auto_renew`' is of boolean type.;;
'Aggregate [count(CASE WHEN (isnan(cast(user_id#786 as double)) OR isnull(user_id#786)) THEN user_id END) AS user_id#923L, count(CASE WHEN (isnan(cast(payment_method_id#796 as double)) OR isnull(payment_method_id#796)) THEN payment_method_id END) AS payment_method_id#925L, count(CASE WHEN (isnan(cast(payment_plan_days#806 as double)) OR isnull(payment_plan_days#806)) THEN payment_plan_days END) AS payment_plan_days#927L, count(CASE WHEN (isnan(cast(plan_list_price#816 as double)) OR isnull(plan_list_price#816)) THEN plan_list_price END) AS plan_list_price#929L, count(CASE WHEN (isnan(cast(actual_amount_paid#826 as double)) OR isnull(actual_amount_paid#826)) THEN actual_amount_paid END) AS actual_amount_paid#931L, count(CASE WHEN (isnan(is_auto_renew#836) OR isnull(is_auto_renew#836)) THEN is_auto_renew END) AS is_auto_renew#933, count(CASE WHEN (isnan(transaction_date#856) OR isnull(transaction_date#856)) THEN transaction_date END) AS transaction_date#935, count(CASE WHEN (isnan(membership_expire_date#866) OR isnull(membership_expire_date#866)) THEN membership_expire_date END) AS membership_expire_date#937, count(CASE WHEN (isnan(is_cancel#846) OR isnull(is_cancel#846)) THEN is_cancel END) AS is_cancel#939]
+- Project [user_id#786, payment_method_id#796, payment_plan_days#806, plan_list_price#816, actual_amount_paid#826, is_auto_renew#836, transaction_date#856, to_date(membership_expire_date#23, Some(yyyyMMdd)) AS membership_expire_date#866, is_cancel#846]
   +- Project [user_id#786, payment_method_id#796, payment_plan_days#806, plan_list_price#816, actual_amount_paid#826, is_auto_renew#836, to_date(transaction_date#22, Some(yyyyMMdd)) AS transaction_date#856, membership_expire_date#23, is_cancel#846]
      +- Project [user_id#786, payment_method_id#796, payment_plan_days#806, plan_list_price#816, actual_amount_paid#826, is_auto_renew#836, transaction_date#22, membership_expire_date#23, cast(is_cancel#24 as boolean) AS is_cancel#846]
         +- Project [user_id#786, payment_method_id#796, payment_plan_days#806, plan_list_price#816, actual_amount_paid#826, cast(is_auto_renew#21 as boolean) AS is_auto_renew#836, transaction_date#22, membership_expire_date#23, is_cancel#24]
            +- Project [user_id#786, payment_method_id#796, payment_plan_days#806, plan_list_price#816, cast(actual_amount_paid#20 as int) AS actual_amount_paid#826, is_auto_renew#21, transaction_date#22, membership_expire_date#23, is_cancel#24]
               +- Project [user_id#786, payment_method_id#796, payment_plan_days#806, cast(plan_list_price#19 as int) AS plan_list_price#816, actual_amount_paid#20, is_auto_renew#21, transaction_date#22, membership_expire_date#23, is_cancel#24]
                  +- Project [user_id#786, payment_method_id#796, cast(payment_plan_days#18 as int) AS payment_plan_days#806, plan_list_price#19, actual_amount_paid#20, is_auto_renew#21, transaction_date#22, membership_expire_date#23, is_cancel#24]
                     +- Project [user_id#786, cast(payment_method_id#17 as int) AS payment_method_id#796, payment_plan_days#18, plan_list_price#19, actual_amount_paid#20, is_auto_renew#21, transaction_date#22, membership_expire_date#23, is_cancel#24]
                        +- Project [msno#16 AS user_id#786, payment_method_id#17, payment_plan_days#18, plan_list_price#19, actual_amount_paid#20, is_auto_renew#21, transaction_date#22, membership_expire_date#23, is_cancel#24]
                           +- Relation[msno#16,payment_method_id#17,payment_plan_days#18,plan_list_price#19,actual_amount_paid#20,is_auto_renew#21,transaction_date#22,membership_expire_date#23,is_cancel#24] csv


## 4. Data Cleaning

##### Column Names

In [11]:
df = df.withColumnRenamed("msno","user_id")
df.columns

['user_id',
 'payment_method_id',
 'payment_plan_days',
 'plan_list_price',
 'actual_amount_paid',
 'is_auto_renew',
 'transaction_date',
 'membership_expire_date',
 'is_cancel']

##### Data Types

In [12]:
from pyspark.sql import types
from pyspark.sql.functions import col, to_date

# Integer types
df = df.withColumn("payment_method_id",col("payment_method_id").cast(types.IntegerType()))
df = df.withColumn("payment_plan_days",col("payment_plan_days").cast(types.IntegerType()))
df = df.withColumn("plan_list_price",col("plan_list_price").cast(types.IntegerType()))
df = df.withColumn("actual_amount_paid",col("actual_amount_paid").cast(types.IntegerType()))

# Boolean types
df = df.withColumn("is_auto_renew",col("is_auto_renew").cast(types.BooleanType()))
df = df.withColumn("is_cancel",col("is_cancel").cast(types.BooleanType()))

# Date types
df= df.withColumn('transaction_date',to_date(df.transaction_date, 'yyyyMMdd'))
df= df.withColumn('membership_expire_date',to_date(df.membership_expire_date, 'yyyyMMdd'))

df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- payment_method_id: integer (nullable = true)
 |-- payment_plan_days: integer (nullable = true)
 |-- plan_list_price: integer (nullable = true)
 |-- actual_amount_paid: integer (nullable = true)
 |-- is_auto_renew: boolean (nullable = true)
 |-- transaction_date: date (nullable = true)
 |-- membership_expire_date: date (nullable = true)
 |-- is_cancel: boolean (nullable = true)



In [13]:
df.show()

+--------------------+-----------------+-----------------+---------------+------------------+-------------+----------------+----------------------+---------+
|             user_id|payment_method_id|payment_plan_days|plan_list_price|actual_amount_paid|is_auto_renew|transaction_date|membership_expire_date|is_cancel|
+--------------------+-----------------+-----------------+---------------+------------------+-------------+----------------+----------------------+---------+
|++6eU4LsQ3UQ20ILS...|               32|               90|            298|               298|        false|      2017-01-31|            2017-05-04|    false|
|++lvGPJOinuin/8es...|               41|               30|            149|               149|         true|      2015-08-09|            2019-04-12|    false|
|+/GXNtXWQVfKrEDqY...|               36|               30|            180|               180|         true|      2017-03-03|            2017-04-22|    false|
|+/w1UrZwyka4C9oNH...|               36|            