# 2.1 Members Data Cleaning

##### Description

Basic data visualization and data formatting for members.csv

##### Notebook Steps

1. Connect Spark
1. Input Data
1. Examine Data
1. Data Cleaning
1. Output Data

## 1. Connect Spark

In [1]:
import pyspark
from pyspark.sql import SQLContext

# Reuse the running SparkContext if one already exists (e.g. when this cell is
# re-executed); constructing a second context in the same process raises.
# If a context is reused, its existing configuration (including the app name)
# is kept and the conf passed here is ignored.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("mems-clean"))
sc.setLogLevel("INFO")

# SQL entry point used below to read the members CSV into a DataFrame.
sqlContext = SQLContext(sc)

## 2. Input Data

In [2]:
import os
import zipfile
import pandas as pd

# Unpack the acquisition-stage archive into the current working directory.
members_zip = '../../data/1-data_acquisition/1-members.output.zip'
with zipfile.ZipFile(members_zip, mode='r') as archive:
    archive.extractall()

# Read the extracted CSV using its first line as column names. No schema is
# supplied, so every column is loaded as a string.
df = sqlContext.read.option("header", True).csv('1-members.output.csv')

## 3. Examine Data

##### show()

In [3]:
# Preview the first rows of the freshly loaded DataFrame in tabular form.
df.show()

+--------------------+----+---+------+--------------+----------------------+
|                msno|city| bd|gender|registered_via|registration_init_time|
+--------------------+----+---+------+--------------+----------------------+
|Rb9UwLQTrxzBVwCB6...|   1|  0|  null|            11|              20110911|
|+tJonkh+O1CA796Fm...|   1|  0|  null|             7|              20110914|
|cV358ssn7a0f7jZOw...|   1|  0|  null|            11|              20110915|
|9bzDeJP6sQodK73K5...|   1|  0|  null|            11|              20110915|
|WFLY3s7z4EZsieHCt...|   6| 32|female|             9|              20110915|
|yLkV2gbZ4GLFwqTOX...|   4| 30|  male|             9|              20110916|
|jNCGK78YkTyId3H3w...|   1|  0|  null|             7|              20110916|
|WH5Jq4mgtfUFXh2yz...|   5| 34|  male|             9|              20110916|
|tKmbR4X5VXjHmxERr...|   5| 19|  male|             9|              20110917|
|I0yFvqMoNkM8ZNHb6...|  13| 63|  male|             9|              20110918|

##### count()

In [4]:
# Total number of rows read from the CSV.
df.count()

6769473

##### describe()

In [5]:
# Summary statistics per column. Every column is still a string at this point,
# so min/max are compared as text, not numerically (e.g. city max shows 9
# although larger city codes exist in the value counts).
df.describe().show()

+-------+--------------------+-----------------+------------------+-------+------------------+----------------------+
|summary|                msno|             city|                bd| gender|    registered_via|registration_init_time|
+-------+--------------------+-----------------+------------------+-------+------------------+----------------------+
|  count|             6769473|          6769473|           6769473|2339968|           6769473|               6769473|
|   mean|                null|3.847357689439045| 9.795794295951842|   null| 5.253068739619761|  2.0145175906336136E7|
| stddev|                null|5.478359063260526|17.925899719010822|   null|2.3613983148064808|    23186.007612070716|
|    min|+++4vcS9aMH7KWdfh...|                1|               -10| female|                -1|              20040326|
|    max|zzzyOgMk9MljCerbC...|                9|               994|   male|                 9|              20170429|
+-------+--------------------+-----------------+--------

##### printSchema()

In [6]:
# Show column names and types; the CSV was read without a schema, so all
# columns are strings.
df.printSchema()

root
 |-- msno: string (nullable = true)
 |-- city: string (nullable = true)
 |-- bd: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- registered_via: string (nullable = true)
 |-- registration_init_time: string (nullable = true)



##### columns

In [7]:
# List the column names.
df.columns

['msno', 'city', 'bd', 'gender', 'registered_via', 'registration_init_time']

##### head(5)

In [8]:
# First five rows as Row objects, with full (untruncated) values.
df.head(5)

[Row(msno='Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=', city='1', bd='0', gender=None, registered_via='11', registration_init_time='20110911'),
 Row(msno='+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=', city='1', bd='0', gender=None, registered_via='7', registration_init_time='20110914'),
 Row(msno='cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=', city='1', bd='0', gender=None, registered_via='11', registration_init_time='20110915'),
 Row(msno='9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=', city='1', bd='0', gender=None, registered_via='11', registration_init_time='20110915'),
 Row(msno='WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=', city='6', bd='32', gender='female', registered_via='9', registration_init_time='20110915')]

##### tail(5)

In [9]:
# Last five rows as Row objects.
# NOTE(review): DataFrame.tail is believed to require Spark 3.0+ - confirm the
# cluster version if this notebook is run elsewhere.
df.tail(5)

[Row(msno='VSGkb3hyBRUtb/b1MQUZbvOkktS3vKLnhMHW0CF8eyU=', city='1', bd='0', gender=None, registered_via='7', registration_init_time='20151020'),
 Row(msno='nWjH7glPkZ7jOVaCRwwjlpmp0T1hSWdv8hMJxiWCwKc=', city='1', bd='0', gender=None, registered_via='7', registration_init_time='20151020'),
 Row(msno='GH+b5+1tlv7ZZXsA8upBzVXMTLyffKcsF7WoU8b5rOI=', city='15', bd='26', gender='female', registered_via='4', registration_init_time='20151020'),
 Row(msno='XVlwT3fdCFGKqerEKBzUIjK+jzI6jzSke4cDMVhYyjE=', city='1', bd='0', gender=None, registered_via='4', registration_init_time='20151020'),
 Row(msno='isBt+JlgvZRNy6lxVr5vvuJ4lD00ofTaKyJ+uRnGcPg=', city='1', bd='0', gender=None, registered_via='4', registration_init_time='20151021')]

##### Null per Column

In [10]:
from pyspark.sql.functions import isnan, when, count, col

# One aggregate per column: when() without otherwise() yields null for rows
# that are neither NaN nor null, and count() ignores nulls, so each aggregate
# counts only the NaN/null entries of its column.
null_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_counts).show()

+----+----+---+-------+--------------+----------------------+
|msno|city| bd| gender|registered_via|registration_init_time|
+----+----+---+-------+--------------+----------------------+
|   0|   0|  0|4429505|             0|                     0|
+----+----+---+-------+--------------+----------------------+



##### Value Counts

In [11]:
# Frequency of each city value, least frequent first. show() prints only the
# first 20 rows, so the most frequent values are cut off when there are more
# than 20 distinct values.
df.groupBy('city').count().orderBy('count').show()

+----+------+
|city| count|
+----+------+
|  19|  1199|
|  20|  4233|
|  16|  5092|
|   7| 11610|
|   3| 27282|
|  17| 27772|
|  21| 30837|
|  10| 32482|
|  18| 38039|
|   8| 45975|
|  11| 47489|
|   9| 47639|
|  12| 66843|
|  14| 89940|
|   6|135200|
|  15|190213|
|  22|210407|
|   4|246848|
|  13|320978|
|   5|385069|
+----+------+
only showing top 20 rows



In [12]:
# Frequency of each bd value, least frequent first; the rarest values expose
# the outliers (negative and three/four-digit entries). Only 20 rows are shown.
df.groupBy('bd').count().orderBy('count').show()

+----+-----+
|  bd|count|
+----+-----+
| 786|    1|
| 584|    1|
|1056|    1|
| 323|    1|
|-512|    1|
|1111|    1|
|1958|    1|
| 685|    1|
|-489|    1|
|1021|    1|
| 334|    1|
| 743|    1|
| 462|    1|
| -30|    1|
| 919|    1|
| 124|    1|
| 926|    1|
| 155|    1|
| 940|    1|
|2016|    1|
+----+-----+
only showing top 20 rows



In [13]:
# Frequency of each gender value, including null, least frequent first.
df.groupBy('gender').count().orderBy('count').show()

+------+-------+
|gender|  count|
+------+-------+
|female|1144613|
|  male|1195355|
|  null|4429505|
+------+-------+



In [14]:
# Frequency of each registered_via value, least frequent first.
df.groupBy('registered_via').count().orderBy('count').show()

+--------------+-------+
|registered_via|  count|
+--------------+-------+
|            -1|      1|
|            18|      5|
|            10|     10|
|             1|     43|
|            14|    615|
|            16|    888|
|            19|    974|
|             6|   1213|
|             2|   1452|
|            17|   1494|
|             5|   3115|
|             8|   3982|
|            13|   5455|
|            11|  25047|
|             7| 805895|
|             9|1482863|
|             3|1643208|
|             4|2793213|
+--------------+-------+



## 4. Data Cleaning

In [15]:
from pyspark.sql import types
from pyspark.sql.functions import col, to_date, regexp_replace, when
from operator import add

### Columns

##### msno
The msno column corresponds to user ids for the dataset, so the column is renamed from msno to user_id.

In [16]:
# Rename the identifier column; all other columns are left untouched.
df = df.withColumnRenamed("msno","user_id")

##### city
The city column is currently formatted as a column of strings. The actual data is represented as integers, so the column is cast to match.

In [17]:
# Replace the string column with its integer cast (same column name).
# NOTE(review): with Spark's default (non-ANSI) settings, strings that cannot
# be parsed are expected to become null rather than raise - confirm no nulls
# were introduced.
df = df.withColumn("city",col("city").cast(types.IntegerType()))

##### bd
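The bd column has a high number of outlier values (for example, negative values such as -512 and values above 1000 such as 1958 and 2016, as seen in the value counts above), so the column will be dropped.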

In [18]:
# Remove the bd column from the DataFrame.
df = df.drop(df.bd)

##### gender
The gender column is null for 4,429,505 of the 6,769,473 rows (about 65%), so the column will be dropped.

In [19]:
# Remove the gender column from the DataFrame.
df = df.drop(df.gender)

##### registered_via
As with the city column, the registered_via column is cast from string to integer.

In [20]:
# Replace the string column with its integer cast (same column name).
# NOTE(review): with Spark's default (non-ANSI) settings, strings that cannot
# be parsed are expected to become null rather than raise - confirm no nulls
# were introduced.
df = df.withColumn("registered_via",col("registered_via").cast(types.IntegerType()))

##### registration_init_time
The registration_init_time column must be parsed and cast to a date object. The column is also renamed to registration_date.

In [21]:
# Parse the yyyyMMdd strings into dates, then give the column its final name.
parsed_date = to_date(col("registration_init_time"), 'yyyyMMdd')
df = (
    df.withColumn('registration_init_time', parsed_date)
      .withColumnRenamed("registration_init_time", "registration_date")
)

## 5. Output Data

##### Final Check

In [22]:
# Confirm the cleaned schema: renamed columns, integer casts and the date type.
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- city: integer (nullable = true)
 |-- registered_via: integer (nullable = true)
 |-- registration_date: date (nullable = true)



In [23]:
# Preview the cleaned rows before writing them out.
df.show()

+--------------------+----+--------------+-----------------+
|             user_id|city|registered_via|registration_date|
+--------------------+----+--------------+-----------------+
|Rb9UwLQTrxzBVwCB6...|   1|            11|       2011-09-11|
|+tJonkh+O1CA796Fm...|   1|             7|       2011-09-14|
|cV358ssn7a0f7jZOw...|   1|            11|       2011-09-15|
|9bzDeJP6sQodK73K5...|   1|            11|       2011-09-15|
|WFLY3s7z4EZsieHCt...|   6|             9|       2011-09-15|
|yLkV2gbZ4GLFwqTOX...|   4|             9|       2011-09-16|
|jNCGK78YkTyId3H3w...|   1|             7|       2011-09-16|
|WH5Jq4mgtfUFXh2yz...|   5|             9|       2011-09-16|
|tKmbR4X5VXjHmxERr...|   5|             9|       2011-09-17|
|I0yFvqMoNkM8ZNHb6...|  13|             9|       2011-09-18|
|OoDwiKZM+ZGr9P3fR...|   1|             7|       2011-09-18|
|dCvvBHlaOAqgkAcv3...|  22|             9|       2011-09-19|
|6bra2AiVV8SGlm7R6...|   4|             9|       2011-09-19|
|4De1jAxNRABoyRBDZ...|  

##### Output to File

In [24]:
# Destination for the cleaned members data. Spark writes a directory of part
# files at this path rather than a single CSV file.
filepath = '../../data/2-data_cleaning/2-members.output.csv'

# Use the built-in CSV writer ('com.databricks.spark.csv' is the legacy name of
# the external package) and overwrite any previous output: the default save
# mode raises if the path already exists, which breaks a second
# "Restart & Run All".
df.write.mode('overwrite').csv(filepath, header=True)