# 2.1 Logs Data Cleaning

##### Description

Basic data visualization and data formatting for logs.csv

##### Notebook Steps

1. Connect Spark
1. Input Data
1. Examine Data
1. Data Cleaning
1. Output Data

## 1. Connect Spark

In [1]:
import pyspark
sc = pyspark.SparkContext(appName="logs-clean")
sc.setLogLevel("INFO")

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

## 2. Input Data

In [2]:
import os
import zipfile

with zipfile.ZipFile('../../data/1-data_acquisition/1-logs.output.zip', 'r') as zip_ref:
    zip_ref.extractall()
    
df = sqlContext.read.csv('1-logs.output.csv', header=True)

## 3. Examine Data

##### show()

In [3]:
df.show()

+--------------------+--------+------+------+------+-------+-------+-------+----------+
|                msno|    date|num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|
+--------------------+--------+------+------+------+-------+-------+-------+----------+
|u9E91QDTvHLq6NXjE...|20170331|     8|     4|     0|      1|     21|     18|  6309.273|
|nTeWW/eOZA/UHKdD5...|20170330|     2|     2|     1|      0|      9|     11|  2390.699|
|2UqkWXwZbIjs03dHL...|20170331|    52|     3|     5|      3|     84|    110| 23203.337|
|ycwLc+m2O0a85jSLA...|20170331|   176|     4|     2|      2|     19|    191|  7100.454|
|EGcbTofOSOkMmQyN1...|20170331|     2|     1|     0|      1|    112|     93| 28401.558|
|qR/ndQ5B+1cY+c9ih...|20170331|     3|     0|     0|      0|     39|     41|  9786.842|
|N6ch5ArfJixq9mvAR...|20170330|     9|     1|     0|      0|     18|     26|  4920.255|
|JEjl2W1ivEI6epeob...|20170331|   181|    68|     5|      3|     54|    291| 22433.105|
|lPK4IYIFdfTT6pq7x...|20170331| 

##### count()

In [4]:
df.count()

18396362

##### describe()

In [5]:
df.describe().show()

+-------+--------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+
|summary|                msno|                date|            num_25|            num_50|            num_75|           num_985|          num_100|          num_unq|       total_secs|
+-------+--------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+
|  count|            18396362|            18396362|          18396362|          18396362|          18396362|          18396362|         18396362|         18396362|         18396362|
|   mean|                null|2.0170316085612632E7| 6.191400941120858|1.5087888572751504|0.9413759089976594|1.0799049290289027|30.28246090178047|29.03614551616238|7904.813556356482|
| stddev|                null|    8.91672003449104|13.428266336534906| 3.908538557820418|1

##### printSchema()

In [6]:
df.printSchema()

root
 |-- msno: string (nullable = true)
 |-- date: string (nullable = true)
 |-- num_25: string (nullable = true)
 |-- num_50: string (nullable = true)
 |-- num_75: string (nullable = true)
 |-- num_985: string (nullable = true)
 |-- num_100: string (nullable = true)
 |-- num_unq: string (nullable = true)
 |-- total_secs: string (nullable = true)



##### columns

In [7]:
df.columns

['msno',
 'date',
 'num_25',
 'num_50',
 'num_75',
 'num_985',
 'num_100',
 'num_unq',
 'total_secs']

##### head(5)

In [8]:
df.head(5)

[Row(msno='u9E91QDTvHLq6NXjEaWv8u4QIqhrHk72kE+w31Gnhdg=', date='20170331', num_25='8', num_50='4', num_75='0', num_985='1', num_100='21', num_unq='18', total_secs='6309.273'),
 Row(msno='nTeWW/eOZA/UHKdD5L7DEqKKFTjaAj3ALLPoAWsU8n0=', date='20170330', num_25='2', num_50='2', num_75='1', num_985='0', num_100='9', num_unq='11', total_secs='2390.699'),
 Row(msno='2UqkWXwZbIjs03dHLU9KHJNNEvEkZVzm69f3jCS+uLI=', date='20170331', num_25='52', num_50='3', num_75='5', num_985='3', num_100='84', num_unq='110', total_secs='23203.337'),
 Row(msno='ycwLc+m2O0a85jSLALtr941AaZt9ai8Qwlg9n0Nql5U=', date='20170331', num_25='176', num_50='4', num_75='2', num_985='2', num_100='19', num_unq='191', total_secs='7100.454'),
 Row(msno='EGcbTofOSOkMmQyN1NMLxHEXJ1yV3t/JdhGwQ9wXjnI=', date='20170331', num_25='2', num_50='1', num_75='0', num_985='1', num_100='112', num_unq='93', total_secs='28401.558')]

##### tail(5)

In [9]:
df.tail(5)

[Row(msno='FGpiy2mB+vXLKziYRcY/xJcJEFJfRDfUqlU+p760f7E=', date='20170314', num_25='0', num_50='0', num_75='0', num_985='0', num_100='1', num_unq='1', total_secs='248.058'),
 Row(msno='iZRjKNMrw5ffEbfXODLhV/0tJLPbOH3am1WYDgqBf8Q=', date='20170306', num_25='0', num_50='0', num_75='0', num_985='0', num_100='1', num_unq='1', total_secs='311.0'),
 Row(msno='yztw4Y0EggG0w2wPkbMZx7ke7saSx7dLSfMheHZG/DQ=', date='20170331', num_25='0', num_50='0', num_75='0', num_985='0', num_100='17', num_unq='1', total_secs='3973.189'),
 Row(msno='swCHwkNx30/aENjq30qqaLlm7bUUytbMXdz1bH7g0Jk=', date='20170307', num_25='0', num_50='0', num_75='0', num_985='1', num_100='0', num_unq='1', total_secs='179.278'),
 Row(msno='pDQSIFZchNPmOproRzaNzy51/8yRXzLK3vvDXKzzGLQ=', date='20170329', num_25='0', num_50='0', num_75='0', num_985='0', num_100='1', num_unq='1', total_secs='264.288')]

##### Null per Column

In [10]:
from pyspark.sql.functions import isnan, when, count, col

df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+----+----+------+------+------+-------+-------+-------+----------+
|msno|date|num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|
+----+----+------+------+------+-------+-------+-------+----------+
|   0|   0|     0|     0|     0|      0|      0|      0|         0|
+----+----+------+------+------+-------+-------+-------+----------+



## 4. Data Cleaning

##### Column Names

In [11]:
df = df.withColumnRenamed("msno","user_id")
df.columns

['user_id',
 'date',
 'num_25',
 'num_50',
 'num_75',
 'num_985',
 'num_100',
 'num_unq',
 'total_secs']

##### Data Types

In [12]:
from pyspark.sql import types
from pyspark.sql.functions import col, to_date

# Integer types
df = df.withColumn("num_25",col("num_25").cast(types.IntegerType()))
df = df.withColumn("num_50",col("num_50").cast(types.IntegerType()))
df = df.withColumn("num_75",col("num_75").cast(types.IntegerType()))
df = df.withColumn("num_985",col("num_985").cast(types.IntegerType()))
df = df.withColumn("num_100",col("num_100").cast(types.IntegerType()))
df = df.withColumn("num_unq",col("num_unq").cast(types.IntegerType()))

# Decimal types
df = df.withColumn("total_secs",col("total_secs").cast(types.DecimalType()))

# Date types
df= df.withColumn('date',to_date(df.date, 'yyyyMMdd'))

df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- num_25: integer (nullable = true)
 |-- num_50: integer (nullable = true)
 |-- num_75: integer (nullable = true)
 |-- num_985: integer (nullable = true)
 |-- num_100: integer (nullable = true)
 |-- num_unq: integer (nullable = true)
 |-- total_secs: decimal(10,0) (nullable = true)



In [13]:
df.show()

+--------------------+----------+------+------+------+-------+-------+-------+----------+
|             user_id|      date|num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|
+--------------------+----------+------+------+------+-------+-------+-------+----------+
|u9E91QDTvHLq6NXjE...|2017-03-31|     8|     4|     0|      1|     21|     18|      6309|
|nTeWW/eOZA/UHKdD5...|2017-03-30|     2|     2|     1|      0|      9|     11|      2391|
|2UqkWXwZbIjs03dHL...|2017-03-31|    52|     3|     5|      3|     84|    110|     23203|
|ycwLc+m2O0a85jSLA...|2017-03-31|   176|     4|     2|      2|     19|    191|      7100|
|EGcbTofOSOkMmQyN1...|2017-03-31|     2|     1|     0|      1|    112|     93|     28402|
|qR/ndQ5B+1cY+c9ih...|2017-03-31|     3|     0|     0|      0|     39|     41|      9787|
|N6ch5ArfJixq9mvAR...|2017-03-30|     9|     1|     0|      0|     18|     26|      4920|
|JEjl2W1ivEI6epeob...|2017-03-31|   181|    68|     5|      3|     54|    291|     22433|
|lPK4IYIFd