# Task 2.2: Data Analysis using Big Data tools

### Loading data into PySpark

In [1]:
import findspark
findspark.init('/usr/local/spark')
import pyspark
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import col,isnan, when, count
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("capstone_checkpoint_two") \
    .getOrCreate()

In [3]:
data = spark.read.csv('hdfs://localhost:54310/user/hduser/User_product_purchase_details_p2.csv', inferSchema="true", header="true")
data.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Code: string (nullable = true)
 |-- State_Code: string (nullable = true)
 |-- Zip_Code: integer (nullable = true)
 |-- City_Name: string (nullable = true)
 |-- State: string (nullable = true)



In [5]:
spark.sql("CREATE DATABASE IF NOT EXISTS capstone")

DataFrame[]

In [6]:
spark.sql("show databases").show()

+------------+
|databaseName|
+------------+
|    capstone|
|     default|
+------------+



In [7]:
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [8]:
data.write.mode("overwrite").saveAsTable("capstone.purchase")
table=spark.sql('select * from capstone.purchase')
table.show()

+-------+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+------+-----+----------+---------+----------+--------+---------------+----------+
|User_ID|Product_ID|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|Gender|  Age|Occupation|City_Code|State_Code|Zip_Code|      City_Name|     State|
+-------+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+------+-----+----------+---------+----------+--------+---------------+----------+
|1000001| P00069042|            A|                         2|             0|                 3|              null|              null|    8370|     F| 0-17|        10|     C259|       S16|   42420|      Henderson|  Kentucky|
|1000001| P00248942|            A|                         2|             0|                 1|         

In [9]:
spark.sql("use capstone")

DataFrame[]

In [10]:
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|capstone| purchase|      false|
+--------+---------+-----------+



In [12]:
table.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Code: string (nullable = true)
 |-- State_Code: string (nullable = true)
 |-- Zip_Code: integer (nullable = true)
 |-- City_Name: string (nullable = true)
 |-- State: string (nullable = true)



In [13]:
df=spark.sql('select * from capstone.purchase')

DATA PROFILING

In [14]:
#Find the no. of null for each column 
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+-------+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+------+------+----------+---------+----------+--------+---------+------+
|User_ID|Product_ID|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|Gender|   Age|Occupation|City_Code|State_Code|Zip_Code|City_Name| State|
+-------+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+------+------+----------+---------+----------+--------+---------+------+
|      0|         0|            0|                         0|             0|                 0|            173638|            383247|       0|547782|550053|    550055|   550055|    550049|  550053|   550055|550049|
+-------+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+-------

In [15]:
#find the mean of each column
df.select(mean ('Product_Category_1'), mean ('Product_Category_2'), mean ('Product_Category_3'), mean ('Purchase')).show()

+-----------------------+-----------------------+-----------------------+-----------------+
|avg(Product_Category_1)|avg(Product_Category_2)|avg(Product_Category_3)|    avg(Purchase)|
+-----------------------+-----------------------+-----------------------+-----------------+
|      5.404270017525106|      9.842329251122386|     12.668243206790512|9263.968712959126|
+-----------------------+-----------------------+-----------------------+-----------------+



In [16]:
#find the max and min values of each column
df.agg({'Product_Category_1': 'max'}).show()
df.agg({'Product_Category_2': 'max'}).show()
df.agg({'Product_Category_3': 'max'}).show()
df.agg({'Purchase': 'max'}).show()

+-----------------------+
|max(Product_Category_1)|
+-----------------------+
|                     20|
+-----------------------+

+-----------------------+
|max(Product_Category_2)|
+-----------------------+
|                     18|
+-----------------------+

+-----------------------+
|max(Product_Category_3)|
+-----------------------+
|                     18|
+-----------------------+

+-------------+
|max(Purchase)|
+-------------+
|        23961|
+-------------+



In [17]:
df.agg({'Product_Category_1': 'min'}).show()
df.agg({'Product_Category_2': 'min'}).show()
df.agg({'Product_Category_3': 'min'}).show()
df.agg({'Purchase': 'min'}).show()

+-----------------------+
|min(Product_Category_1)|
+-----------------------+
|                      1|
+-----------------------+

+-----------------------+
|min(Product_Category_2)|
+-----------------------+
|                      2|
+-----------------------+

+-----------------------+
|min(Product_Category_3)|
+-----------------------+
|                      3|
+-----------------------+

+-------------+
|min(Purchase)|
+-------------+
|           12|
+-------------+



In [18]:
df.describe().show()

+-------+------------------+----------+-------------+--------------------------+-------------------+------------------+------------------+------------------+-----------------+------+----+-----------------+---------+----------+-----------------+---------------+----------+
|summary|           User_ID|Product_ID|City_Category|Stay_In_Current_City_Years|     Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|         Purchase|Gender| Age|       Occupation|City_Code|State_Code|         Zip_Code|      City_Name|     State|
+-------+------------------+----------+-------------+--------------------------+-------------------+------------------+------------------+------------------+-----------------+------+----+-----------------+---------+----------+-----------------+---------------+----------+
|  count|            550068|    550068|       550068|                    550068|             550068|            550068|            376430|            166821|           550068|  2286|  

In [19]:
df.count()

550068

In [20]:
df.select(countDistinct("Product_ID")).show()

+--------------------------+
|count(DISTINCT Product_ID)|
+--------------------------+
|                      3631|
+--------------------------+



In [21]:
df.select(countDistinct("User_ID")).show()

+-----------------------+
|count(DISTINCT User_ID)|
+-----------------------+
|                   5891|
+-----------------------+



In [22]:
df.select(countDistinct("City_Name")).show()

+-------------------------+
|count(DISTINCT City_Name)|
+-------------------------+
|                        3|
+-------------------------+



In [23]:
df.select(countDistinct("State")).show()

+---------------------+
|count(DISTINCT State)|
+---------------------+
|                    3|
+---------------------+



In [26]:
df1 = df.na.drop()

In [27]:
df1.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in nonulldf.columns]).show()


+-------+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+------+---+----------+---------+----------+--------+---------+-----+
|User_ID|Product_ID|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|Gender|Age|Occupation|City_Code|State_Code|Zip_Code|City_Name|State|
+-------+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+------+---+----------+---------+----------+--------+---------+-----+
|      0|         0|            0|                         0|             0|                 0|                 0|                 0|       0|     0|  0|         0|        0|         0|       0|        0|    0|
+-------+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+------+---+---

In [29]:
df1.dtypes

[('User_ID', 'int'),
 ('Product_ID', 'string'),
 ('City_Category', 'string'),
 ('Stay_In_Current_City_Years', 'string'),
 ('Marital_Status', 'int'),
 ('Product_Category_1', 'int'),
 ('Product_Category_2', 'int'),
 ('Product_Category_3', 'int'),
 ('Purchase', 'int'),
 ('Gender', 'string'),
 ('Age', 'string'),
 ('Occupation', 'int'),
 ('City_Code', 'string'),
 ('State_Code', 'string'),
 ('Zip_Code', 'int'),
 ('City_Name', 'string'),
 ('State', 'string')]

In [28]:
#ML modelling
features = ['Product_ID','Marital_Status','Gender','Age','City_Name','State']
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = features,outputCol='features')
new_df = assembler.transform(df1)

IllegalArgumentException: 'Data type StringType is not supported.'