In [1]:
import pyspark

In [2]:
# Start Session
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('DiwaliSales').getOrCreate()

In [4]:
# Session summary
spark

In [5]:
# Reading CSV
df_pyspark = spark.read.csv('data.csv')

In [6]:
df_pyspark.show()

+-------+---------+----------+------+---------+---+--------------+----------------+--------+---------------+----------------+------+--------+------+--------+
|    _c0|      _c1|       _c2|   _c3|      _c4|_c5|           _c6|             _c7|     _c8|            _c9|            _c10|  _c11|    _c12|  _c13|    _c14|
+-------+---------+----------+------+---------+---+--------------+----------------+--------+---------------+----------------+------+--------+------+--------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|           State|    Zone|     Occupation|Product_Category|Orders|  Amount|Status|unnamed1|
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|     Maharashtra| Western|     Healthcare|            Auto|     1|   23952|  NULL|    NULL|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|  Andhra�Pradesh|Southern|           Govt|            Auto|     3|   23934|  NULL|    NULL|
|1001990|    Bindu| P00118542|     F|    26-35| 35| 

In [7]:
# As the above table shows it is not in proper format, i.e. it create _c0, _c1.. columns instead of using actual columns. Thus,

df_pyspark = spark.read.option('header', 'true').csv('data.csv')

# OR
# df_pyspark = spark.read.csv('data.csv', header=True, inferSchema=True)

In [8]:
df_pyspark.show(3)

+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|         State|    Zone|Occupation|Product_Category|Orders|Amount|Status|unnamed1|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|   Maharashtra| Western|Healthcare|            Auto|     1| 23952|  NULL|    NULL|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|Andhra�Pradesh|Southern|      Govt|            Auto|     3| 23934|  NULL|    NULL|
|1001990|    Bindu| P00118542|     F|    26-35| 35|             1| Uttar Pradesh| Central|Automobile|            Auto|     3| 23924|  NULL|    NULL|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+-----

In [9]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [10]:
# Head in spark is used to get first 3 rows in list format.
df_pyspark.head(3)

[Row(User_ID='1002903', Cust_name='Sanskriti', Product_ID='P00125942', Gender='F', Age Group='26-35', Age='28', Marital_Status='0', State='Maharashtra', Zone='Western', Occupation='Healthcare', Product_Category='Auto', Orders='1', Amount='23952', Status=None, unnamed1=None),
 Row(User_ID='1000732', Cust_name='Kartik', Product_ID='P00110942', Gender='F', Age Group='26-35', Age='35', Marital_Status='1', State='Andhra�Pradesh', Zone='Southern', Occupation='Govt', Product_Category='Auto', Orders='3', Amount='23934', Status=None, unnamed1=None),
 Row(User_ID='1001990', Cust_name='Bindu', Product_ID='P00118542', Gender='F', Age Group='26-35', Age='35', Marital_Status='1', State='Uttar Pradesh', Zone='Central', Occupation='Automobile', Product_Category='Auto', Orders='3', Amount='23924', Status=None, unnamed1=None)]

In [11]:
# Like pd.DataFrame.info() we use below method to see what col has what data type.
df_pyspark.printSchema()

root
 |-- User_ID: string (nullable = true)
 |-- Cust_name: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age Group: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Orders: string (nullable = true)
 |-- Amount: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- unnamed1: string (nullable = true)



__Reading Columns and indexing:__

In [12]:
# To see all the columns
df_pyspark.columns

['User_ID',
 'Cust_name',
 'Product_ID',
 'Gender',
 'Age Group',
 'Age',
 'Marital_Status',
 'State',
 'Zone',
 'Occupation',
 'Product_Category',
 'Orders',
 'Amount',
 'Status',
 'unnamed1']

In [13]:
# Viewing column -> Cust_name
df_pyspark.select('Cust_name').show(5)

+---------+
|Cust_name|
+---------+
|Sanskriti|
|   Kartik|
|    Bindu|
|   Sudevi|
|     Joni|
+---------+
only showing top 5 rows



In [14]:
# Viewing Multiple Cols: Cust_name, Age
df_pyspark.select(['Cust_name', 'Age']).show(5)

+---------+---+
|Cust_name|Age|
+---------+---+
|Sanskriti| 28|
|   Kartik| 35|
|    Bindu| 35|
|   Sudevi| 16|
|     Joni| 28|
+---------+---+
only showing top 5 rows



In [15]:
df_pyspark.dtypes

[('User_ID', 'string'),
 ('Cust_name', 'string'),
 ('Product_ID', 'string'),
 ('Gender', 'string'),
 ('Age Group', 'string'),
 ('Age', 'string'),
 ('Marital_Status', 'string'),
 ('State', 'string'),
 ('Zone', 'string'),
 ('Occupation', 'string'),
 ('Product_Category', 'string'),
 ('Orders', 'string'),
 ('Amount', 'string'),
 ('Status', 'string'),
 ('unnamed1', 'string')]

In [16]:
# Same as df.describe() in pandas, spark also hass .describe(). Values are NULL because it takes string into account as well.
df_pyspark.describe().show()

+-------+------------------+---------+----------+------+---------+------------------+------------------+--------------+-------+-----------+----------------+------------------+-----------------+------+--------+
|summary|           User_ID|Cust_name|Product_ID|Gender|Age Group|               Age|    Marital_Status|         State|   Zone| Occupation|Product_Category|            Orders|           Amount|Status|unnamed1|
+-------+------------------+---------+----------+------+---------+------------------+------------------+--------------+-------+-----------+----------------+------------------+-----------------+------+--------+
|  count|             11251|    11251|     11251| 11251|    11251|             11251|             11251|         11251|  11251|      11251|           11251|             11251|            11239|     0|       0|
|   mean| 1003004.488134388|     NULL|      NULL|  NULL|     NULL|35.421207003821884|0.4203181939383166|          NULL|   NULL|       NULL|            NULL|2.48

In [17]:
# Adding a new column
df_pyspark = df_pyspark.withColumn('Is 28 Year Old?', df_pyspark['Age']=='28')

# As Amount is string column we cannot execute below code but if it was int:
# df_pyspark.withColumn('Amount (in k)', df_pyspark['Amount']/1000)

In [18]:
df_pyspark.show(3)

+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+---------------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|         State|    Zone|Occupation|Product_Category|Orders|Amount|Status|unnamed1|Is 28 Year Old?|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+---------------+
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|   Maharashtra| Western|Healthcare|            Auto|     1| 23952|  NULL|    NULL|           true|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|Andhra�Pradesh|Southern|      Govt|            Auto|     3| 23934|  NULL|    NULL|          false|
|1001990|    Bindu| P00118542|     F|    26-35| 35|             1| Uttar Pradesh| Central|Automobile|            Auto|     3| 23924|  NULL|    NULL|          false|
+-------+-

In [19]:
# Dropping Columns:
df_pyspark = df_pyspark.drop('Is 28 Year Old?')

In [20]:
# Column Rename
df_pyspark = df_pyspark.withColumnRenamed('State', 'Pradesh')
df_pyspark.show(3)

+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|       Pradesh|    Zone|Occupation|Product_Category|Orders|Amount|Status|unnamed1|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|   Maharashtra| Western|Healthcare|            Auto|     1| 23952|  NULL|    NULL|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|Andhra�Pradesh|Southern|      Govt|            Auto|     3| 23934|  NULL|    NULL|
|1001990|    Bindu| P00118542|     F|    26-35| 35|             1| Uttar Pradesh| Central|Automobile|            Auto|     3| 23924|  NULL|    NULL|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+-----

__Handling Null Values__

In [21]:
# na.drop takes how as argument which has two option:
# 1. how = any -> drop a row if it contains any null value.
# 2. how = all -> Drops rows if all the values are null.

df_pyspark.na.drop(how='all').show(3)

+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|       Pradesh|    Zone|Occupation|Product_Category|Orders|Amount|Status|unnamed1|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|   Maharashtra| Western|Healthcare|            Auto|     1| 23952|  NULL|    NULL|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|Andhra�Pradesh|Southern|      Govt|            Auto|     3| 23934|  NULL|    NULL|
|1001990|    Bindu| P00118542|     F|    26-35| 35|             1| Uttar Pradesh| Central|Automobile|            Auto|     3| 23924|  NULL|    NULL|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+-----

In [22]:
# It also has thresh as argument which will drop rows having more than k null values, given thres=k.
df_pyspark.na.drop(thresh=2 ).show(3)

+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|       Pradesh|    Zone|Occupation|Product_Category|Orders|Amount|Status|unnamed1|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|   Maharashtra| Western|Healthcare|            Auto|     1| 23952|  NULL|    NULL|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|Andhra�Pradesh|Southern|      Govt|            Auto|     3| 23934|  NULL|    NULL|
|1001990|    Bindu| P00118542|     F|    26-35| 35|             1| Uttar Pradesh| Central|Automobile|            Auto|     3| 23924|  NULL|    NULL|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+-----

In [25]:
df_pyspark.na.drop(subset=["Zone","User_ID"]).show(3)

+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|       Pradesh|    Zone|Occupation|Product_Category|Orders|Amount|Status|unnamed1|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+------+--------+
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|   Maharashtra| Western|Healthcare|            Auto|     1| 23952|  NULL|    NULL|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|Andhra�Pradesh|Southern|      Govt|            Auto|     3| 23934|  NULL|    NULL|
|1001990|    Bindu| P00118542|     F|    26-35| 35|             1| Uttar Pradesh| Central|Automobile|            Auto|     3| 23924|  NULL|    NULL|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+-----

In [29]:
# Fills every None value with "Missing Value"
# df_pyspark.na.fill("Missing Values").show(3)

# Fills every None value with 0
df_pyspark.na.fill("Married", "Status").show(3)

+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+-------+--------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|       Pradesh|    Zone|Occupation|Product_Category|Orders|Amount| Status|unnamed1|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+------+-------+--------+
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|   Maharashtra| Western|Healthcare|            Auto|     1| 23952|Married|    NULL|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|Andhra�Pradesh|Southern|      Govt|            Auto|     3| 23934|Married|    NULL|
|1001990|    Bindu| P00118542|     F|    26-35| 35|             1| Uttar Pradesh| Central|Automobile|            Auto|     3| 23924|Married|    NULL|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------

In [30]:
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=['Age'], 
                  outputCols=['Age_Imputed']
                  ).setStrategy("mean")

In [33]:
# Use:
# imputer.fit(df_pyspark).transform(df_pyspark).show(3)