In [2]:
# Pyspark DE Project "FlipKart Data Engineering Project EXPOSED for Beginners! | END to END PySpark Project" by "Be a Programmer YT Channel"
#import libraries
#!pip install kaggle
import kaggle
#download dataset using kaggle api --downloading anyone playstore dataset using old api
!kaggle datasets download san2deep/flipkart-product-dataset -f test.csv


  0%|          | 0.00/495k [00:00<?, ?B/s]
100%|██████████| 495k/495k [00:01<00:00, 295kB/s]
100%|██████████| 495k/495k [00:01<00:00, 295kB/s]


Dataset URL: https://www.kaggle.com/datasets/san2deep/flipkart-product-dataset
License(s): DbCL-1.0
Downloading test.csv to d:\Data_Engineer\Flipkart_Pyspark_DE_Project



In [49]:
# install and import pyspark 
#!pip install pyspark it is already installed
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import *
spark=SparkSession.builder\
.appName("FlipkartPysparkApp")\
.config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")\
.getOrCreate()
#Create Dataframe
df=spark.read.load('test.csv',format='csv',sep=",",header='true',escape='"',inferSchema='True')# we can use read.csv to directly reading csv file but by default read.load is reading parquet format for other we need to define format

In [50]:
df.show(5) #df.show(5, truncate=False) means all column are fully visible

+-----+--------------------+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+
|   id|               title|Rating|maincateg|platform|actprice1|norating1|noreviews1|star_5f|star_4f|star_3f|star_2f|star_1f|fulfilled1|
+-----+--------------------+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+
| 2242|Casuals For Men  ...|   3.8|      Men|Flipkart|      999|    27928|      3543|  14238|   4295|   3457|   1962|   3976|         1|
|20532|Women Black Flats...|   3.9|    Women|Flipkart|      499|     3015|       404|   1458|    657|    397|    182|    321|         1|
|10648|Women Gold Wedges...|   3.9|    Women|Flipkart|      999|      449|        52|    229|     70|     71|     33|     46|         1|
|20677|Men's Height Incr...|   3.9|      Men|Flipkart|     2999|      290|        40|    141|     51|     49|     17|     32|         1|
|12593|Loafers For Men  ...|   3.9|      

In [51]:
#checking the schema
df.printSchema() #it will show the datatype of each column initially we define inferSchema='true' means spark will create same schema as source file
df.describe().show() #Summarizes the DataFrame by providing basic statistics (e.g., count, mean, standard deviation, min, max) for numeric columns.


root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- maincateg: string (nullable = true)
 |-- platform: string (nullable = true)
 |-- actprice1: integer (nullable = true)
 |-- norating1: integer (nullable = true)
 |-- noreviews1: integer (nullable = true)
 |-- star_5f: integer (nullable = true)
 |-- star_4f: integer (nullable = true)
 |-- star_3f: integer (nullable = true)
 |-- star_2f: integer (nullable = true)
 |-- star_1f: integer (nullable = true)
 |-- fulfilled1: integer (nullable = true)

+-------+------------------+--------------------+-------------------+---------+--------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+-------------------+
|summary|                id|               title|             Rating|maincateg|platform|         actprice1|         norating1|        noreviews1|          star_5f|          s

In [52]:
#handling the missing data
#df.select([count(when(col(c).isNull(),c)).alias(c) for c in df.columns]).show()
#drop the rows that is missing
df_cleanflip=df.dropna()
df_cleanflip.select([count(when(col(c).isNull(),c)).alias(c) for c in df_cleanflip.columns]).show()
#filling specific values to the columns or missing columns
df_filledflip=df.fillna({"Rating":0})


+---+-----+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+
| id|title|Rating|maincateg|platform|actprice1|norating1|noreviews1|star_5f|star_4f|star_3f|star_2f|star_1f|fulfilled1|
+---+-----+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+
|  0|    0|     0|        0|       0|        0|        0|         0|      0|      0|      0|      0|      0|         0|
+---+-----+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+



In [54]:
# Filter products with ratings greater than 4 and priced below 1000
high_rated_products=df_filledflip.filter((col("Rating")>4))
high_rated_products.show(5) #show the result

+-----+--------------------+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+
|   id|               title|Rating|maincateg|platform|actprice1|norating1|noreviews1|star_5f|star_4f|star_3f|star_2f|star_1f|fulfilled1|
+-----+--------------------+------+---------+--------+---------+---------+----------+-------+-------+-------+-------+-------+----------+
| 6433|ARYA - DIFFERENT ...|   4.2|    Women|Flipkart|     4299|      166|        24|     94|     39|     12|      6|     15|         1|
|13859|Women Black Wedge...|   4.2|    Women|Flipkart|      279|     3048|       487|   1746|    679|    348|    114|    161|         1|
|19453|Denill Ankle Leng...|   4.1|    Women|Flipkart|      999|     6806|       961|   3646|   1508|    810|    335|    510|         1|
| 8121|Women Grey Heels ...|   4.2|    Women|Flipkart|     1990|       16|         1|     10|      2|      1|      3|      0|         1|
| 9791|Pink Perfect Styl...|   4.2|    Wo

In [55]:
#group the category and calculate the averate rating
avg_rating_by_category=df_filledflip.groupBy("maincateg").avg("Rating")
avg_rating_by_category.show()

+---------+------------------+
|maincateg|       avg(Rating)|
+---------+------------------+
|     NULL|0.7567164179104477|
|      Men|3.8350116550116575|
|    Women|  3.93901715039579|
+---------+------------------+



In [56]:
#Total Revenue by category
total_revenue_by_category=df_filledflip.groupBy("maincateg").agg(sum("Rating"))
total_revenue_by_category.show()

+---------+------------------+
|maincateg|       sum(Rating)|
+---------+------------------+
|     NULL|50.699999999999996|
|      Men| 8226.100000000006|
|    Women|11943.100000000035|
+---------+------------------+

