In [101]:
from pyspark.sql import SparkSession

In [102]:
# Create (or reuse) the SparkSession, named 'Practise', that the later cells
# read data through.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [103]:
# Echo the session object as the cell's result to confirm it was created.
spark

In [128]:
# Load the data set from CSV: the first row supplies the column names and
# Spark infers each column's type from the values.
df_pyspark = spark.read.csv('apple_products.csv', header=True, inferSchema=True)

In [105]:
# Print the schema to see the type Spark inferred for each column and whether it is nullable

df_pyspark.printSchema()

root
 |-- Product Name: string (nullable = true)
 |-- Product URL: string (nullable = true)
 |-- Brand: string (nullable = true)
 |-- Sale Price: integer (nullable = true)
 |-- Mrp: integer (nullable = true)
 |-- Discount Percentage: integer (nullable = true)
 |-- Number Of Ratings: integer (nullable = true)
 |-- Number Of Reviews: integer (nullable = true)
 |-- Upc: string (nullable = true)
 |-- Star Rating: double (nullable = true)
 |-- Ram: string (nullable = true)



In [106]:
# Same load, expressed through the reader's options() builder, followed by a
# tabular preview of the rows.
df_pyspark = spark.read.options(header=True, inferSchema=True).csv('apple_products.csv')
df_pyspark.show()

+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|        Product Name|         Product URL|Brand|Sale Price|   Mrp|Discount Percentage|Number Of Ratings|Number Of Reviews|             Upc|Star Rating| Ram|
+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     49900| 49900|                  0|             3431|              356|MOBEXRGV7EHHTGUH|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVAC6TJT4F|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVGETABXWZ|        4.6|2 GB|
|APPLE iPhone 8 (S...|https://www.flipk...|Apple|   

In [107]:
# Check what kind of object the CSV reader returned.
type(df_pyspark)

In [108]:
# Get the column names (.columns lists names only; types come from printSchema() or .dtypes)

df_pyspark.columns

['Product Name',
 'Product URL',
 'Brand',
 'Sale Price',
 'Mrp',
 'Discount Percentage',
 'Number Of Ratings',
 'Number Of Reviews',
 'Upc',
 'Star Rating',
 'Ram']

In [109]:
# Fetch the first three rows as a Python list of Row objects.
df_pyspark.head(n=3)

[Row(Product Name='APPLE iPhone 8 Plus (Gold, 64 GB)', Product URL='https://www.flipkart.com/apple-iphone-8-plus-gold-64-gb/p/itmexrgvuzgzttzh?pid=MOBEXRGV7EHHTGUH', Brand='Apple', Sale Price=49900, Mrp=49900, Discount Percentage=0, Number Of Ratings=3431, Number Of Reviews=356, Upc='MOBEXRGV7EHHTGUH', Star Rating=4.6, Ram='2 GB'),
 Row(Product Name='APPLE iPhone 8 Plus (Space Grey, 256 GB)', Product URL='https://www.flipkart.com/apple-iphone-8-plus-space-grey-256-gb/p/itmexrgvzkbyqgtf?pid=MOBEXRGVAC6TJT4F', Brand='Apple', Sale Price=84900, Mrp=84900, Discount Percentage=0, Number Of Ratings=3431, Number Of Reviews=356, Upc='MOBEXRGVAC6TJT4F', Star Rating=4.6, Ram='2 GB'),
 Row(Product Name='APPLE iPhone 8 Plus (Silver, 256 GB)', Product URL='https://www.flipkart.com/apple-iphone-8-plus-silver-256-gb/p/itmexrgvxatuyrqw?pid=MOBEXRGVGETABXWZ', Brand='Apple', Sale Price=84900, Mrp=84900, Discount Percentage=0, Number Of Ratings=3431, Number Of Reviews=356, Upc='MOBEXRGVGETABXWZ', Star Rat

In [110]:
# Preview the rows as a formatted table; long string values are cut short with '...'.
df_pyspark.show()

+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|        Product Name|         Product URL|Brand|Sale Price|   Mrp|Discount Percentage|Number Of Ratings|Number Of Reviews|             Upc|Star Rating| Ram|
+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     49900| 49900|                  0|             3431|              356|MOBEXRGV7EHHTGUH|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVAC6TJT4F|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVGETABXWZ|        4.6|2 GB|
|APPLE iPhone 8 (S...|https://www.flipk...|Apple|   

In [111]:
# select() returns a new DataFrame holding only the requested column; no rows
# are printed until show() is called on it.
df_pyspark.select('Product Name')

DataFrame[Product Name: string]

In [112]:
# Check the type of the object that select() returns.
type(df_pyspark.select('Product Name'))

In [113]:
# Display only the first three product names.
df_pyspark.select('Product Name').show(n=3)

+--------------------+
|        Product Name|
+--------------------+
|APPLE iPhone 8 Pl...|
|APPLE iPhone 8 Pl...|
|APPLE iPhone 8 Pl...|
+--------------------+
only showing top 3 rows



In [114]:
# Project two columns by passing their names as separate arguments.
df_pyspark.select('Product Name', 'Sale Price')

DataFrame[Product Name: string, Sale Price: int]

In [115]:
# Display the first two rows of the name/price projection.
df_pyspark.select('Product Name', 'Sale Price').show(n=2)

+--------------------+----------+
|        Product Name|Sale Price|
+--------------------+----------+
|APPLE iPhone 8 Pl...|     49900|
|APPLE iPhone 8 Pl...|     84900|
+--------------------+----------+
only showing top 2 rows



In [116]:
# Indexing by column name yields a Column object, not a DataFrame.
df_pyspark['Product Name']

Column<'Product Name'>

In [117]:
# Alternative to printSchema(): .dtypes returns a list of (column name, type) pairs
df_pyspark.dtypes

[('Product Name', 'string'),
 ('Product URL', 'string'),
 ('Brand', 'string'),
 ('Sale Price', 'int'),
 ('Mrp', 'int'),
 ('Discount Percentage', 'int'),
 ('Number Of Ratings', 'int'),
 ('Number Of Reviews', 'int'),
 ('Upc', 'string'),
 ('Star Rating', 'double'),
 ('Ram', 'string')]

In [118]:
# describe() builds a summary DataFrame; every column in it, including the
# numeric ones, is typed as string.
df_pyspark.describe()

DataFrame[summary: string, Product Name: string, Product URL: string, Brand: string, Sale Price: string, Mrp: string, Discount Percentage: string, Number Of Ratings: string, Number Of Reviews: string, Upc: string, Star Rating: string, Ram: string]

In [119]:
# Render the summary statistics (count, mean, stddev, ...) as a table.
df_pyspark.describe().show()

+-------+--------------------+--------------------+-----+-----------------+------------------+-------------------+-----------------+------------------+----------------+-------------------+----+
|summary|        Product Name|         Product URL|Brand|       Sale Price|               Mrp|Discount Percentage|Number Of Ratings| Number Of Reviews|             Upc|        Star Rating| Ram|
+-------+--------------------+--------------------+-----+-----------------+------------------+-------------------+-----------------+------------------+----------------+-------------------+----+
|  count|                  62|                  62|   62|               62|                62|                 62|               62|                62|              62|                 62|  62|
|   mean|                NULL|                NULL| NULL| 80073.8870967742| 88058.06451612903|  9.951612903225806|22420.40322580645|1861.6774193548388|            NULL|  4.575806451612904|NULL|
| stddev|                NULL|

In [120]:
# print() writes the DataFrame's column-name/type summary, not its rows.
print(df_pyspark)

DataFrame[Product Name: string, Product URL: string, Brand: string, Sale Price: int, Mrp: int, Discount Percentage: int, Number Of Ratings: int, Number Of Reviews: int, Upc: string, Star Rating: double, Ram: string]


In [121]:
# Add a derived column equal to the sale price plus 1000. withColumn() returns
# a new DataFrame, so the result is assigned back to keep the extra column.
df_pyspark = df_pyspark.withColumn(
    'Sale Price after 2 years',
    df_pyspark['Sale Price'] + 1000,
)
df_pyspark

DataFrame[Product Name: string, Product URL: string, Brand: string, Sale Price: int, Mrp: int, Discount Percentage: int, Number Of Ratings: int, Number Of Reviews: int, Upc: string, Star Rating: double, Ram: string, Sale Price after 2 years: int]

In [122]:
# Preview two rows to check the newly added column.
df_pyspark.show(n=2)

+--------------------+--------------------+-----+----------+-----+-------------------+-----------------+-----------------+----------------+-----------+----+------------------------+
|        Product Name|         Product URL|Brand|Sale Price|  Mrp|Discount Percentage|Number Of Ratings|Number Of Reviews|             Upc|Star Rating| Ram|Sale Price after 2 years|
+--------------------+--------------------+-----+----------+-----+-------------------+-----------------+-----------------+----------------+-----------+----+------------------------+
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     49900|49900|                  0|             3431|              356|MOBEXRGV7EHHTGUH|        4.6|2 GB|                   50900|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900|84900|                  0|             3431|              356|MOBEXRGVAC6TJT4F|        4.6|2 GB|                   85900|
+--------------------+--------------------+-----+----------+-----+-------------------+----

In [123]:
# Drop a single column. drop() returns a new DataFrame; the result is only shown here,
# not assigned, so df_pyspark itself still contains the column.

df_pyspark.drop('Sale Price after 2 years').show()

+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|        Product Name|         Product URL|Brand|Sale Price|   Mrp|Discount Percentage|Number Of Ratings|Number Of Reviews|             Upc|Star Rating| Ram|
+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     49900| 49900|                  0|             3431|              356|MOBEXRGV7EHHTGUH|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVAC6TJT4F|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVGETABXWZ|        4.6|2 GB|
|APPLE iPhone 8 (S...|https://www.flipk...|Apple|   

In [124]:
print(df_pyspark)

# Observe that 'Sale Price after 2 years' is still present: the drop() result was never assigned back

DataFrame[Product Name: string, Product URL: string, Brand: string, Sale Price: int, Mrp: int, Discount Percentage: int, Number Of Ratings: int, Number Of Reviews: int, Upc: string, Star Rating: double, Ram: string, Sale Price after 2 years: int]


In [125]:
# To make the drop stick, assign the DataFrame returned by drop() back to df_pyspark.
# show() only prints and returns None, so it must be called as a separate statement;
# chaining it onto the assignment would leave df_pyspark set to None.

df_pyspark = df_pyspark.drop('Sale Price after 2 years')
df_pyspark.show()


+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|        Product Name|         Product URL|Brand|Sale Price|   Mrp|Discount Percentage|Number Of Ratings|Number Of Reviews|             Upc|Star Rating| Ram|
+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     49900| 49900|                  0|             3431|              356|MOBEXRGV7EHHTGUH|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVAC6TJT4F|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVGETABXWZ|        4.6|2 GB|
|APPLE iPhone 8 (S...|https://www.flipk...|Apple|   

In [129]:
# Rename the 'Product Name' column to 'Product'. withColumnRenamed() returns a
# new DataFrame, so the result is assigned back before previewing it.
df_pyspark = df_pyspark.withColumnRenamed(existing='Product Name', new='Product')
df_pyspark.show()


+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|             Product|         Product URL|Brand|Sale Price|   Mrp|Discount Percentage|Number Of Ratings|Number Of Reviews|             Upc|Star Rating| Ram|
+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     49900| 49900|                  0|             3431|              356|MOBEXRGV7EHHTGUH|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVAC6TJT4F|        4.6|2 GB|
|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVGETABXWZ|        4.6|2 GB|
|APPLE iPhone 8 (S...|https://www.flipk...|Apple|   

In [None]:
## Extra line