In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession

spark=SparkSession.builder\
      .appName("Basics")\
      .getOrCreate()

spark.version

'4.0.1'

#First DataFrame

In [3]:
data=[
    {"name": "Arjun", "Age": 21},
     {"name": "Sara", "Age": 19},
     {"name": "Dinesh", "Age": 20},
]

df=spark.createDataFrame(data)
df.show()

+---+------+
|Age|  name|
+---+------+
| 21| Arjun|
| 19|  Sara|
| 20|Dinesh|
+---+------+



In [4]:
df.printSchema()

root
 |-- Age: long (nullable = true)
 |-- name: string (nullable = true)



In [5]:
df.select("name").show()
df.select("name", "age").show()

+------+
|  name|
+------+
| Arjun|
|  Sara|
|Dinesh|
+------+

+------+---+
|  name|age|
+------+---+
| Arjun| 21|
|  Sara| 19|
|Dinesh| 20|
+------+---+



In [7]:
df.filter(df.Age > 20).show()
df.filter(df.name == 'Sara').show()

+---+-----+
|Age| name|
+---+-----+
| 21|Arjun|
+---+-----+

+---+----+
|Age|name|
+---+----+
| 19|Sara|
+---+----+



In [11]:
from pyspark.sql.functions import col

df2=df.withColumn("age_plus_5", col("Age")+ 5)
df2.show()


+---+------+----------+
|Age|  name|age_plus_5|
+---+------+----------+
| 21| Arjun|        26|
| 19|  Sara|        24|
| 20|Dinesh|        25|
+---+------+----------+



In [12]:
from pyspark.sql.functions import lower

df3=df.withColumn("name_lower", lower(col("name")))
df3.show()

+---+------+----------+
|Age|  name|name_lower|
+---+------+----------+
| 21| Arjun|     arjun|
| 19|  Sara|      sara|
| 20|Dinesh|    dinesh|
+---+------+----------+



In [13]:
df.drop("Age").show()

+------+
|  name|
+------+
| Arjun|
|  Sara|
|Dinesh|
+------+



#Exercise of PySpark

In [14]:
data=[ ("C001", "Arjun", "Hyderabad", 25, 45000, "Electronics"),
 ("C002", "Meera", "Chennai", 32, 52000, "Grocery"),
  ("C003", "Rajesh", "Bangalore", 29, 61000, "Clothing"),
   ("C004", "Priya", "Delhi", 22, 38000, "Grocery"),
    ("C005", "Sanjay", "Mumbai", 35, 72000, "Electronics"),
     ("C006", "Kavya", "Hyderabad", 28, 48000, "Grocery"),
      ("C007", "Imran", "Delhi", 31, 53000, "Clothing"),
       ("C008", "Divya", "Chennai", 27, 45000, "Electronics"),
        ("C009", "Anil", "Bangalore", 40, 85000, "Furniture"),
         ("C010", "Ritu", "Mumbai", 23, 39000, "Clothing"),
          ("C011", "Hari", "Hyderabad", 33, 56000, "Grocery"),
           ("C012", "Sana", "Delhi", 26, 47000, "Electronics"),
            ("C013", "Vikram", "Chennai", 38, 91000, "Furniture"),
             ("C014", "Deepa", "Mumbai", 30, 62000, "Clothing"),
              ("C015", "Asha", "Bangalore", 24, 41000, "Grocery"),
               ("C016", "Kiran", "Delhi", 29, 59000, "Furniture"),
                ("C017", "Farah", "Hyderabad", 36, 70000, "Clothing"),
                 ("C018", "Tarun", "Chennai", 28, 53000, "Furniture"),
                  ("C019", "Nisha", "Mumbai", 21, 35000, "Grocery"),
                   ("C020", "Yusuf", "Bangalore", 34, 76000, "Electronics"),
                    ("C021", "Pooja", "Delhi", 27, 47000, "Clothing"),
                     ("C022", "Zara", "Hyderabad", 32, 58000, "Grocery"),
                      ("C023", "Ajay", "Chennai", 30, 51000, "Furniture"),
                       ("C024", "Reema", "Bangalore", 28, 49000, "Clothing"),
                        ("C025", "Gautam", "Mumbai", 39, 82000, "Furniture"),
                         ("C026", "Swati", "Delhi", 25, 46000, "Electronics"),
                          ("C027", "Mahesh", "Hyderabad", 41, 90000, "Furniture"),
                           ("C028", "Anita", "Chennai", 26, 44000, "Clothing"),
                            ("C029", "Sameer", "Bangalore", 33, 68000, "Electronics"),
                             ("C030", "Leela", "Delhi", 22, 36000, "Grocery") ]




In [15]:
columns = "customer_id", "name", "city", "age", "annual_spend", "category"

In [16]:
print(type(columns))

<class 'tuple'>


In [17]:
df = spark.createDataFrame(data, columns)
df.show()

+-----------+------+---------+---+------------+-----------+
|customer_id|  name|     city|age|annual_spend|   category|
+-----------+------+---------+---+------------+-----------+
|       C001| Arjun|Hyderabad| 25|       45000|Electronics|
|       C002| Meera|  Chennai| 32|       52000|    Grocery|
|       C003|Rajesh|Bangalore| 29|       61000|   Clothing|
|       C004| Priya|    Delhi| 22|       38000|    Grocery|
|       C005|Sanjay|   Mumbai| 35|       72000|Electronics|
|       C006| Kavya|Hyderabad| 28|       48000|    Grocery|
|       C007| Imran|    Delhi| 31|       53000|   Clothing|
|       C008| Divya|  Chennai| 27|       45000|Electronics|
|       C009|  Anil|Bangalore| 40|       85000|  Furniture|
|       C010|  Ritu|   Mumbai| 23|       39000|   Clothing|
|       C011|  Hari|Hyderabad| 33|       56000|    Grocery|
|       C012|  Sana|    Delhi| 26|       47000|Electronics|
|       C013|Vikram|  Chennai| 38|       91000|  Furniture|
|       C014| Deepa|   Mumbai| 30|      

## Show first 10 rows

In [18]:
df.show(10)

+-----------+------+---------+---+------------+-----------+
|customer_id|  name|     city|age|annual_spend|   category|
+-----------+------+---------+---+------------+-----------+
|       C001| Arjun|Hyderabad| 25|       45000|Electronics|
|       C002| Meera|  Chennai| 32|       52000|    Grocery|
|       C003|Rajesh|Bangalore| 29|       61000|   Clothing|
|       C004| Priya|    Delhi| 22|       38000|    Grocery|
|       C005|Sanjay|   Mumbai| 35|       72000|Electronics|
|       C006| Kavya|Hyderabad| 28|       48000|    Grocery|
|       C007| Imran|    Delhi| 31|       53000|   Clothing|
|       C008| Divya|  Chennai| 27|       45000|Electronics|
|       C009|  Anil|Bangalore| 40|       85000|  Furniture|
|       C010|  Ritu|   Mumbai| 23|       39000|   Clothing|
+-----------+------+---------+---+------------+-----------+
only showing top 10 rows


## Show all unique cities

In [20]:
df.select("city").distinct().show()

+---------+
|     city|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



## Show columns of customer_id, name and annual spend only

In [21]:
df.select("customer_id", "name", "annual_spend").show()

+-----------+------+------------+
|customer_id|  name|annual_spend|
+-----------+------+------------+
|       C001| Arjun|       45000|
|       C002| Meera|       52000|
|       C003|Rajesh|       61000|
|       C004| Priya|       38000|
|       C005|Sanjay|       72000|
|       C006| Kavya|       48000|
|       C007| Imran|       53000|
|       C008| Divya|       45000|
|       C009|  Anil|       85000|
|       C010|  Ritu|       39000|
|       C011|  Hari|       56000|
|       C012|  Sana|       47000|
|       C013|Vikram|       91000|
|       C014| Deepa|       62000|
|       C015|  Asha|       41000|
|       C016| Kiran|       59000|
|       C017| Farah|       70000|
|       C018| Tarun|       53000|
|       C019| Nisha|       35000|
|       C020| Yusuf|       76000|
+-----------+------+------------+
only showing top 20 rows


## Filter customers who spend more than 60000 annually

In [22]:
df.filter(df.annual_spend > 6000).show()

+-----------+------+---------+---+------------+-----------+
|customer_id|  name|     city|age|annual_spend|   category|
+-----------+------+---------+---+------------+-----------+
|       C001| Arjun|Hyderabad| 25|       45000|Electronics|
|       C002| Meera|  Chennai| 32|       52000|    Grocery|
|       C003|Rajesh|Bangalore| 29|       61000|   Clothing|
|       C004| Priya|    Delhi| 22|       38000|    Grocery|
|       C005|Sanjay|   Mumbai| 35|       72000|Electronics|
|       C006| Kavya|Hyderabad| 28|       48000|    Grocery|
|       C007| Imran|    Delhi| 31|       53000|   Clothing|
|       C008| Divya|  Chennai| 27|       45000|Electronics|
|       C009|  Anil|Bangalore| 40|       85000|  Furniture|
|       C010|  Ritu|   Mumbai| 23|       39000|   Clothing|
|       C011|  Hari|Hyderabad| 33|       56000|    Grocery|
|       C012|  Sana|    Delhi| 26|       47000|Electronics|
|       C013|Vikram|  Chennai| 38|       91000|  Furniture|
|       C014| Deepa|   Mumbai| 30|      

## Show all customers from Delhi who are younger than 30

In [23]:
df.filter((df.city == "Delhi") & (df.age < 30)).show()

+-----------+-----+-----+---+------------+-----------+
|customer_id| name| city|age|annual_spend|   category|
+-----------+-----+-----+---+------------+-----------+
|       C004|Priya|Delhi| 22|       38000|    Grocery|
|       C012| Sana|Delhi| 26|       47000|Electronics|
|       C016|Kiran|Delhi| 29|       59000|  Furniture|
|       C021|Pooja|Delhi| 27|       47000|   Clothing|
|       C026|Swati|Delhi| 25|       46000|Electronics|
|       C030|Leela|Delhi| 22|       36000|    Grocery|
+-----------+-----+-----+---+------------+-----------+



## Create new column spend_lakh

In [24]:

df = df.withColumn("spend_lakh", df.annual_spend / 100000)

In [25]:
df.show()

+-----------+------+---------+---+------------+-----------+----------+
|customer_id|  name|     city|age|annual_spend|   category|spend_lakh|
+-----------+------+---------+---+------------+-----------+----------+
|       C001| Arjun|Hyderabad| 25|       45000|Electronics|      0.45|
|       C002| Meera|  Chennai| 32|       52000|    Grocery|      0.52|
|       C003|Rajesh|Bangalore| 29|       61000|   Clothing|      0.61|
|       C004| Priya|    Delhi| 22|       38000|    Grocery|      0.38|
|       C005|Sanjay|   Mumbai| 35|       72000|Electronics|      0.72|
|       C006| Kavya|Hyderabad| 28|       48000|    Grocery|      0.48|
|       C007| Imran|    Delhi| 31|       53000|   Clothing|      0.53|
|       C008| Divya|  Chennai| 27|       45000|Electronics|      0.45|
|       C009|  Anil|Bangalore| 40|       85000|  Furniture|      0.85|
|       C010|  Ritu|   Mumbai| 23|       39000|   Clothing|      0.39|
|       C011|  Hari|Hyderabad| 33|       56000|    Grocery|      0.56|
|     

## Create new column customer_type

In [26]:
from pyspark.sql.functions import when

df = df.withColumn("customer_type", when(df.annual_spend > 70000, "Premium").otherwise("Standard"))
df.show()

+-----------+------+---------+---+------------+-----------+----------+-------------+
|customer_id|  name|     city|age|annual_spend|   category|spend_lakh|customer_type|
+-----------+------+---------+---+------------+-----------+----------+-------------+
|       C001| Arjun|Hyderabad| 25|       45000|Electronics|      0.45|     Standard|
|       C002| Meera|  Chennai| 32|       52000|    Grocery|      0.52|     Standard|
|       C003|Rajesh|Bangalore| 29|       61000|   Clothing|      0.61|     Standard|
|       C004| Priya|    Delhi| 22|       38000|    Grocery|      0.38|     Standard|
|       C005|Sanjay|   Mumbai| 35|       72000|Electronics|      0.72|      Premium|
|       C006| Kavya|Hyderabad| 28|       48000|    Grocery|      0.48|     Standard|
|       C007| Imran|    Delhi| 31|       53000|   Clothing|      0.53|     Standard|
|       C008| Divya|  Chennai| 27|       45000|Electronics|      0.45|     Standard|
|       C009|  Anil|Bangalore| 40|       85000|  Furniture|      

## Show customers whose name starts with "A"

In [27]:
df.filter(df.name.startswith("A")).show()

+-----------+-----+---------+---+------------+-----------+----------+-------------+
|customer_id| name|     city|age|annual_spend|   category|spend_lakh|customer_type|
+-----------+-----+---------+---+------------+-----------+----------+-------------+
|       C001|Arjun|Hyderabad| 25|       45000|Electronics|      0.45|     Standard|
|       C009| Anil|Bangalore| 40|       85000|  Furniture|      0.85|      Premium|
|       C015| Asha|Bangalore| 24|       41000|    Grocery|      0.41|     Standard|
|       C023| Ajay|  Chennai| 30|       51000|  Furniture|      0.51|     Standard|
|       C028|Anita|  Chennai| 26|       44000|   Clothing|      0.44|     Standard|
+-----------+-----+---------+---+------------+-----------+----------+-------------+



## Filter customers where category is Clothing or Electronics

In [31]:
df.filter((df.category == "Clothing") | (df.category=="Electronics")).show()

+-----------+------+---------+---+------------+-----------+----------+-------------+
|customer_id|  name|     city|age|annual_spend|   category|spend_lakh|customer_type|
+-----------+------+---------+---+------------+-----------+----------+-------------+
|       C001| Arjun|Hyderabad| 25|       45000|Electronics|      0.45|     Standard|
|       C003|Rajesh|Bangalore| 29|       61000|   Clothing|      0.61|     Standard|
|       C005|Sanjay|   Mumbai| 35|       72000|Electronics|      0.72|      Premium|
|       C007| Imran|    Delhi| 31|       53000|   Clothing|      0.53|     Standard|
|       C008| Divya|  Chennai| 27|       45000|Electronics|      0.45|     Standard|
|       C010|  Ritu|   Mumbai| 23|       39000|   Clothing|      0.39|     Standard|
|       C012|  Sana|    Delhi| 26|       47000|Electronics|      0.47|     Standard|
|       C014| Deepa|   Mumbai| 30|       62000|   Clothing|      0.62|     Standard|
|       C017| Farah|Hyderabad| 36|       70000|   Clothing|      

## Convert city name to uppercase

In [33]:
from pyspark.sql.functions import upper

df=df.withColumn("city_upper", upper(df.city))
df.show()

+-----------+------+---------+---+------------+-----------+----------+-------------+----------+
|customer_id|  name|     city|age|annual_spend|   category|spend_lakh|customer_type|city_upper|
+-----------+------+---------+---+------------+-----------+----------+-------------+----------+
|       C001| Arjun|Hyderabad| 25|       45000|Electronics|      0.45|     Standard| HYDERABAD|
|       C002| Meera|  Chennai| 32|       52000|    Grocery|      0.52|     Standard|   CHENNAI|
|       C003|Rajesh|Bangalore| 29|       61000|   Clothing|      0.61|     Standard| BANGALORE|
|       C004| Priya|    Delhi| 22|       38000|    Grocery|      0.38|     Standard|     DELHI|
|       C005|Sanjay|   Mumbai| 35|       72000|Electronics|      0.72|      Premium|    MUMBAI|
|       C006| Kavya|Hyderabad| 28|       48000|    Grocery|      0.48|     Standard| HYDERABAD|
|       C007| Imran|    Delhi| 31|       53000|   Clothing|      0.53|     Standard|     DELHI|
|       C008| Divya|  Chennai| 27|      

## Remove the category column

In [34]:
df = df.drop("category")
df.show()

+-----------+------+---------+---+------------+----------+-------------+----------+
|customer_id|  name|     city|age|annual_spend|spend_lakh|customer_type|city_upper|
+-----------+------+---------+---+------------+----------+-------------+----------+
|       C001| Arjun|Hyderabad| 25|       45000|      0.45|     Standard| HYDERABAD|
|       C002| Meera|  Chennai| 32|       52000|      0.52|     Standard|   CHENNAI|
|       C003|Rajesh|Bangalore| 29|       61000|      0.61|     Standard| BANGALORE|
|       C004| Priya|    Delhi| 22|       38000|      0.38|     Standard|     DELHI|
|       C005|Sanjay|   Mumbai| 35|       72000|      0.72|      Premium|    MUMBAI|
|       C006| Kavya|Hyderabad| 28|       48000|      0.48|     Standard| HYDERABAD|
|       C007| Imran|    Delhi| 31|       53000|      0.53|     Standard|     DELHI|
|       C008| Divya|  Chennai| 27|       45000|      0.45|     Standard|   CHENNAI|
|       C009|  Anil|Bangalore| 40|       85000|      0.85|      Premium| BAN

## Sort customers by age in descending order

In [35]:
df.orderBy(df.age.desc()).show()

+-----------+------+---------+---+------------+----------+-------------+----------+
|customer_id|  name|     city|age|annual_spend|spend_lakh|customer_type|city_upper|
+-----------+------+---------+---+------------+----------+-------------+----------+
|       C027|Mahesh|Hyderabad| 41|       90000|       0.9|      Premium| HYDERABAD|
|       C009|  Anil|Bangalore| 40|       85000|      0.85|      Premium| BANGALORE|
|       C025|Gautam|   Mumbai| 39|       82000|      0.82|      Premium|    MUMBAI|
|       C013|Vikram|  Chennai| 38|       91000|      0.91|      Premium|   CHENNAI|
|       C017| Farah|Hyderabad| 36|       70000|       0.7|     Standard| HYDERABAD|
|       C005|Sanjay|   Mumbai| 35|       72000|      0.72|      Premium|    MUMBAI|
|       C020| Yusuf|Bangalore| 34|       76000|      0.76|      Premium| BANGALORE|
|       C011|  Hari|Hyderabad| 33|       56000|      0.56|     Standard| HYDERABAD|
|       C029|Sameer|Bangalore| 33|       68000|      0.68|     Standard| BAN

## Customers younger than average age

In [36]:
from pyspark.sql.functions import avg

avg_age = df.select(avg("age")).collect()[0][0]
df.filter(df.age < avg_age).show()

+-----------+------+---------+---+------------+----------+-------------+----------+
|customer_id|  name|     city|age|annual_spend|spend_lakh|customer_type|city_upper|
+-----------+------+---------+---+------------+----------+-------------+----------+
|       C001| Arjun|Hyderabad| 25|       45000|      0.45|     Standard| HYDERABAD|
|       C003|Rajesh|Bangalore| 29|       61000|      0.61|     Standard| BANGALORE|
|       C004| Priya|    Delhi| 22|       38000|      0.38|     Standard|     DELHI|
|       C006| Kavya|Hyderabad| 28|       48000|      0.48|     Standard| HYDERABAD|
|       C008| Divya|  Chennai| 27|       45000|      0.45|     Standard|   CHENNAI|
|       C010|  Ritu|   Mumbai| 23|       39000|      0.39|     Standard|    MUMBAI|
|       C012|  Sana|    Delhi| 26|       47000|      0.47|     Standard|     DELHI|
|       C015|  Asha|Bangalore| 24|       41000|      0.41|     Standard| BANGALORE|
|       C016| Kiran|    Delhi| 29|       59000|      0.59|     Standard|    

## Top 5 highest spending customers

In [37]:
df.orderBy(df.annual_spend.desc()).show(5)

+-----------+------+---------+---+------------+----------+-------------+----------+
|customer_id|  name|     city|age|annual_spend|spend_lakh|customer_type|city_upper|
+-----------+------+---------+---+------------+----------+-------------+----------+
|       C013|Vikram|  Chennai| 38|       91000|      0.91|      Premium|   CHENNAI|
|       C027|Mahesh|Hyderabad| 41|       90000|       0.9|      Premium| HYDERABAD|
|       C009|  Anil|Bangalore| 40|       85000|      0.85|      Premium| BANGALORE|
|       C025|Gautam|   Mumbai| 39|       82000|      0.82|      Premium|    MUMBAI|
|       C020| Yusuf|Bangalore| 34|       76000|      0.76|      Premium| BANGALORE|
+-----------+------+---------+---+------------+----------+-------------+----------+
only showing top 5 rows


## New DataFrame with only Mumbai customers

In [38]:
mumbai_df = df.filter(df.city == "Mumbai")
mumbai_df.show()

+-----------+------+------+---+------------+----------+-------------+----------+
|customer_id|  name|  city|age|annual_spend|spend_lakh|customer_type|city_upper|
+-----------+------+------+---+------------+----------+-------------+----------+
|       C005|Sanjay|Mumbai| 35|       72000|      0.72|      Premium|    MUMBAI|
|       C010|  Ritu|Mumbai| 23|       39000|      0.39|     Standard|    MUMBAI|
|       C014| Deepa|Mumbai| 30|       62000|      0.62|     Standard|    MUMBAI|
|       C019| Nisha|Mumbai| 21|       35000|      0.35|     Standard|    MUMBAI|
|       C025|Gautam|Mumbai| 39|       82000|      0.82|      Premium|    MUMBAI|
+-----------+------+------+---+------------+----------+-------------+----------+



## Extract first letter of each name

In [39]:
from pyspark.sql.functions import substring

df = df.withColumn("first_letter", substring("name", 1, 1))
df.show()

+-----------+------+---------+---+------------+----------+-------------+----------+------------+
|customer_id|  name|     city|age|annual_spend|spend_lakh|customer_type|city_upper|first_letter|
+-----------+------+---------+---+------------+----------+-------------+----------+------------+
|       C001| Arjun|Hyderabad| 25|       45000|      0.45|     Standard| HYDERABAD|           A|
|       C002| Meera|  Chennai| 32|       52000|      0.52|     Standard|   CHENNAI|           M|
|       C003|Rajesh|Bangalore| 29|       61000|      0.61|     Standard| BANGALORE|           R|
|       C004| Priya|    Delhi| 22|       38000|      0.38|     Standard|     DELHI|           P|
|       C005|Sanjay|   Mumbai| 35|       72000|      0.72|      Premium|    MUMBAI|           S|
|       C006| Kavya|Hyderabad| 28|       48000|      0.48|     Standard| HYDERABAD|           K|
|       C007| Imran|    Delhi| 31|       53000|      0.53|     Standard|     DELHI|           I|
|       C008| Divya|  Chennai|

## Mask customer_id (C001 â†’ ***01)

In [40]:
from pyspark.sql.functions import expr

df = df.withColumn("masked_id", expr("concat('***', substring(customer_id, 2, 2))"))
df.show()


+-----------+------+---------+---+------------+----------+-------------+----------+------------+---------+
|customer_id|  name|     city|age|annual_spend|spend_lakh|customer_type|city_upper|first_letter|masked_id|
+-----------+------+---------+---+------------+----------+-------------+----------+------------+---------+
|       C001| Arjun|Hyderabad| 25|       45000|      0.45|     Standard| HYDERABAD|           A|    ***00|
|       C002| Meera|  Chennai| 32|       52000|      0.52|     Standard|   CHENNAI|           M|    ***00|
|       C003|Rajesh|Bangalore| 29|       61000|      0.61|     Standard| BANGALORE|           R|    ***00|
|       C004| Priya|    Delhi| 22|       38000|      0.38|     Standard|     DELHI|           P|    ***00|
|       C005|Sanjay|   Mumbai| 35|       72000|      0.72|      Premium|    MUMBAI|           S|    ***00|
|       C006| Kavya|Hyderabad| 28|       48000|      0.48|     Standard| HYDERABAD|           K|    ***00|
|       C007| Imran|    Delhi| 31|   

##  Create boolean column is_senior (age > 35)

In [41]:
df = df.withColumn("is_senior", df.age > 35)
df.show()

+-----------+------+---------+---+------------+----------+-------------+----------+------------+---------+---------+
|customer_id|  name|     city|age|annual_spend|spend_lakh|customer_type|city_upper|first_letter|masked_id|is_senior|
+-----------+------+---------+---+------------+----------+-------------+----------+------------+---------+---------+
|       C001| Arjun|Hyderabad| 25|       45000|      0.45|     Standard| HYDERABAD|           A|    ***00|    false|
|       C002| Meera|  Chennai| 32|       52000|      0.52|     Standard|   CHENNAI|           M|    ***00|    false|
|       C003|Rajesh|Bangalore| 29|       61000|      0.61|     Standard| BANGALORE|           R|    ***00|    false|
|       C004| Priya|    Delhi| 22|       38000|      0.38|     Standard|     DELHI|           P|    ***00|    false|
|       C005|Sanjay|   Mumbai| 35|       72000|      0.72|      Premium|    MUMBAI|           S|    ***00|    false|
|       C006| Kavya|Hyderabad| 28|       48000|      0.48|     S

## Count customers per city

In [42]:
df.groupBy("city").count().show()

+---------+-----+
|     city|count|
+---------+-----+
|Bangalore|    6|
|  Chennai|    6|
|   Mumbai|    5|
|    Delhi|    7|
|Hyderabad|    6|
+---------+-----+

