#### Aadhaar data Analysis

##### Creating a SparkSession

In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("Aadhaar Analysis").getOrCreate()

##### Setting up Schema and reading data from csvFile

In [2]:
schemaStruct = StructType([StructField("date",DateType(),True)
                          ,StructField("registrar", StringType(),True)
                          ,StructField("agency",StringType(),True)
                          ,StructField("state",StringType(),True)
                          ,StructField("district",StringType(),True)
                          ,StructField("subdistrict",StringType(),True)
                          ,StructField("pincode",LongType(),True)
                          ,StructField("gender",StringType(),True)
                          ,StructField("age",IntegerType(),True)
                          ,StructField("generated",IntegerType(),True)
                          ,StructField("rejected",IntegerType(),True)
                          ,StructField("mobile",IntegerType(),True)
                          ,StructField("email",IntegerType(),True)])

csvFile = "./data/aadhaar_data.csv"

data = spark.read.format("csv").schema(schemaStruct).option("dateFormat","yyyyMMdd").load(csvFile)

##### Check the Schema

In [3]:
data.printSchema()

root
 |-- date: date (nullable = true)
 |-- registrar: string (nullable = true)
 |-- agency: string (nullable = true)
 |-- state: string (nullable = true)
 |-- district: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- pincode: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- generated: integer (nullable = true)
 |-- rejected: integer (nullable = true)
 |-- mobile: integer (nullable = true)
 |-- email: integer (nullable = true)



##### Check the data

In [4]:
data.show()

+----------+--------------+--------------------+-------------+--------------+--------------+-------+------+---+---------+--------+------+-----+
|      date|     registrar|              agency|        state|      district|   subdistrict|pincode|gender|age|generated|rejected|mobile|email|
+----------+--------------+--------------------+-------------+--------------+--------------+-------+------+---+---------+--------+------+-----+
|2015-04-20|Allahabad Bank|A-Onerealtors Pvt...|        Delhi|   South Delhi|Defence Colony| 110025|     F| 49|        1|       0|     0|    1|
|2015-04-20|Allahabad Bank|A-Onerealtors Pvt...|        Delhi|   South Delhi|Defence Colony| 110025|     F| 65|        1|       0|     0|    0|
|2015-04-20|Allahabad Bank|A-Onerealtors Pvt...|        Delhi|   South Delhi|Defence Colony| 110025|     M| 42|        1|       0|     0|    1|
|2015-04-20|Allahabad Bank|A-Onerealtors Pvt...|        Delhi|   South Delhi|Defence Colony| 110025|     M| 61|        1|       0|     0

##### KPI-1  

######  1. View/result of the top 25 rows from each individual store

In [5]:
data.groupBy("registrar").agg(expr("max(generated)")).orderBy("max(generated)",ascending=False).show(25,truncate=False)

+-------------------------------------------------+--------------+
|registrar                                        |max(generated)|
+-------------------------------------------------+--------------+
|Jharkhand                                        |371           |
|Govt of Maharashtra                              |201           |
|Registrar General India BEL2                     |167           |
|DENA BANK                                        |149           |
|Registrar General of India ITI                   |147           |
|Registrar General India - BEL                    |142           |
|NSDL e-Governance Infrastructure Limited         |129           |
|Canara Bank                                      |111           |
|Union Bank                                       |101           |
|Registrar General India ECIL                     |98            |
|CSC e-Governance Services India Limited          |95            |
|Information Technology & Communication Department|79         

In [6]:
data.createOrReplaceTempView("aadhardata")

##### KPI-2
###### 1. Find the count and names of registrars in the table.

In [7]:
data.select("registrar").distinct().count()

69

In [8]:
spark.sql("""
SELECT count(distinct registrar)
FROM aadhardata""").show()

+-------------------------+
|count(DISTINCT registrar)|
+-------------------------+
|                       69|
+-------------------------+



In [9]:
data.select("registrar").distinct().show(70,truncate=0)

+-----------------------------------------------------------+
|registrar                                                  |
+-----------------------------------------------------------+
|Govt of Andhra Pradesh                                     |
|UT Of Daman and Diu                                        |
|Govt of Madhya Pradesh                                     |
|Punjab National Bank                                       |
|NSDL e-Governance Infrastructure Limited                   |
|IDBI Bank ltd                                              |
|Delhi - North DC                                           |
|Govt of Maharashtra                                        |
|Indiapost                                                  |
|FCS Govt of Punjab                                         |
|Govt of Sikkim - Dept of Econo                             |
|Delhi - NE DC                                              |
|FCR Govt of Haryana                                        |
|IGNOU  

In [10]:
spark.sql("""
SELECT distinct registrar
FROM aadhardata""").show(70,truncate=0)

+-----------------------------------------------------------+
|registrar                                                  |
+-----------------------------------------------------------+
|Govt of Andhra Pradesh                                     |
|UT Of Daman and Diu                                        |
|Govt of Madhya Pradesh                                     |
|Punjab National Bank                                       |
|NSDL e-Governance Infrastructure Limited                   |
|IDBI Bank ltd                                              |
|Delhi - North DC                                           |
|Govt of Maharashtra                                        |
|Indiapost                                                  |
|FCS Govt of Punjab                                         |
|Govt of Sikkim - Dept of Econo                             |
|Delhi - NE DC                                              |
|FCR Govt of Haryana                                        |
|IGNOU  

##### 2.Find the number of states, districts in each state and sub-districts in each district.

In [11]:
(data
 .groupBy("state")
 .agg(expr("count(distinct district) as no_district"), expr("count(distinct subdistrict) as no_subdistrict"))
 .show(truncate=0)
)

+----------------------+-----------+--------------+
|state                 |no_district|no_subdistrict|
+----------------------+-----------+--------------+
|Nagaland              |9          |51            |
|Karnataka             |44         |217           |
|Odisha                |33         |357           |
|Kerala                |15         |100           |
|Tamil Nadu            |40         |238           |
|Chhattisgarh          |34         |148           |
|Andhra Pradesh        |29         |1110          |
|Lakshadweep           |1          |5             |
|Madhya Pradesh        |50         |273           |
|Punjab                |25         |99            |
|Manipur               |9          |43            |
|Goa                   |7          |20            |
|Mizoram               |7          |13            |
|Dadra and Nagar Haveli|1          |2             |
|Himachal Pradesh      |13         |206           |
|Puducherry            |4          |15            |
|Haryana    

In [12]:
spark.sql("""
SELECT state
     , count(distinct district) 
     , count(distinct subdistrict)
  FROM aadhardata
  GROUP BY state""").show(truncate=0)

+----------------------+------------------------+---------------------------+
|state                 |count(DISTINCT district)|count(DISTINCT subdistrict)|
+----------------------+------------------------+---------------------------+
|Nagaland              |9                       |51                         |
|Karnataka             |44                      |217                        |
|Odisha                |33                      |357                        |
|Kerala                |15                      |100                        |
|Tamil Nadu            |40                      |238                        |
|Chhattisgarh          |34                      |148                        |
|Andhra Pradesh        |29                      |1110                       |
|Lakshadweep           |1                       |5                          |
|Madhya Pradesh        |50                      |273                        |
|Punjab                |25                      |99             

##### 3.Find the number of males and females in each state from the table.

In [13]:
data.groupBy("state","gender").agg(sum("generated")).orderBy("state").show()

+--------------------+------+--------------+
|               state|gender|sum(generated)|
+--------------------+------+--------------+
|Andaman and Nicob...|     F|           498|
|Andaman and Nicob...|     M|           138|
|      Andhra Pradesh|     M|        122779|
|      Andhra Pradesh|     T|            24|
|      Andhra Pradesh|     F|        147252|
|   Arunachal Pradesh|     M|          1056|
|   Arunachal Pradesh|     F|           122|
|               Assam|     M|           694|
|               Assam|     F|           197|
|               Bihar|     M|        108245|
|               Bihar|     F|         99915|
|               Bihar|     T|             1|
|          Chandigarh|     M|          1043|
|          Chandigarh|     F|           935|
|        Chhattisgarh|     T|             2|
|        Chhattisgarh|     F|         37529|
|        Chhattisgarh|     M|         22233|
|Dadra and Nagar H...|     F|            38|
|Dadra and Nagar H...|     M|            70|
|       Da

In [14]:
spark.sql("""
SELECT state
     , gender
     , count(generated)
  FROM aadhardata
 GROUP BY state, gender
 ORDER BY state""").show()

+--------------------+------+----------------+
|               state|gender|count(generated)|
+--------------------+------+----------------+
|Andaman and Nicob...|     F|             257|
|Andaman and Nicob...|     M|             143|
|      Andhra Pradesh|     M|           74748|
|      Andhra Pradesh|     T|              40|
|      Andhra Pradesh|     F|           80768|
|   Arunachal Pradesh|     M|             273|
|   Arunachal Pradesh|     F|              89|
|               Assam|     M|             528|
|               Assam|     F|             161|
|               Bihar|     M|           59173|
|               Bihar|     F|           54227|
|               Bihar|     T|               1|
|          Chandigarh|     M|            1011|
|          Chandigarh|     F|             906|
|        Chhattisgarh|     T|               2|
|        Chhattisgarh|     F|           13349|
|        Chhattisgarh|     M|           10189|
|Dadra and Nagar H...|     F|              41|
|Dadra and Na

##### 4.Find out the names of private agencies for each state

In [15]:
data.select("state","agency").distinct().orderBy("state").show(truncate=False)

+---------------------------+----------------------------------------+
|state                      |agency                                  |
+---------------------------+----------------------------------------+
|Andaman and Nicobar Islands|SREEVEN INFOCOM LIMITED                 |
|Andaman and Nicobar Islands|Madras Security Printers Ltd            |
|Andaman and Nicobar Islands|DATASOFT COMPUTER SERVICES(P)           |
|Andaman and Nicobar Islands|Akshaya                                 |
|Andaman and Nicobar Islands|Karvy Data Management Services          |
|Andaman and Nicobar Islands|A3 Logics  India  Ltd                   |
|Andaman and Nicobar Islands|Chinar Construction Company Prime agency|
|Andaman and Nicobar Islands|India Computer Technology               |
|Andhra Pradesh             |Origin ITFS Pvt Ltd                     |
|Andhra Pradesh             |Binary Systems                          |
|Andhra Pradesh             |Krishna Infotech                        |
|Andhr

In [16]:
spark.sql("""
SELECT distinct state
     , agency
  FROM aadhardata
 ORDER BY state, agency
  """).show(truncate=False)

+---------------------------+----------------------------------------+
|state                      |agency                                  |
+---------------------------+----------------------------------------+
|Andaman and Nicobar Islands|A3 Logics  India  Ltd                   |
|Andaman and Nicobar Islands|Akshaya                                 |
|Andaman and Nicobar Islands|Chinar Construction Company Prime agency|
|Andaman and Nicobar Islands|DATASOFT COMPUTER SERVICES(P)           |
|Andaman and Nicobar Islands|India Computer Technology               |
|Andaman and Nicobar Islands|Karvy Data Management Services          |
|Andaman and Nicobar Islands|Madras Security Printers Ltd            |
|Andaman and Nicobar Islands|SREEVEN INFOCOM LIMITED                 |
|Andhra Pradesh             |4G IDENTITY SOLUTIONS                   |
|Andhra Pradesh             |4G INFORMATICS                          |
|Andhra Pradesh             |77 Infosystems Pvt Ltd                  |
|Andhr

##### KPI-3

###### 1. Find top 3 states generating most number of Aadhaar cards?

In [17]:
data.groupBy("state").agg(sum("generated")).orderBy("sum(generated)",ascending=False).show(3, False)

+--------------+--------------+
|state         |sum(generated)|
+--------------+--------------+
|Maharashtra   |951201        |
|Uttar Pradesh |385463        |
|Andhra Pradesh|270055        |
+--------------+--------------+
only showing top 3 rows



In [18]:
spark.sql("""
SELECT state
     , sum(generated)
  FROM aadhardata
 GROUP BY 1
 ORDER BY 2 DESC LIMIT 3""").show()

+--------------+--------------+
|         state|sum(generated)|
+--------------+--------------+
|   Maharashtra|        951201|
| Uttar Pradesh|        385463|
|Andhra Pradesh|        270055|
+--------------+--------------+



##### 2. Find top 3 private agencies generating the most number of Aadhar cards?

In [19]:
data.groupBy("agency").agg(sum("generated")).orderBy("sum(generated)",ascending=False).limit(3).show(truncate=False)

+-----------------------------+--------------+
|agency                       |sum(generated)|
+-----------------------------+--------------+
|Wipro Ltd                    |745751        |
|Vakrangee Softwares Limited  |225273        |
|Swathy Smartcards Hi-Tech Pvt|211790        |
+-----------------------------+--------------+



In [20]:
spark.sql("""
SELECT agency
     , sum(generated)
  FROM aadhardata
 GROUP BY 1
 ORDER BY 2 DESC LIMIT 3""").show()

+--------------------+--------------+
|              agency|sum(generated)|
+--------------------+--------------+
|           Wipro Ltd|        745751|
|Vakrangee Softwar...|        225273|
|Swathy Smartcards...|        211790|
+--------------------+--------------+



##### 3. Find the number of residents providing email, mobile number? (Hint: consider non-zero values.)

In [21]:
data.selectExpr("sum(mobile)","sum(email)").show()

+-----------+----------+
|sum(mobile)|sum(email)|
+-----------+----------+
|      56504|   1424434|
+-----------+----------+



In [22]:
spark.sql("""
SELECT sum(mobile)
     , sum(email)
  FROM aadhardata""").show()

+-----------+----------+
|sum(mobile)|sum(email)|
+-----------+----------+
|      56504|   1424434|
+-----------+----------+



##### 4. Find top 3 districts where enrolment numbers are maximum?

In [23]:
data.groupBy("district").agg(sum("generated")).orderBy("sum(generated)",ascending=False).limit(3).show(truncate=0)

+--------+--------------+
|district|sum(generated)|
+--------+--------------+
|Pune    |140851        |
|Mumbai  |114755        |
|Nagpur  |99296         |
+--------+--------------+



In [24]:
spark.sql("""
SELECT district
     , sum(generated)
  FROM aadhardata
 GROUP BY district
 ORDER BY 2 desc
 LIMIT 3""").show(truncate=0)

+--------+--------------+
|district|sum(generated)|
+--------+--------------+
|Pune    |140851        |
|Mumbai  |114755        |
|Nagpur  |99296         |
+--------+--------------+



##### 5. Find the no. of Aadhaar cards generated in each state?

In [25]:
data.groupBy("state").agg(sum("generated")).show(truncate=0)

+----------------------+--------------+
|state                 |sum(generated)|
+----------------------+--------------+
|Nagaland              |631           |
|Karnataka             |146013        |
|Odisha                |34529         |
|Kerala                |150893        |
|Tamil Nadu            |131735        |
|Chhattisgarh          |59764         |
|Andhra Pradesh        |270055        |
|Lakshadweep           |15            |
|Madhya Pradesh        |171324        |
|Punjab                |65644         |
|Manipur               |47386         |
|Goa                   |7979          |
|Mizoram               |323           |
|Dadra and Nagar Haveli|108           |
|Himachal Pradesh      |33844         |
|Puducherry            |568           |
|Haryana               |95350         |
|Jammu and Kashmir     |17355         |
|Jharkhand             |168855        |
|Arunachal Pradesh     |1178          |
+----------------------+--------------+
only showing top 20 rows



In [26]:
spark.sql("""
SELECT state
     , sum(generated)
  FROM aadhardata
 GROUP BY 1""").show(truncate=0)

+----------------------+--------------+
|state                 |sum(generated)|
+----------------------+--------------+
|Nagaland              |631           |
|Karnataka             |146013        |
|Odisha                |34529         |
|Kerala                |150893        |
|Tamil Nadu            |131735        |
|Chhattisgarh          |59764         |
|Andhra Pradesh        |270055        |
|Lakshadweep           |15            |
|Madhya Pradesh        |171324        |
|Punjab                |65644         |
|Manipur               |47386         |
|Goa                   |7979          |
|Mizoram               |323           |
|Dadra and Nagar Haveli|108           |
|Himachal Pradesh      |33844         |
|Puducherry            |568           |
|Haryana               |95350         |
|Jammu and Kashmir     |17355         |
|Jharkhand             |168855        |
|Arunachal Pradesh     |1178          |
+----------------------+--------------+
only showing top 20 rows



##### KPI-4
##### 1. Write a command to see the correlation between “age” and “mobile_number”? (Hint: Consider the percentage of people who have provided the mobile number out of the total applicants)

In [27]:
(data
 .withColumn("mobPercent",expr("mobile/generated"))
 .corr("age","mobPercent")
)

-0.024500947199046438

In [28]:
spark.sql("""
SELECT corr(age,mobPercent)
FROM(
SELECT *,mobile/generated as mobPercent
FROM aadhardata
)A""").show()

+-------------------------------------+
|corr(CAST(age AS DOUBLE), mobPercent)|
+-------------------------------------+
|                 -0.02866583131236...|
+-------------------------------------+



##### 2. Find the number of unique pincodes in the data?

In [29]:
data.select("pincode").distinct().count()

18011

In [30]:
spark.sql("""
SELECT count(distinct pincode)
FROM aadhardata
""").show()

+-----------------------+
|count(DISTINCT pincode)|
+-----------------------+
|                  18010|
+-----------------------+



##### 3. Find the number of Aadhaar registrations rejected in Uttar Pradesh and Maharashtra?

In [31]:
data.filter(expr("state = 'Uttar Pradesh' or state = 'Maharashtra'")).groupby("state").agg(sum("rejected")).show(truncate=0)

+-------------+-------------+
|state        |sum(rejected)|
+-------------+-------------+
|Maharashtra  |45704        |
|Uttar Pradesh|24752        |
+-------------+-------------+



In [32]:
spark.sql("""
SELECT state
     , sum(rejected)
  FROM aadhardata
 WHERE state IN('Uttar Pradesh','Maharashtra')
 GROUP BY 1""").show(truncate=False)

+-------------+-------------+
|state        |sum(rejected)|
+-------------+-------------+
|Maharashtra  |45704        |
|Uttar Pradesh|24752        |
+-------------+-------------+



##### KPI-5
##### 1. The top 3 states where the percentage of Aadhaar cards being generated for males is the highest.

In [33]:
dataAgg = (data
           .groupBy("state","gender")
           .agg(expr("sum(generated) as total_generated"),expr("sum(rejected) as total_rejected"))
          )

In [35]:
cond = [data["state"] == dataAgg["state"], data["gender"] == data["gender"]]

dataDF = data.join(dataAgg,cond).drop(dataAgg["state"]).drop(dataAgg["gender"])

In [44]:
(dataDF
 .filter("gender = 'M'")
 .groupBy("state","total_generated","total_rejected")
 .agg(sum("generated").alias("male_generated"))
 .withColumn("malePercent",round(expr("male_generated/(total_generated+total_rejected) *100"),2))
 .select("state","malePercent")
 .orderBy("malePercent",ascending=False)
 .show(3)
)

+-----------------+-----------+
|            state|malePercent|
+-----------------+-----------+
|      Lakshadweep|      100.0|
|          Manipur|      99.93|
|Arunachal Pradesh|       98.6|
+-----------------+-----------+
only showing top 3 rows



In [45]:
spark.sql("""
CREATE TEMP VIEW aadharconsol AS
SELECT state
     , gender
     , sum(generated) as total_generated
     , sum(rejected) as total_rejected
  FROM aadhardata
 GROUP BY 1,2""")

DataFrame[]

In [51]:
spark.sql("""
SELECT A.state
     , round(male_total/(total_generated+total_rejected) * 100 ,2) as malePercent
  FROM (SELECT state
             , gender
             , sum(generated) as male_total
          FROM aadhardata
         WHERE gender = 'M'
         GROUP BY 1, 2) A
 JOIN aadharconsol B
   ON A.state = B.state
  AND A.gender = B.gender 
ORDER BY 2 DESC LIMIT 3""").show()

+-----------------+-----------+
|            state|malePercent|
+-----------------+-----------+
|      Lakshadweep|      100.0|
|          Manipur|      99.93|
|Arunachal Pradesh|       98.6|
+-----------------+-----------+



##### 2. In each of these 3 states, identify the top 3 districts where the percentage of Aadhaar cards being rejected for females is the highest.

In [52]:
datadistAgg = (data.groupBy("state","district","gender")
               .agg(expr("sum(generated) as total_generated"),expr("sum(rejected) as total_rejected")))

In [53]:
cond = [data["state"] == datadistAgg["state"], data["district"] == datadistAgg["district"], data["gender"] == datadistAgg["gender"]]

In [54]:
datadistDF = data.join(datadistAgg,cond).drop(datadistAgg["state"]).drop(datadistAgg["district"]).drop(datadistAgg["gender"])

In [73]:
(datadistDF
.filter("state in('Manipur','Arunachal Pradesh','Nagaland') and gender='F'")
.groupBy("state","district","total_rejected","total_generated") 
.agg(expr("sum(rejected) as female_rejected"))
.withColumn("femalePercent",round(expr("female_rejected/(total_rejected+total_generated) * 100"),2))
.select("*",rank().over(Window.partitionBy(datadistDF["state"]).orderBy(col("femalePercent").desc())).alias("rank"))
.filter(col("rank")<=3)
.select("state","district","femalePercent") 
.show(9,False) 
) 

+-----------------+---------------+-------------+
|state            |district       |femalePercent|
+-----------------+---------------+-------------+
|Nagaland         |Kohima         |94.32        |
|Nagaland         |Phek           |90.48        |
|Nagaland         |Mokokchung     |87.5         |
|Manipur          |Chandel        |50.0         |
|Manipur          |Churachandpur  |40.0         |
|Manipur          |Imphal East    |5.26         |
|Arunachal Pradesh|Upper Subansiri|100.0        |
|Arunachal Pradesh|Lohit          |5.17         |
|Arunachal Pradesh|Kurung Kumey   |3.33         |
+-----------------+---------------+-------------+



In [79]:
spark.sql("""
CREATE OR REPLACE TEMP VIEW datadistconsol As
SELECT state
     , district
     , gender
     , sum(generated) as total_generated
     , sum(rejected) as total_rejected
  FROM aadhardata
 GROUP BY 1,2,3""")

DataFrame[]

In [100]:
spark.sql("""
SELECT state
     , district
     , femalePercent
FROM(
SELECT state
     , district
     , femalePercent
     , rank() over(partition by state order by femalePercent desc) AS rnk
FROM(
SELECT A.state
     , A.district
     , round(female_total/(total_generated+total_rejected) * 100 ,2) as femalePercent
  FROM (SELECT state
             , district
             , gender
             , sum(rejected) as female_total
          FROM aadhardata
         WHERE gender = 'F'
           AND state in('Manipur','Arunachal Pradesh','Nagaland')
         GROUP BY 1, 2,3) A
 JOIN datadistconsol B
   ON A.state = B.state
  AND A.gender = B.gender 
  AND A.district = B.district
WHERE A.state in('Manipur','Arunachal Pradesh','Nagaland')
 ORDER BY 2 DESC)B)c
 WHERE rnk <= 3
""").show()

+-----------------+---------------+-------------+
|            state|       district|femalePercent|
+-----------------+---------------+-------------+
|         Nagaland|         Kohima|        94.32|
|         Nagaland|           Phek|        90.48|
|         Nagaland|     Mokokchung|         87.5|
|          Manipur|        Chandel|         50.0|
|          Manipur|  Churachandpur|         40.0|
|          Manipur|    Imphal East|         5.26|
|Arunachal Pradesh|Upper Subansiri|        100.0|
|Arunachal Pradesh|          Lohit|         5.17|
|Arunachal Pradesh|   Kurung Kumey|         3.33|
+-----------------+---------------+-------------+



##### 3. The top 3 states where the percentage of Aadhaar cards being generated for females is the highest.

In [102]:
(dataDF
 .filter("gender = 'F'")
 .groupBy("state","total_generated","total_rejected")
 .agg(sum("generated").alias("female_generated"))
 .withColumn("femalePercent",round(expr("female_generated/(total_generated+total_rejected) *100"),2))
 .select("state","femalePercent")
 .orderBy("femalePercent",ascending=False)
 .show(3)
)

+---------+-------------+
|    state|femalePercent|
+---------+-------------+
|Meghalaya|        100.0|
|  Manipur|        99.15|
|    Assam|        97.52|
+---------+-------------+
only showing top 3 rows



In [104]:
spark.sql("""
SELECT A.state
     , round(female_total/(total_generated+total_rejected) * 100 ,2) as femalePercent
  FROM (SELECT state
             , gender
             , sum(generated) as female_total
          FROM aadhardata
         WHERE gender = 'F'
         GROUP BY 1, 2) A
 JOIN aadharconsol B
   ON A.state = B.state
  AND A.gender = B.gender 
ORDER BY 2 DESC LIMIT 3""").show()

+---------+-------------+
|    state|femalePercent|
+---------+-------------+
|Meghalaya|        100.0|
|  Manipur|        99.15|
|    Assam|        97.52|
+---------+-------------+



##### 4. In each of these 3 states, identify the top 3 districts where the percentage of Aadhaar cards being rejected for males is the highest.

In [109]:
(datadistDF
.filter("state in('Meghalaya','Manipur','Assam') and gender='M'")
.groupBy("state","district","total_rejected","total_generated") 
.agg(expr("sum(rejected) as male_rejected"))
.withColumn("malePercent",round(expr("male_rejected/(total_rejected+total_generated) * 100"),2))
.select("*",rank().over(Window.partitionBy(datadistDF["state"]).orderBy(col("malePercent").desc())).alias("rank"))
.filter(col("rank")<=3)
.select("state","district","malePercent") 
.show(8,truncate=False) 
) 

+---------+----------+-----------+
|state    |district  |malePercent|
+---------+----------+-----------+
|Manipur  |Senapati  |1.32       |
|Manipur  |Tamenglong|1.12       |
|Manipur  |Thoubal   |0.8        |
|Manipur  |Bishnupur |0.8        |
|Assam    |Udalguri  |25.0       |
|Assam    |Hailakandi|25.0       |
|Assam    |Karimganj |18.18      |
|Meghalaya|Ri Bhoi   |50.0       |
+---------+----------+-----------+
only showing top 8 rows



In [110]:
spark.sql("""
SELECT state
     , district
     , femalePercent
FROM(
SELECT state
     , district
     , femalePercent
     , rank() over(partition by state order by femalePercent desc) AS rnk
FROM(
SELECT A.state
     , A.district
     , round(female_total/(total_generated+total_rejected) * 100 ,2) as femalePercent
  FROM (SELECT state
             , district
             , gender
             , sum(rejected) as female_total
          FROM aadhardata
         WHERE gender = 'M'
           AND state in('Meghalaya','Manipur','Assam') 
         GROUP BY 1, 2,3) A
 JOIN datadistconsol B
   ON A.state = B.state
  AND A.gender = B.gender 
  AND A.district = B.district
WHERE A.state in('Meghalaya','Manipur','Assam') 
 ORDER BY 2 DESC)B)c
 WHERE rnk <= 3
""").show()

+---------+----------------+-------------+
|    state|        district|femalePercent|
+---------+----------------+-------------+
|  Manipur|        Senapati|         1.32|
|  Manipur|      Tamenglong|         1.12|
|  Manipur|         Thoubal|          0.8|
|  Manipur|       Bishnupur|          0.8|
|    Assam|        Udalguri|         25.0|
|    Assam|      Hailakandi|         25.0|
|    Assam|       Karimganj|        18.18|
|Meghalaya|         Ri Bhoi|         50.0|
|Meghalaya| West Garo Hills|          0.0|
|Meghalaya|   Jaintia Hills|          0.0|
|Meghalaya|East Khasi Hills|          0.0|
|Meghalaya| East Garo Hills|          0.0|
+---------+----------------+-------------+



##### 5. The summary of the acceptance percentage of all the Aadhaar cards applications by bucketing the age group into 10 buckets.

In [122]:
dataBuckets=(data
 .select("age","generated","rejected")
 .groupBy("age")
 .agg(expr("sum(generated) as total_generated"),expr("sum(rejected) as total_rejected"))
 .withColumn("acceptPercent",round(expr("total_generated/(total_rejected+total_generated) * 100"),2))
 .select("age","acceptPercent"))


In [131]:
dataBuckets.write.format("csv").mode("overwrite").bucketBy(10,"age").sortBy("age").saveAsTable("ageBucket")

In [133]:
spark.sql("DESCRIBE FORMATTED ageBucket").show(truncate=0)

+----------------------------+------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                     |comment|
+----------------------------+------------------------------------------------------------------------------+-------+
|age                         |int                                                                           |null   |
|acceptPercent               |double                                                                        |null   |
|                            |                                                                              |       |
|# Detailed Table Information|                                                                              |       |
|Database                    |default                                                                       |       |
|Table                       |agebucket                 

In [135]:
spark.sql("SELECT * FROM ageBucket").show()

+---+-------------+
|age|acceptPercent|
+---+-------------+
|148|          0.0|
|123|        100.0|
| 95|        93.36|
| 32|        91.04|
| 60|         92.3|
| 90|        94.27|
| 22|        90.88|
|128|        100.0|
| 92|         94.8|
|122|          0.0|
|105|        71.43|
| 87|        93.05|
| 34|        91.47|
|  3|        83.44|
| 54|        91.15|
|120|        100.0|
| 72|        89.01|
| 29|        91.47|
| 44|        90.93|
| 80|         93.4|
+---+-------------+
only showing top 20 rows



##### Closing up the SparkSession

In [136]:
spark.stop()