## Olympic Data Analysis using Apache Spark

#### Creating a SparkSession

In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("Olympic Data Analysis").getOrCreate()

#### Reading the data file and creating a DataFrame 

In [2]:
dataFile = "./olympic_Data.csv"

schemaStruct = StructType([StructField("athlete",StringType(),True),
                           StructField("age",IntegerType(),True),
                           StructField("country",StringType(),True),
                           StructField("year",IntegerType(),True),
                           StructField("closingDate",DateType(),True),
                           StructField("sport",StringType(),True),
                           StructField("goldMedals",IntegerType(),True),
                           StructField("silverMedals",IntegerType(),True),
                           StructField("bronzeMedals",IntegerType(),True),
                           StructField("totalMedals",IntegerType(),True)
                          ])

df = spark.read.csv(dataFile,schema=schemaStruct,dateFormat="m/d/yyyy")

#### Checking the loaded data

In [3]:
df.show(5,False)

+----------------+---+-------------+----+-----------+----------+----------+------------+------------+-----------+
|athlete         |age|country      |year|closingDate|sport     |goldMedals|silverMedals|bronzeMedals|totalMedals|
+----------------+---+-------------+----+-----------+----------+----------+------------+------------+-----------+
|Michael Phelps  |23 |United States|2008|2008-01-24 |Swimming  |8         |0           |0           |8          |
|Michael Phelps  |19 |United States|2004|2004-01-29 |Swimming  |6         |0           |2           |8          |
|Michael Phelps  |27 |United States|2012|2012-01-12 |Swimming  |4         |2           |0           |6          |
|Natalie Coughlin|25 |United States|2008|2008-01-24 |Swimming  |1         |2           |3           |6          |
|Aleksey Nemov   |24 |Russia       |2000|2000-01-01 |Gymnastics|2         |1           |3           |6          |
+----------------+---+-------------+----+-----------+----------+----------+------------+

#### Creating a Temporary table for SQL Analysis

In [4]:
df.createOrReplaceTempView("olympics")

####  Q1. No of athletes participated in each Olympic event

In [5]:
(df
.groupBy("year")
.count()
.show(20,truncate=False))

+----+-----+
|year|count|
+----+-----+
|2006|443  |
|2004|1839 |
|2012|1776 |
|2000|1840 |
|2010|441  |
|2008|1872 |
|2002|407  |
+----+-----+



#### Using SQL query

In [6]:
spark.sql("""
SELECT year
     , count(*)
  FROM olympics
 GROUP BY 1""").show()

+----+--------+
|year|count(1)|
+----+--------+
|2006|     443|
|2004|    1839|
|2012|    1776|
|2000|    1840|
|2010|     441|
|2008|    1872|
|2002|     407|
+----+--------+



#### Q2. No of medals each country won in each Olympic in ascending order

In [7]:
(df
.select("country","year","totalMedals")
.groupBy("country","year") 
.agg(sum("totalMedals").alias("Total_Medals"))
.orderBy(asc("country"),desc("Total_Medals"))
.show(25)
)

+-----------+----+------------+
|    country|year|Total_Medals|
+-----------+----+------------+
|Afghanistan|2008|           1|
|Afghanistan|2012|           1|
|    Algeria|2000|           5|
|    Algeria|2008|           2|
|    Algeria|2012|           1|
|  Argentina|2008|          51|
|  Argentina|2004|          49|
|  Argentina|2012|          21|
|  Argentina|2000|          20|
|    Armenia|2008|           6|
|    Armenia|2012|           3|
|    Armenia|2000|           1|
|  Australia|2000|         183|
|  Australia|2004|         156|
|  Australia|2008|         149|
|  Australia|2012|         114|
|  Australia|2010|           3|
|  Australia|2006|           2|
|  Australia|2002|           2|
|    Austria|2006|          30|
|    Austria|2010|          26|
|    Austria|2002|          20|
|    Austria|2004|           8|
|    Austria|2000|           4|
|    Austria|2008|           3|
+-----------+----+------------+
only showing top 25 rows



In [8]:
(df
.select("country","year","totalMedals")
.groupBy("country","year") 
.agg(sum("totalMedals").alias("Total_Medals"))
.orderBy("year",desc("Total_Medals"))
.show(25)
)

+--------------------+----+------------+
|             country|year|Total_Medals|
+--------------------+----+------------+
|       United States|2000|         243|
|              Russia|2000|         187|
|           Australia|2000|         183|
|             Germany|2000|         118|
|               China|2000|          79|
|         Netherlands|2000|          79|
|         South Korea|2000|          73|
|              France|2000|          66|
|                Cuba|2000|          65|
|               Italy|2000|          65|
|       Great Britain|2000|          54|
|             Hungary|2000|          53|
|              Brazil|2000|          48|
|             Romania|2000|          46|
|               Japan|2000|          44|
|              Norway|2000|          43|
|               Spain|2000|          42|
|             Ukraine|2000|          35|
|              Sweden|2000|          32|
|              Canada|2000|          31|
|             Denmark|2000|          25|
|Serbia and Mont

#### Using SQL query

In [9]:
spark.sql("""
SELECT country
     , year
     , sum(totalMedals)
  FROM olympics 
 GROUP BY 1,2
 ORDER BY 2,3 DESC""").show(25)

+--------------------+----+----------------+
|             country|year|sum(totalMedals)|
+--------------------+----+----------------+
|       United States|2000|             243|
|              Russia|2000|             187|
|           Australia|2000|             183|
|             Germany|2000|             118|
|               China|2000|              79|
|         Netherlands|2000|              79|
|         South Korea|2000|              73|
|              France|2000|              66|
|                Cuba|2000|              65|
|               Italy|2000|              65|
|       Great Britain|2000|              54|
|             Hungary|2000|              53|
|              Brazil|2000|              48|
|             Romania|2000|              46|
|               Japan|2000|              44|
|              Norway|2000|              43|
|               Spain|2000|              42|
|             Ukraine|2000|              35|
|              Sweden|2000|              32|
|         

#### Q3. Top 10 athletes who won highest gold medals in all the Olympic events

In [10]:
(df
.groupBy("athlete")
.agg(sum("totalMedals").alias("Total_Medals")) 
.orderBy("Total_Medals",ascending=False) 
).show(10)

+--------------------+------------+
|             athlete|Total_Medals|
+--------------------+------------+
|      Michael Phelps|          22|
|    Natalie Coughlin|          12|
|         Ryan Lochte|          11|
|Ole Einar Bj�rndalen|           9|
|        Leisel Jones|           9|
|          Ian Thorpe|           9|
|         Jason Lezak|           8|
|         Dara Torres|           8|
|    Apolo Anton Ohno|           8|
|      Katalin Kov�cs|           8|
+--------------------+------------+
only showing top 10 rows



#### Using SQL query

In [11]:
spark.sql("""
SELECT athlete
     , sum(totalMedals)
  FROM olympics
 GROUP BY 1
 ORDER BY 2 DESC
 LIMIT 10""").show()

+--------------------+----------------+
|             athlete|sum(totalMedals)|
+--------------------+----------------+
|      Michael Phelps|              22|
|    Natalie Coughlin|              12|
|         Ryan Lochte|              11|
|          Ian Thorpe|               9|
|        Leisel Jones|               9|
|Ole Einar Bj�rndalen|               9|
|    Apolo Anton Ohno|               8|
|         Dara Torres|               8|
|      Katalin Kov�cs|               8|
|         Jason Lezak|               8|
+--------------------+----------------+



#### Q4. No of athletes who won gold and whose age is less than 20

In [12]:
(df
.filter("age<20 and goldMedals>0")
.count())

188

#### Using a SQL query

In [13]:
spark.sql("""
SELECT count(*)
  FROM olympics
 WHERE goldMedals>0
   AND age<20
""").show()

+--------+
|count(1)|
+--------+
|     188|
+--------+



#### Q5. Youngest athlete who won gold in each category of sports in each Olympic

In [14]:
(df
.withColumn("min_age",min("age").over(Window.partitionBy()))
.filter("age = min_age and goldMedals>0")
).show()

+--------------+---+-------------+----+-----------+--------------------+----------+------------+------------+-----------+-------+
|       athlete|age|      country|year|closingDate|               sport|goldMedals|silverMedals|bronzeMedals|totalMedals|min_age|
+--------------+---+-------------+----+-----------+--------------------+----------+------------+------------+-----------+-------+
|    Yang Yilin| 15|        China|2008| 2008-01-24|          Gymnastics|         1|           0|           2|          3|     15|
|   Go Gi-Hyeon| 15|  South Korea|2002| 2002-01-24|Short-Track Speed...|         1|           1|           0|          2|     15|
|   Chen Ruolin| 15|        China|2008| 2008-01-24|              Diving|         2|           0|           0|          2|     15|
| Katie Ledecky| 15|United States|2012| 2012-01-12|            Swimming|         1|           0|           0|          1|     15|
|Ruta Meilutyte| 15|    Lithuania|2012| 2012-01-12|            Swimming|         1|       

#### Using a SQL query

In [15]:
spark.sql("""
SELECT *
  FROM olympics
 WHERE age = (SELECT min(age) FROM olympics)
   AND goldMedals > 0
""").show()

+--------------+---+-------------+----+-----------+--------------------+----------+------------+------------+-----------+
|       athlete|age|      country|year|closingDate|               sport|goldMedals|silverMedals|bronzeMedals|totalMedals|
+--------------+---+-------------+----+-----------+--------------------+----------+------------+------------+-----------+
|    Yang Yilin| 15|        China|2008| 2008-01-24|          Gymnastics|         1|           0|           2|          3|
|   Go Gi-Hyeon| 15|  South Korea|2002| 2002-01-24|Short-Track Speed...|         1|           1|           0|          2|
|   Chen Ruolin| 15|        China|2008| 2008-01-24|              Diving|         2|           0|           0|          2|
| Katie Ledecky| 15|United States|2012| 2012-01-12|            Swimming|         1|           0|           0|          1|
|Ruta Meilutyte| 15|    Lithuania|2012| 2012-01-12|            Swimming|         1|           0|           0|          1|
|Olga Glatskikh| 15|    

#### Q6. No of atheletes from each country who has won a medal in each Olympic in each sports

In [16]:
(df
.select("country","year","sport")
.groupBy("country","year","sport") 
.count()
.orderBy("year",desc("count")) 
.show() 
)

+-------------+----+----------+-----+
|      country|year|     sport|count|
+-------------+----+----------+-----+
|United States|2000|  Swimming|   41|
|    Australia|2000|    Hockey|   32|
|  Netherlands|2000|    Hockey|   32|
|United States|2000| Athletics|   27|
|    Australia|2000|  Swimming|   26|
|       Russia|2000| Waterpolo|   25|
|United States|2000|  Baseball|   24|
|       Russia|2000|Volleyball|   24|
|         Cuba|2000|  Baseball|   24|
|  South Korea|2000|  Baseball|   24|
|United States|2000|Basketball|   23|
|    Australia|2000|    Rowing|   21|
|     Cameroon|2000|  Football|   18|
|        Chile|2000|  Football|   17|
|Great Britain|2000|    Rowing|   17|
|      Germany|2000|  Football|   17|
|        Spain|2000|  Football|   17|
|       Norway|2000|  Football|   17|
|      Jamaica|2000| Athletics|   17|
|       Russia|2000| Athletics|   17|
+-------------+----+----------+-----+
only showing top 20 rows



#### Using a SQL query

In [17]:
spark.sql("""
SELECT country
     , year
     , sport
     , count(*) AS count
  FROM olympics 
 GROUP BY 1,2,3
 ORDER BY 2, count desc
  """).show()

+-------------+----+----------+-----+
|      country|year|     sport|count|
+-------------+----+----------+-----+
|United States|2000|  Swimming|   41|
|    Australia|2000|    Hockey|   32|
|  Netherlands|2000|    Hockey|   32|
|United States|2000| Athletics|   27|
|    Australia|2000|  Swimming|   26|
|       Russia|2000| Waterpolo|   25|
|United States|2000|  Baseball|   24|
|       Russia|2000|Volleyball|   24|
|         Cuba|2000|  Baseball|   24|
|  South Korea|2000|  Baseball|   24|
|United States|2000|Basketball|   23|
|    Australia|2000|    Rowing|   21|
|     Cameroon|2000|  Football|   18|
|        Chile|2000|  Football|   17|
|Great Britain|2000|    Rowing|   17|
|      Germany|2000|  Football|   17|
|        Spain|2000|  Football|   17|
|       Norway|2000|  Football|   17|
|      Jamaica|2000| Athletics|   17|
|       Russia|2000| Athletics|   17|
+-------------+----+----------+-----+
only showing top 20 rows



#### Q7. No of athletes won at least a medal in each events in all the Olympics

In [18]:
(df
.groupBy("sport")
.count()
.orderBy("count",ascending=False) 
.show() 
)

+----------+-----+
|     sport|count|
+----------+-----+
| Athletics|  687|
|    Rowing|  567|
|  Swimming|  487|
|  Football|  407|
|    Hockey|  388|
|Ice Hockey|  384|
|  Handball|  351|
| Waterpolo|  306|
|  Canoeing|  295|
|Basketball|  287|
|Volleyball|  281|
|   Cycling|  261|
| Wrestling|  245|
|   Fencing|  230|
|      Judo|  224|
|  Baseball|  216|
|   Sailing|  210|
|Gymnastics|  194|
|    Boxing|  188|
|  Shooting|  181|
+----------+-----+
only showing top 20 rows



#### Using a Spark SQL query

In [19]:
spark.sql("""
SELECT DISTINCT sport
     , count(*)
  FROM olympics
 GROUP BY 1
 ORDER BY 2 DESC
""").show()

+----------+--------+
|     sport|count(1)|
+----------+--------+
| Athletics|     687|
|    Rowing|     567|
|  Swimming|     487|
|  Football|     407|
|    Hockey|     388|
|Ice Hockey|     384|
|  Handball|     351|
| Waterpolo|     306|
|  Canoeing|     295|
|Basketball|     287|
|Volleyball|     281|
|   Cycling|     261|
| Wrestling|     245|
|   Fencing|     230|
|      Judo|     224|
|  Baseball|     216|
|   Sailing|     210|
|Gymnastics|     194|
|    Boxing|     188|
|  Shooting|     181|
+----------+--------+
only showing top 20 rows



#### Q8. Country won highest no of medals in wrestling in 2012

In [20]:
(df
.filter("sport = 'Wrestling' and year = 2012")
.groupBy("country")
.agg(sum("totalMedals").alias("Total_Medals"))
.orderBy("Total_Medals",ascending=False)
.show(1)
)

+-------+------------+
|country|Total_Medals|
+-------+------------+
| Russia|          11|
+-------+------------+
only showing top 1 row



#### Using a SQL query

In [21]:
spark.sql("""
SELECT country
     , sum(totalMedals) as Total_Medals
  FROM olympics
 WHERE sport = 'Wrestling'
   AND year = 2012
 GROUP BY 1
 ORDER BY 2 DESC
 LIMIT 1
""").show()

+-------+------------+
|country|Total_Medals|
+-------+------------+
| Russia|          11|
+-------+------------+

