In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,sum as spark_sum,avg,when

spark = SparkSession.builder.appName("COVID Analysis").getOrCreate()

country_latest = spark.read.csv("country_wise_latest.csv", header = True, inferSchema = True)

In [None]:
# To find out the death percentage locally and globally
global_deaths = country_latest.agg(spark_sum("Deaths").alias("TotalDeaths")).first()["TotalDeaths"]
global_confirmed = country_latest.agg(spark_sum("Confirmed").alias("TotalConfirmed")).first()["TotalConfirmed"]
global_death_percentage = (global_deaths / global_confirmed) * 100

country_death_percentage = country_latest.withColumn(
    "Death_Percentage", (col("Deaths") / col("Confirmed")) * 100
).select("Country/Region","Death_Percentage").show()

# country_death_percentage.select("Country/Region").show()

print(f"Global Death Percentage: {global_death_percentage:.2f}%")
# country_death_percentage.show()


+-------------------+------------------+
|     Country/Region|  Death_Percentage|
+-------------------+------------------+
|        Afghanistan| 3.499434685492099|
|            Albania|2.9508196721311477|
|            Algeria|4.1575805240767885|
|            Andorra| 5.733186328555679|
|             Angola| 4.315789473684211|
|Antigua and Barbuda| 3.488372093023256|
|          Argentina| 1.827184976346347|
|            Armenia| 1.901577962021931|
|          Australia|1.0912892896817616|
|            Austria|  3.46823620974803|
|         Azerbaijan| 1.389345069959929|
|            Bahamas|2.8795811518324608|
|            Bahrain|0.3571247657160225|
|         Bangladesh| 1.310642059896121|
|           Barbados| 6.363636363636363|
|            Belarus|0.7999881042661077|
|            Belgium|14.785933642439936|
|             Belize| 4.166666666666666|
|              Benin| 1.977401129943503|
|             Bhutan|               0.0|
+-------------------+------------------+
only showing top

In [10]:
worldometer = spark.read.csv("worldometer_data.csv", header = True, inferSchema = True)
global_population = worldometer.agg(spark_sum("Population").alias("TotalPop")).first()["TotalPop"]
global_infected = worldometer.agg(spark_sum("TotalCases").alias("TotalCases")).first()["TotalCases"]
global_infected_percentage = (global_infected / global_population) * 100

infected_percentage = worldometer.withColumn(
    "Infected_Percentage", (col("TotalCases") / col("Population")) * 100
).select("Country/Region","Infected_Percentage").show()
print(f"Global Infected Percentage: {global_infected_percentage:.2f}%")

+--------------+-------------------+
|Country/Region|Infected_Percentage|
+--------------+-------------------+
|           USA| 1.5193862960518527|
|        Brazil| 1.3716104125127853|
|         India|0.14662586134519442|
|        Russia| 0.5974294091765514|
|  South Africa| 0.9063149328193871|
|        Mexico|0.35849056019021563|
|          Peru| 1.3793451656436928|
|         Chile| 1.9164810228284688|
|      Colombia| 0.7022698289089215|
|         Spain| 0.7582451162880623|
|          Iran| 0.3806492842253104|
|            UK| 0.4536584147096077|
|  Saudi Arabia| 0.8151972130721694|
|      Pakistan| 0.1273693106880707|
|    Bangladesh|0.15144002324857403|
|         Italy| 0.4122306268279621|
|        Turkey|0.28102533496723986|
|     Argentina| 0.5044445590018977|
|       Germany|  0.256779339673452|
|        France|0.29964477865301026|
+--------------+-------------------+
only showing top 20 rows

Global Infected Percentage: 0.30%


In [None]:
# 3. Countries with Highest Infection Rates
infected_percentage = worldometer.withColumn(
    "Infected_Percentage", (col("TotalCases") / col("Population")) * 100
).select("Country/Region", "Infected_Percentage")

infected_percentage.show()  


+--------------+-------------------+
|Country/Region|Infected_Percentage|
+--------------+-------------------+
|           USA| 1.5193862960518527|
|        Brazil| 1.3716104125127853|
|         India|0.14662586134519442|
|        Russia| 0.5974294091765514|
|  South Africa| 0.9063149328193871|
|        Mexico|0.35849056019021563|
|          Peru| 1.3793451656436928|
|         Chile| 1.9164810228284688|
|      Colombia| 0.7022698289089215|
|         Spain| 0.7582451162880623|
|          Iran| 0.3806492842253104|
|            UK| 0.4536584147096077|
|  Saudi Arabia| 0.8151972130721694|
|      Pakistan| 0.1273693106880707|
|    Bangladesh|0.15144002324857403|
|         Italy| 0.4122306268279621|
|        Turkey|0.28102533496723986|
|     Argentina| 0.5044445590018977|
|       Germany|  0.256779339673452|
|        France|0.29964477865301026|
+--------------+-------------------+
only showing top 20 rows



In [23]:
# 4. Countries and Continents with Highest Death Counts
top_death_countries = worldometer.select(col("Country/Region").alias("Country"), "TotalDeaths") \
    .orderBy(col("TotalDeaths").desc()).limit(10)

top_death_continents = worldometer.groupBy("Continent") \
    .agg(spark_sum("TotalDeaths").alias("TotalDeaths")) \
    .orderBy(col("TotalDeaths").desc())

top_death_countries.show()

top_death_continents.show()

+-------+-----------+
|Country|TotalDeaths|
+-------+-----------+
|    USA|     162804|
| Brazil|      98644|
| Mexico|      50517|
|     UK|      46413|
|  India|      41638|
|  Italy|      35187|
| France|      30312|
|  Spain|      28500|
|   Peru|      20424|
|   Iran|      17976|
+-------+-----------+

+-----------------+-----------+
|        Continent|TotalDeaths|
+-----------------+-----------+
|    North America|     229855|
|           Europe|     205232|
|    South America|     154885|
|             Asia|     100627|
|           Africa|      22114|
|Australia/Oceania|        281|
|             NULL|         13|
+-----------------+-----------+



In [None]:
# 6. Average of Cases Divided by Population (Top 10)
top_avg_cases_population = infected_percentage.select(col("Country/Region").alias("Country"), "Infected_Percentage") \
    .orderBy(col("Infected_Percentage").desc()).limit(10)
top_avg_cases_population.show()

+-------------+-------------------+
|      Country|Infected_Percentage|
+-------------+-------------------+
|        Qatar| 3.9921575750452756|
|French Guiana| 2.7145648579588157|
|      Bahrain| 2.5130239079751258|
|   San Marino| 2.0596381637102956|
|        Chile| 1.9164810228284688|
|       Panama|  1.652703989232825|
|       Kuwait| 1.6378443167538763|
|         Oman| 1.5769043963734304|
|          USA| 1.5193862960518527|
| Vatican City| 1.4981273408239701|
+-------------+-------------------+

