In [9]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.functions import col, lit, concat, lower, upper, substring, min, max, to_date, desc
import requests

In [2]:
import findspark

# Let findspark locate the Spark installation before a session is requested.
findspark.init()

# Reuse an existing session if there is one; otherwise start a local session
# that uses every available core ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# With eager evaluation on, a DataFrame left as the last expression of a cell
# is rendered as a table instead of its plain repr.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/25 01:02:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/25 01:02:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/02/25 01:02:01 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Display the active SparkSession object as this cell's output.
spark

In [4]:

# Download the OWID COVID-19 dataset, store it next to the notebook and load it
# into a Spark DataFrame with a header row and inferred column types.
path = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"

# A timeout keeps the cell from hanging forever on a stalled connection.
req = requests.get(path, timeout=60)
# Fail fast on HTTP errors so an error page is never saved and parsed as CSV.
req.raise_for_status()
url_content = req.content

csv_file_name = 'owid-covid-data.csv'
# The context manager closes the file even if the write raises.
with open(csv_file_name, 'wb') as csv_file:
    csv_file.write(url_content)

df = spark.read.csv(csv_file_name, header=True, inferSchema=True)

                                                                                

In [5]:
# Print the inferred column names, types and nullability of the loaded data.
df.printSchema()

root
 |-- iso_code: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- location: string (nullable = true)
 |-- date: date (nullable = true)
 |-- total_cases: integer (nullable = true)
 |-- new_cases: integer (nullable = true)
 |-- new_cases_smoothed: double (nullable = true)
 |-- total_deaths: integer (nullable = true)
 |-- new_deaths: integer (nullable = true)
 |-- new_deaths_smoothed: double (nullable = true)
 |-- total_cases_per_million: double (nullable = true)
 |-- new_cases_per_million: double (nullable = true)
 |-- new_cases_smoothed_per_million: double (nullable = true)
 |-- total_deaths_per_million: double (nullable = true)
 |-- new_deaths_per_million: double (nullable = true)
 |-- new_deaths_smoothed_per_million: double (nullable = true)
 |-- reproduction_rate: double (nullable = true)
 |-- icu_patients: integer (nullable = true)
 |-- icu_patients_per_million: double (nullable = true)
 |-- hosp_patients: integer (nullable = true)
 |-- hosp_patients_per_mil

In [6]:
# Show Spark's summary statistics for the DataFrame's columns.
# The result is very wide because the dataset has many columns.
df.describe().show()

25/02/25 01:02:34 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+--------+-------------+-----------+-------------------+-----------------+------------------+-----------------+------------------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+------------------+-----------------+------------------------+------------------+-------------------------+---------------------+---------------------------------+----------------------+----------------------------------+-------------------+------------------+------------------------+----------------------+------------------+-------------------------------+-------------------+-----------------+-------------+-------------------+--------------------+-----------------------+--------------------+-----------------+-------------------------+------------------------------+-----------------------------+-----------------------------------+--------------------------+--------------------

                                                                                

In [7]:
# Project only the `date` column, passed through to_date. The cell's last
# expression is a DataFrame, so eager evaluation renders it as a table.
df.select(to_date(col("date")).alias("date"))

date
2020-01-05
2020-01-06
2020-01-07
2020-01-08
2020-01-09
2020-01-10
2020-01-11
2020-01-12
2020-01-13
2020-01-14


In [10]:
# First 10 United States rows, oldest date first.
(
    df.where(col("location") == "United States")
    .sort("date")
    .show(10)
)

+--------+-------------+-------------+----------+-----------+---------+------------------+------------+----------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+-----------------+------------+------------------------+-------------+-------------------------+---------------------+---------------------------------+----------------------+----------------------------------+-----------+---------+------------------------+----------------------+------------------+-------------------------------+-------------+--------------+-----------+------------------+-----------------+-----------------------+--------------+----------------+-------------------------+------------------------------+-----------------------------+-----------------------------------+--------------------------+-------------------------------------+------------------------------+-------------------------

In [11]:
# United States rows, most recent date first (show() uses its default row limit).
(
    df.where(col("location") == "United States")
    .sort(col("date").desc())
    .show()
)

+--------+-------------+-------------+----------+-----------+---------+------------------+------------+----------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+-----------------+------------+------------------------+-------------+-------------------------+---------------------+---------------------------------+----------------------+----------------------------------+-----------+---------+------------------------+----------------------+------------------+-------------------------------+-------------+--------------+-----------+------------------+-----------------+-----------------------+--------------+----------------+-------------------------+------------------------------+-----------------------------+-----------------------------------+--------------------------+-------------------------------------+------------------------------+-------------------------

In [22]:
# Number of rows recorded for each location; print the first 10 groups.
(
    df.groupBy("location")
    .count()
    .show(n=10)
)

+--------------------+-----+
|            location|count|
+--------------------+-----+
|            Anguilla| 1674|
|         Afghanistan| 1674|
|              Africa| 1674|
|British Virgin Is...| 1674|
|             Algeria| 1674|
|           Argentina| 1678|
|              Angola| 1674|
|             Belgium| 1674|
|Bonaire Sint Eust...| 1674|
|             Albania| 1674|
+--------------------+-----+
only showing top 10 rows



In [12]:
# Total of `new_cases` for each location; the dict form of agg() yields the
# same `sum(new_cases)` output column.
(
    df.groupBy("location")
    .agg({"new_cases": "sum"})
    .show()
)

+--------------------+--------------+
|            location|sum(new_cases)|
+--------------------+--------------+
|            Anguilla|          3904|
|         Afghanistan|        235214|
|              Africa|      13146831|
|British Virgin Is...|          7557|
|             Algeria|        272139|
|           Argentina|      10101218|
|              Angola|        107481|
|             Belgium|       4872829|
|Bonaire Sint Eust...|         11922|
|             Albania|        335047|
|               Benin|         28036|
|             Bahamas|         39127|
|             Belarus|        994037|
|      American Samoa|          8359|
|             Andorra|         48015|
|             Bolivia|       1212147|
|          Bangladesh|       2051348|
|            Barbados|        108582|
|              Bhutan|         62697|
|               Aruba|         44224|
+--------------------+--------------+
only showing top 20 rows



In [13]:
# Register the DataFrame as the temporary view `covid_data` so it can be
# queried through spark.sql; an existing view with that name is replaced.
df.createOrReplaceTempView("covid_data")

In [14]:
# Read every column and row back from the `covid_data` view through Spark SQL
# and print the first rows (show() uses its default row limit).
new_df = spark.sql("SELECT * FROM covid_data")
new_df.show()

+--------+---------+-----------+----------+-----------+---------+------------------+------------+----------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+-----------------+------------+------------------------+-------------+-------------------------+---------------------+---------------------------------+----------------------+----------------------------------+-----------+---------+------------------------+----------------------+------------------+-------------------------------+-------------+--------------+-----------+------------------+-----------------+-----------------------+--------------+----------------+-------------------------+------------------------------+-----------------------------+-----------------------------------+--------------------------+-------------------------------------+------------------------------+-------------------------------

In [15]:
# SQL version of the per-location row count: one row per location, with the
# number of records for that location.
group_df = spark.sql("SELECT location, count(*) from covid_data GROUP BY location")
group_df.show()

+--------------------+--------+
|            location|count(1)|
+--------------------+--------+
|            Anguilla|    1674|
|         Afghanistan|    1674|
|              Africa|    1674|
|British Virgin Is...|    1674|
|             Algeria|    1674|
|           Argentina|    1678|
|              Angola|    1674|
|             Belgium|    1674|
|Bonaire Sint Eust...|    1674|
|             Albania|    1674|
|               Benin|    1674|
|             Bahamas|    1674|
|             Belarus|    1674|
|      American Samoa|    1674|
|             Andorra|    1674|
|             Bolivia|    1674|
|          Bangladesh|    1674|
|            Barbados|    1674|
|              Bhutan|    1674|
|               Aruba|    1674|
+--------------------+--------+
only showing top 20 rows

