# Portfolio Project 1: Exploring Covid Data with SQL and Tableau
This is part one of my first portfolio project, showcasing a few SQL queries. Here I use datasets from [Our World in Data](https://ourworldindata.org/coronavirus). 

In the second part of this project, I save the results of these queries to Excel, since Tableau Public does not accept SQL data sources. I use `df.fillna(0).to_excel("file_name.xlsx")` to save each result, which also fills null values with 0. That step is omitted here to save space. The Tableau visualization can be found [here](https://public.tableau.com/app/profile/ellis.hartley/viz/BasicCovidDashboard_16442037792500/Dashboard2?publish=yes).

In [1]:
import pandas as pd
from sqlalchemy import create_engine
# A SQLite URL with no file path gives an in-memory database, used here as a
# scratch store for the CSV data. echo=False leaves SQL statement logging off.
sqlite_url = 'sqlite://'
engine = create_engine(sqlite_url, echo=False)

In [2]:
# Read both CSV exports into DataFrames.
vaccine_df = pd.read_csv('CovidVaccinations')
deaths_df = pd.read_csv('CovidDeaths')

# Both tables are written with the same options: the DataFrame index is
# stored as an `id` column, and an existing table is replaced so the cell
# can be re-run safely.
to_sql_options = dict(con=engine, index_label='id', if_exists='replace')
vaccine_df.to_sql(name="vaccinations", **to_sql_options)
deaths_df.to_sql(name="deaths", **to_sql_options)

In [3]:
# Sanity check: show the first five rows of the `deaths` table.
deaths_preview_query = '''SELECT * FROM deaths'''
pd.read_sql(sql=deaths_preview_query, con=engine).head()

Unnamed: 0,id,iso_code,continent,location,date,population,total_cases,new_cases,new_cases_smoothed,total_deaths,...,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million
0,0,AFG,Asia,Afghanistan,2020-02-24,39835428.0,5.0,5.0,,,...,,,,,,,,,,
1,1,AFG,Asia,Afghanistan,2020-02-25,39835428.0,5.0,0.0,,,...,,,,,,,,,,
2,2,AFG,Asia,Afghanistan,2020-02-26,39835428.0,5.0,0.0,,,...,,,,,,,,,,
3,3,AFG,Asia,Afghanistan,2020-02-27,39835428.0,5.0,0.0,,,...,,,,,,,,,,
4,4,AFG,Asia,Afghanistan,2020-02-28,39835428.0,5.0,0.0,,,...,,,,,,,,,,


In [4]:
# Sanity check: show the first five rows of the `vaccinations` table.
vaccinations_preview_query = '''SELECT * FROM vaccinations'''
pd.read_sql(sql=vaccinations_preview_query, con=engine).head()

Unnamed: 0,id,iso_code,continent,location,date,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,0,AFG,Asia,Afghanistan,2020-02-24,,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,1,AFG,Asia,Afghanistan,2020-02-25,,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,2,AFG,Asia,Afghanistan,2020-02-26,,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,3,AFG,Asia,Afghanistan,2020-02-27,,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,4,AFG,Asia,Afghanistan,2020-02-28,,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [5]:
# Select the columns used in the queries below, sorted by location and then
# date, and preview the first five rows.
# The sort keys are named explicitly (rather than by ordinal position) so the
# ordering does not silently change if the select list is reordered.
pd.read_sql('''
SELECT location, date, total_cases, new_cases, total_deaths, population
FROM deaths
ORDER BY location, date
''', con=engine).head()

Unnamed: 0,location,date,total_cases,new_cases,total_deaths,population
0,Afghanistan,2020-02-24,5.0,5.0,,39835428.0
1,Afghanistan,2020-02-25,5.0,0.0,,39835428.0
2,Afghanistan,2020-02-26,5.0,0.0,,39835428.0
3,Afghanistan,2020-02-27,5.0,0.0,,39835428.0
4,Afghanistan,2020-02-28,5.0,0.0,,39835428.0


In [6]:
# Total cases vs. total deaths in the United States.
# DeathPercentage = total_deaths / total_cases * 100; it is NULL on rows where
# total_deaths has not been reported yet.
# The location is matched exactly: a suffix pattern such as '%states' would
# also pick up any other location whose name happens to end in "states".
pd.read_sql('''
SELECT location, date, total_cases, total_deaths,
       (total_deaths / total_cases) * 100 AS DeathPercentage
FROM deaths
WHERE location = 'United States'
ORDER BY location, date
''', con=engine)

Unnamed: 0,location,date,total_cases,total_deaths,DeathPercentage
0,United States,2020-01-22,1.0,,
1,United States,2020-01-23,1.0,,
2,United States,2020-01-24,2.0,,
3,United States,2020-01-25,2.0,,
4,United States,2020-01-26,5.0,,
...,...,...,...,...,...
710,United States,2022-01-01,54858824.0,826043.0,1.505761
711,United States,2022-01-02,55106998.0,826287.0,1.499423
712,United States,2022-01-03,56278376.0,827975.0,1.471213
713,United States,2022-01-04,57077565.0,830132.0,1.454393


In [7]:
# Total cases vs. population in the United States
# (what percentage of the population has had a confirmed Covid case).
# The location is matched exactly: a suffix pattern such as '%states' would
# also pick up any other location whose name happens to end in "states".
pd.read_sql('''
SELECT location, date, total_cases, population,
       (total_cases / population) * 100 AS PctPopInfected
FROM deaths
WHERE location = 'United States'
ORDER BY location, date
''', con=engine)

Unnamed: 0,location,date,total_cases,population,PctPopInfected
0,United States,2020-01-22,1.0,332915074.0,3.003769e-07
1,United States,2020-01-23,1.0,332915074.0,3.003769e-07
2,United States,2020-01-24,2.0,332915074.0,6.007538e-07
3,United States,2020-01-25,2.0,332915074.0,6.007538e-07
4,United States,2020-01-26,5.0,332915074.0,1.501885e-06
...,...,...,...,...,...
710,United States,2022-01-01,54858824.0,332915074.0,1.647832e+01
711,United States,2022-01-02,55106998.0,332915074.0,1.655287e+01
712,United States,2022-01-03,56278376.0,332915074.0,1.690472e+01
713,United States,2022-01-04,57077565.0,332915074.0,1.714478e+01


In [8]:
# Locations ranked by their peak infection rate relative to population.
# Locations with no reported cases have a NULL rate and sort last.
# NOTE(review): unlike the top-10 death-count query, this one has no
# `continent IS NOT NULL` filter, so aggregate rows (e.g. World and the
# continents) are ranked alongside countries. Confirm whether that is intended
# before adding the filter, since it changes the result set.
pd.read_sql('''
SELECT location, population, MAX(total_cases) AS HighestInfectionCount,
       MAX(total_cases / population) * 100 AS PctPopInfected
FROM deaths
GROUP BY location, population
ORDER BY PctPopInfected DESC
''', con=engine)

Unnamed: 0,location,population,HighestInfectionCount,PctPopInfected
0,Andorra,77354.0,25289.0,32.692556
1,Montenegro,628051.0,180260.0,28.701491
2,Gibraltar,33691.0,9227.0,27.387136
3,Seychelles,98910.0,26255.0,26.544333
4,San Marino,34010.0,8847.0,26.012937
...,...,...,...,...
233,Pitcairn,47.0,,
234,Sint Maarten (Dutch part),43421.0,,
235,Tokelau,1368.0,,
236,Turkmenistan,6117933.0,,


In [9]:
# Top 10 countries by total death count.
# Rows with a NULL continent are excluded; in this dataset those rows are
# aggregates such as World and the continents, not individual countries.
pd.read_sql('''
SELECT location, MAX(total_deaths) AS TotDeathCount
FROM deaths
WHERE continent IS NOT NULL
GROUP BY location
ORDER BY TotDeathCount DESC
LIMIT 10
''', con=engine)

Unnamed: 0,location,TotDeathCount
0,United States,832118.0
1,Brazil,619654.0
2,India,482876.0
3,Russia,306710.0
4,Mexico,299842.0
5,Peru,202904.0
6,United Kingdom,149399.0
7,Indonesia,144109.0
8,Italy,138276.0
9,Iran,131778.0


In [10]:
# Death counts for the aggregate rows (continent IS NULL), excluding the
# income-group aggregates.
# NOTE: besides the continents, the result also contains the World, European
# Union and International rows.
# `population` is included in GROUP BY so the query does not rely on SQLite's
# bare-column extension; this assumes population is constant per location,
# as the infection-rate query's grouping already does.
pd.read_sql('''
SELECT location, population, MAX(total_deaths) AS TotDeathCount
FROM deaths
WHERE continent IS NULL AND location NOT LIKE '%income'
GROUP BY location, population
ORDER BY TotDeathCount DESC
''', con=engine)

Unnamed: 0,location,population,TotDeathCount
0,World,7874966000.0,5465352.0
1,Europe,748963000.0,1544586.0
2,Asia,4678445000.0,1264148.0
3,North America,596581300.0,1229708.0
4,South America,434260100.0,1193145.0
5,European Union,447189900.0,912232.0
6,Africa,1373486000.0,229178.0
7,Oceania,43219950.0,4572.0
8,International,,15.0


In [11]:
# Global numbers: total new cases, total new deaths, and deaths as a
# percentage of cases, summed over all country-level rows (continent NOT NULL,
# so aggregate rows are not double-counted).
# An aggregate query without GROUP BY returns exactly one row, so no ORDER BY
# is needed.
pd.read_sql('''
SELECT SUM(new_cases), SUM(new_deaths), SUM(new_deaths)/SUM(new_cases)*100 as DeathPct
FROM deaths
WHERE continent IS NOT NULL
''', con=engine)

Unnamed: 0,SUM(new_cases),SUM(new_deaths),DeathPct
0,297196067.0,5440391.0,1.830573


In [12]:
# Total population vs. vaccinations.
# The CTE joins deaths and vaccinations on (location, date) and computes a
# running total of new_vaccinations per location, ordered by date. The
# location is already the partition key, so it is not repeated in the window
# ORDER BY.
# The outer query sorts explicitly by location and date, because SQL result
# order is otherwise unspecified.
# NOTE(review): PctVaccinated is the running total of new_vaccinations divided
# by population. If new_vaccinations counts doses rather than people, this is
# doses per 100 people and can exceed 100. TODO confirm.
pd.read_sql('''
WITH PopvsVac (continent, location, date, population, new_vaccinations, RollingCountVaccinations)
AS (
SELECT d.continent, d.location, d.date, d.population, v.new_vaccinations,
SUM(v.new_vaccinations) OVER (PARTITION BY d.location ORDER BY d.date) AS RollingCountVaccinations
FROM deaths d
JOIN vaccinations v
    ON d.location = v.location
    AND d.date = v.date
WHERE d.continent IS NOT NULL)
SELECT *, (RollingCountVaccinations/population)*100 AS PctVaccinated
FROM PopvsVac
ORDER BY location, date
''', con=engine)

Unnamed: 0,continent,location,date,population,new_vaccinations,RollingCountVaccinations,PctVaccinated
0,Asia,Afghanistan,2020-02-24,39835428.0,,,
1,Asia,Afghanistan,2020-02-25,39835428.0,,,
2,Asia,Afghanistan,2020-02-26,39835428.0,,,
3,Asia,Afghanistan,2020-02-27,39835428.0,,,
4,Asia,Afghanistan,2020-02-28,39835428.0,,,
...,...,...,...,...,...,...,...
143269,Africa,Zimbabwe,2022-01-01,15092171.0,11020.0,6441318.0,42.679864
143270,Africa,Zimbabwe,2022-01-02,15092171.0,6595.0,6447913.0,42.723562
143271,Africa,Zimbabwe,2022-01-03,15092171.0,7062.0,6454975.0,42.770354
143272,Africa,Zimbabwe,2022-01-04,15092171.0,10262.0,6465237.0,42.838350
