Data Science Tools 
Constructor University Bremen
By: Leonardo Corredor

COVID Data Science Tools Project

Below is my Jupyter study notebook for the Python data science tools course. Here you will find most of the concepts covered in the tutorial sessions, applied to the Covid Country Wise dataset.

In [2]:
## 1. Environment Setup
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

In [3]:
## 2. Data Loading
# The location of "country_wise_latest.csv" is machine-specific. Set the
# COVID_DATA_PATH environment variable to the path of your own copy; when the
# variable is not set, the original local path below is used as the fallback.
DATA_PATH = os.environ.get(
    "COVID_DATA_PATH",
    r"C:\Users\jlcor\Documents\MASTER\DataSets\country_wise_latest.csv",
)
df = pd.read_csv(DATA_PATH)
# Last expression of the cell: shows the first five rows as the cell output.
df.head()

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.6,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa


TUTORIAL SESSIONS EXERCISES

Cleaning Data

In [4]:
## 3. Convert columns to numeric type, coercing errors to NaN
# Names of the columns that should be converted to numbers.
numeric_cols = [
    "Confirmed",
    "Deaths",
    "Recovered",
    "Active",
    "1 week % increase",
    "Deaths / 100 Cases",
    "Recovered / 100 Cases",
]

for col in numeric_cols:
    # Skip any name that is not a column of the loaded DataFrame.
    if col not in df.columns:
        continue
    # errors="coerce": values that cannot be parsed as numbers become NaN.
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [5]:
## 4. Show the data loaded and cleaned (columns converted to numeric)
# print() renders a DataFrame as plain text, so a wide frame is wrapped into
# several chunks joined by "\". Ending the cell with the expression itself lets
# Jupyter use the DataFrame's rich table display instead.
df.head()

  Country/Region  Confirmed  Deaths  Recovered  Active  New cases  New deaths  \
0    Afghanistan      36263    1269      25198    9796        106          10   
1        Albania       4880     144       2745    1991        117           6   
2        Algeria      27973    1163      18837    7973        616           8   
3        Andorra        907      52        803      52         10           0   
4         Angola        950      41        242     667         18           1   

   New recovered  Deaths / 100 Cases  Recovered / 100 Cases  \
0             18                3.50                  69.49   
1             63                2.95                  56.25   
2            749                4.16                  67.34   
3              0                5.73                  88.53   
4              0                4.32                  25.47   

   Deaths / 100 Recovered  Confirmed last week  1 week change  \
0                    5.04                35526            737   
1   

SCATTER PLOT - Confirmed vs. 1-week % Increase

In [6]:
## ANALYSIS 1: SCATTER PLOT - Confirmed vs. 1-week % Increase
# Study note: "scatter plot" is "diagrama de dispersión" in Spanish.

# df.empty is True when the DataFrame has no items, so the plot is only built
# when there is data to draw.
if not df.empty:
    # The five rows with the largest "Confirmed" values.
    top5 = df.nlargest(5, "Confirmed")

    # Interactive scatter plot built with Plotly Express.
    fig1 = px.scatter(
        top5,
        x="Confirmed",            # horizontal axis
        y="1 week % increase",    # vertical axis
        color="Country/Region",   # a different color for each country
        text="Country/Region",    # country names shown as labels
        size="Confirmed",         # bubble size based on confirmed cases
        # Extra fields shown in the floating box that appears when the mouse
        # hovers over a country.
        hover_data=["Deaths", "Recovered", "WHO Region"],
        title="1. Top 5 Countries: Confirmed Cases vs 1-week % Increase",
    )

    # Put the labels above the markers, then set the axis titles and template.
    # Templates tried in this notebook:
    #   plotly_white - white background with gridlines
    #   ggplot2      - similar to the ggplot2 style in R
    #   seaborn      - similar to the seaborn style in Python
    fig1.update_traces(textposition="top center").update_layout(
        xaxis_title="Total Confirmed Cases",
        yaxis_title="1-week % Increase",
        template="seaborn",
    )

    fig1.show()


BAR CHART - Total Deaths by WHO Region

In [7]:
## ANALYSIS 2: BAR CHART - Total Deaths by WHO Region
# Study note: "bar chart" is "gráfico de barras" in Spanish.

# df.empty is True when the DataFrame has no items, so the chart is only built
# when there is data to draw.
if not df.empty:
    # One row per WHO Region holding the sum of its "Deaths" column.
    regional_deaths = df.groupby("WHO Region", as_index=False)["Deaths"].sum()

    # Interactive bar chart: one bar (and one color) per region.
    fig2 = px.bar(
        regional_deaths,
        x="WHO Region",
        y="Deaths",
        color="WHO Region",
        title="2. Total Deaths by WHO Region",
        # labels maps column names to the text displayed for them.
        labels={"Deaths": "Total Deaths", "WHO Region": "WHO Region"},
        # Other templates tried: plotly_white (white background with
        # gridlines) and ggplot2 (similar to the ggplot2 style in R).
        template="seaborn",
    )

    # Order the bars from the largest total to the smallest.
    fig2.update_layout(xaxis_categoryorder="total descending")

    fig2.show()

SUNBURST CHART - Recovered Cases Hierarchy (Region -> Country)

In [8]:
## ANALYSIS 3: SUNBURST CHART - Recovered Cases Hierarchy (Region -> Country)
# Study note: "sunburst chart" is "diagrama de sol" in Spanish.

# df.empty is True when the DataFrame has no items, so nothing is prepared or
# plotted for an empty frame.
if not df.empty:
    # Data preparation, in order:
    #   1. dropna(subset=["Recovered"]) removes rows whose "Recovered" is NaN
    #      (only that column is checked).
    #   2. copy() makes an independent copy of the remaining rows.
    #   3. The .loc filter keeps only rows with "Recovered" > 0. This was added
    #      as the fix for a ZeroDivisionError seen with zero-valued rows.
    sunburst_df = (
        df.dropna(subset=["Recovered"])
        .copy()
        .loc[lambda frame: frame["Recovered"] > 0]
    )

    # The filtering may have removed every row, so check again before plotting.
    if not sunburst_df.empty:
        # The sunburst shows hierarchical data: WHO Region -> Country.
        fig3 = px.sunburst(
            sunburst_df,
            path=["WHO Region", "Country/Region"],  # hierarchy levels, outermost last
            values="Recovered",                     # sector size from "Recovered"
            color="Recovered",                      # color gradient from the same count
            title="3. Distribution of Recovered Cases: WHO Region to Country",
            color_continuous_scale=px.colors.sequential.Sunset,
        )

        # Margins around the figure: t = top, l = left, r = right, b = bottom.
        fig3.update_layout(margin={"t": 50, "l": 0, "r": 0, "b": 15})

        fig3.show()

CHOROPLETH MAP - Global Mortality Rate

In [9]:
## ANALYSIS 4: CHOROPLETH MAP - Global Mortality Rate
# Choropleth maps visualize geographical data.
# Study note: "choropleth" is "coropleta" or "mapa temático" in Spanish.

# df.empty is True when the DataFrame has no items, so the map is only built
# when there is data to draw.
if not df.empty:
    # Rows that have a value in "Deaths / 100 Cases".
    mortality_df = df.dropna(subset=["Deaths / 100 Cases"])

    fig4 = px.choropleth(
        mortality_df,
        locations="Country/Region",      # column with the country names
        locationmode="country names",    # tells Plotly to match locations by country name
        color="Deaths / 100 Cases",      # color intensity based on the mortality rate
        hover_name="Country/Region",     # country name shown on hover
        color_continuous_scale=px.colors.sequential.Reds,
        title="4. Global Mortality Rate (Deaths per 100 Cases)",
        # Other templates tried: ggplot2 (similar to the ggplot2 style in R)
        # and seaborn (similar to the seaborn style in Python).
        template="plotly_white",
    )

    # Map appearance: draw black coastlines and show the land.
    fig4.update_geos(
        showcoastlines=True,
        coastlinecolor="Black",
        showland=True,
    )

    # Title displayed on the color bar.
    fig4.update_layout(coloraxis_colorbar={"title": "Mortality Rate (%)"})

    fig4.show()

FLIGHTS Project

Week 6

In [12]:
# Creating my own data set: one tuple per person, in the order
# (name, age, score). The last person's age is missing (NaN).
people_rows = [
    ("Leonardo", 25, 80),
    ("Marlly", 30, 90),
    ("Kathe", 35, 85),
    ("Laura", np.nan, 95),
]
my_df = pd.DataFrame(people_rows, columns=["name", "age", "score"])

print(my_df)


       name   age  score
0  Leonardo  25.0     80
1    Marlly  30.0     90
2     Kathe  35.0     85
3     Laura   NaN     95


In [14]:
# Filtering rows where age is greater than 28.
# The comparison yields a True/False mask; rows with a missing age compare as
# False and are therefore left out.
age_over_28 = my_df["age"] > 28
print(my_df[age_over_28])

     name   age  score
1  Marlly  30.0     90
2   Kathe  35.0     85


In [16]:
# Handling missing values in age.
# mean() is computed over the ages of the original frame.
mean_age = my_df["age"].mean()
# Work on a copy so the original DataFrame is not modified.
my_df_filled = my_df.copy()
# Replace the missing ages with the mean.
my_df_filled["age"] = my_df_filled["age"].fillna(mean_age)
print(my_df_filled)


       name   age  score
0  Leonardo  25.0     80
1    Marlly  30.0     90
2     Kathe  35.0     85
3     Laura  30.0     95


In [17]:
# New boolean column "is_high_score": True where the score is at least 90.
scores = my_df_filled["score"]
my_df_filled["is_high_score"] = scores >= 90
print(my_df_filled)


       name   age  score  is_high_score
0  Leonardo  25.0     80          False
1    Marlly  30.0     90           True
2     Kathe  35.0     85          False
3     Laura  30.0     95           True


In [20]:
# Count how many high scores there are: summing a boolean column counts its
# True values.
high_score_flags = my_df_filled["is_high_score"]
num_high_scores = high_score_flags.sum()
print(num_high_scores)


2
