# Data analysis using pandas

In [None]:
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the four source tables into DataFrames.
# The paths are relative, so the CSV files must be in the working directory.
continents = pd.read_csv("continents-according-to-our-world-in-data.csv")
gdp = pd.read_csv("gdp-per-capita-maddison-project-database.csv")
life_expectancy = pd.read_csv("life-expectancy.csv")
population = pd.read_csv("population-with-un-projections.csv")

In [None]:
# Preview the first rows to see which columns the continents table has.
continents.head()

In [None]:
# Preview the first rows of the GDP table.
gdp.head()

## Let's start with GDP
We can take a look at a column:

In [None]:
# Indexing with a single column name returns that column as a Series.
gdp["Year"]

## Check which values in the column are equal to 2022.

In [None]:
# Element-wise comparison: the result is a boolean Series, True where Year is 2022.
gdp["Year"] == 2022

## Select only the corresponding rows.

In [None]:
# Indexing with the boolean Series keeps only the rows where it is True.
gdp[gdp["Year"] == 2022]

In [None]:
# From here on, work only with the 2022 rows; then preview the result.
gdp = gdp.loc[gdp["Year"] == 2022]
gdp.head(5)

## What are those OWID_WRL codes above?
Let's look for rows whose "Code" is longer than 3 characters.

In [None]:
# Show the rows whose code has more than three characters (e.g. OWID_WRL).
gdp[gdp["Code"].str.len() > 3]

## Let's remove those

In [None]:
# Keep only rows whose code is exactly three characters long.
# Rows with a missing code do not compare equal to 3, so they are dropped as well.
gdp = gdp.loc[gdp["Code"].str.len() == 3]
gdp.head(5)

## The default index isn't very useful, and we want to combine this with other dataframes later, so let's use the Code column as the index

In [None]:
# Use the code as the row label instead of the default integer index.
gdp = gdp.set_index("Code")
gdp.head()

## Let's get rid of the last column (we'll see a different way of doing this later).

In [None]:
# Select the columns to keep by name; any other column is left out.
gdp = gdp.loc[:, ["Entity", "Year", "GDP per capita"]]
gdp.head(5)

## Moving on to the continents

In [None]:
# Same filter as for the GDP table: keep only rows with a three-character code.
continents = continents.loc[continents["Code"].str.len() == 3]
continents.head(5)

## The name of the last column is sooooo long! Let's rename it!

In [None]:
# Give the region column a short name; columns not in the mapping are untouched.
continents = continents.rename(
    columns={"World regions according to OWID": "Region"}
)
continents.head(5)

## Set the index to the Code column

In [None]:
# Index by code, matching the GDP table, so the two can be merged on "Code".
continents = continents.set_index("Code")
continents.head()

## Now, let's combine them with pd.merge!

In [None]:
# Join the two tables on their shared "Code" index.
# merge defaults to an inner join, so only codes present in both tables remain.
df = continents.merge(gdp, on="Code")
df.head(5)

## Oops, we don't need the year at all, and we only need to keep the Entity column from the continents dataframe

In [None]:
# Remove the columns that would be duplicated by the merge, then merge again.
# "Entity" is kept only in the continents table.
continents = continents.drop(columns="Year")
gdp = gdp.drop(columns=["Year", "Entity"])
df = continents.merge(gdp, on="Code")
df.head(5)

# EXERCISE
Do the same for life expectancy and population! Rename their value columns to "Life expectancy" and "Population".

## Let's merge these with the others!

In [None]:
# Add the life expectancy and population tables to the combined frame,
# matching rows on "Code" each time.
df = df.merge(life_expectancy, on="Code")
df = df.merge(population, on="Code")

In [None]:
# Check the combined table after all merges.
df.head()

In [None]:
# A set removes duplicate region names; the list conversion allows sorting and indexing.
regions = list(set(df["Region"]))

In [None]:
# Sort in place so the order is the same on every run (set order is arbitrary).
regions.sort()
print(regions)

## Pick some colors

In [None]:
# One color per region; colors are matched to the sorted region names by position.
colors = ["red", "green", "yellow", "blue", "violet", "pink"]

In [None]:
# Draw one scatter series per region so each region gets its own color and
# legend entry. Pairing names and colors with zip removes the hard-coded
# region count: the loop stops at the shorter list instead of raising
# IndexError when there are fewer than six regions.
for region, color in zip(regions, colors):
    print(f"plotting {region} in {color}")
    region_frame = df[df["Region"] == region]
    # Rows are ordered by ascending population before plotting.
    region_frame = region_frame.sort_values("Population")

    plt.scatter(
        region_frame["GDP per capita"],
        region_frame["Life expectancy"],
        # Marker size is the population expressed in millions.
        s=region_frame["Population"] / 1_000_000,
        alpha=0.5,
        c=color,
        edgecolors="black",
        label=region,
    )

lgnd = plt.legend()
# Give every legend marker the same fixed size, from
# https://stackoverflow.com/questions/24706125/setting-a-fixed-size-for-points-in-legend
for handle in lgnd.legend_handles:
    handle.set_sizes([50.0])

plt.title("Income and life expectancy in 2022")

# The x axis: logarithmic scale with hand-picked, human-readable tick labels.
plt.xlabel("GDP per capita [USD]")
plt.xscale("log")
tick_values = [1000, 10000, 100000]
tick_labels = ["1k", "10k", "100k"]
plt.xticks(tick_values, tick_labels)

# The y axis.
plt.ylabel("Life expectancy")

# Save before show(): the file is written while the figure is still current.
plt.savefig("life_expectancy_2022.png")
plt.show()

In [None]:
# Show only the rows with a life expectancy below 20.
# The comparison is made on the value column: comparing the whole DataFrame
# would keep every row and merely replace non-matching cells with NaN.
life_expectancy[life_expectancy["Life expectancy"] < 20]

In [None]:
# Look up the single row labelled "CAF" in the combined table.
df.loc["CAF"]

In [None]:
# The CAF value looks like an error (according to Google, it was about
# 54 years), so that row is removed from the combined table.
df = df.drop(index="CAF")