In [1]:
from pathlib import Path

import pandas as pd
import plotly.express as px

In [2]:
# Load the Gapminder extract. The path is relative to the kernel's working
# directory; the recorded run of this cell raised FileNotFoundError, so
# ./data/gapminder_after1952.csv must be present before the notebook can run.
df = pd.read_csv("./data/gapminder_after1952.csv")

FileNotFoundError: [Errno 2] No such file or directory: './data/gapminder_after1952.csv'

In [None]:
df.head(20)

In [None]:
df.shape

In [None]:
df.country.unique().shape

In [None]:
df.year.unique()

In [None]:
df.year.value_counts().reset_index()

In [None]:
df.year.unique().shape

### Let's create a lookup table with one row per country

In [None]:
# Distinct (country, iso_alpha, continent) combinations; merged into the
# projections table further down to label and colour the points.
countries = (
    df.loc[:, ['country', 'iso_alpha', 'continent']]
    .drop_duplicates()
)

In [None]:
countries.head()


### EDA on the dataset is always informative  

In [None]:
fig = px.histogram(df, x="lifeExp", histnorm='percent', 
                  title="Life Expectancy")
fig.show()

In [None]:
fig = px.histogram(df, x="gdpPercap", histnorm='percent', 
                  title="GDP Per Capita")
fig.show()

In [None]:
#are there values equal to zero?? 

In [None]:
#outliers? error values? 

In [None]:
df[df.gdpPercap == 0].shape

In [None]:
df.gdpPercap.min()

### Let's visualize the trajectories of a few countries

Create a temporary dataset that contains only the selected countries

In [None]:
df.country.unique()

In [None]:
tmp = df.query("country in ['Switzerland','Norway', 'United States', 'India', 'China', 'Saudi Arabia']")

In [None]:
tmp

One should improve the chart below by providing better labels and titles

In [None]:
# Trajectory of each selected country through (GDP per capita, life expectancy)
# space; every point is annotated with its year. A title and readable axis /
# legend labels are supplied so the figure stands on its own.
fig = px.line(
    tmp,
    x="gdpPercap",
    y="lifeExp",
    color="country",
    text="year",
    title="Life expectancy vs. GDP per capita for selected countries",
    labels={
        "gdpPercap": "GDP per capita",
        "lifeExp": "Life expectancy",
        "country": "Country",
        "year": "Year",
    },
)

# Draw the year annotations below and to the right of each point.
fig.update_traces(textposition="bottom right")
fig.show()

## Prepare the dataset for HD visualization and cluster Analyses 

Each country should be a row. Columns should be income (GDP per capita) and life expectancy for each year.

In [None]:
df.head()

In [None]:
# Wide life-expectancy table: one row per country, one ('lifeExp', year) column
# per year, values rounded to whole numbers.
lifeExp = (
    df.set_index(['country', 'year'])[['lifeExp']]
    .unstack()
    .round(0)
)
lifeExp

In [None]:
lifeExp.to_clipboard()

In [None]:
# Wide GDP-per-capita table: one row per country, one ('gdpPercap', year) column
# per year, values rounded to whole numbers.
gdpPercap = (
    df.set_index(['country', 'year'])[['gdpPercap']]
    .unstack()
    .round(0)
)
gdpPercap

Create a new dataset that concatenates the life expectancy and GDP per capita tables

In [None]:
gdpPercap.to_clipboard()

In [None]:
df2 = pd.concat([lifeExp,gdpPercap], axis=1) 

In [None]:
df2.head()

In [None]:
df2[(  'lifeExp', 1952)].mean()

In [None]:
df2.to_clipboard()

### The new dataset has variables with different scales. It needs to be standardized or normalized. 

In [None]:
df2.shape 

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every column of df2; the scaler is fitted and applied in one step.
df2_scaled = StandardScaler().fit_transform(df2)

df2_scaled is just a NumPy array. Let's put it in a DataFrame

In [None]:
df2_scaled = pd.DataFrame(df2_scaled, columns=df2.columns, index=df2.index)

In [None]:
df2_scaled.head(10)

In [None]:
df2_scaled.to_clipboard()

In [None]:
df2[df2_scaled.columns[0]]

In [None]:
df2_scaled[df2_scaled.columns[0]]

In [None]:
# Sanity check of the standardisation. StandardScaler divides by the population
# standard deviation (ddof=0) while pandas' .std() defaults to the sample
# estimate (ddof=1); pass ddof=0 so a correctly scaled column reports 1.
df2_scaled[df2_scaled.columns[0]].std(ddof=0)

#### It is a good idea to know how many PCA components you need to describe most of the variability of the dataset but it is not necessary. 

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components = 2).fit(df2_scaled)

In [None]:
pca.explained_variance_ratio_.sum()

### Project the data using MDS

In [None]:
from sklearn.manifold import MDS

In [None]:
# Project the standardised data to 2-D with MDS. The seed is fixed (as it is
# for the clustering models later in the notebook) so the layout, and every
# chart built on it, is identical on each run.
coords_MDS = MDS(n_components=2, random_state=0).fit_transform(df2_scaled)

In [None]:
coords_MDS

Create a dataset that contains all projections

In [None]:
projections = pd.DataFrame(index=df2_scaled.index)

In [None]:
projections['MDS_x'] = coords_MDS[:,0]
projections['MDS_y'] = coords_MDS[:,1]

In [None]:
projections

Add country information to the projections dataset 

In [None]:
# Attach iso_alpha and continent to each projected country.
# validate="one_to_one" makes the merge fail immediately if either side has
# repeated country keys: later cells assign embedding coordinates and cluster
# labels to this frame by position and need exactly one row per country.
projections = projections.merge(
    countries,
    left_index=True,
    right_on="country",
    validate="one_to_one",
)

In [None]:
projections.head()

In [None]:
fig = px.scatter(projections, x="MDS_x", y="MDS_y", hover_name="country", 
                  color="continent",
                 width=800, height=800, 
                 title="MDS projection of the country trajectories")

fig.update_yaxes(
    scaleanchor = "x",
    scaleratio = 1,
  )



fig.show()

### Project the countries using TSNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# t-SNE is stochastic; fix the seed so the embedding is the same on every run.
tsne = TSNE(perplexity=5, random_state=0)

tsne_coords = tsne.fit_transform(df2_scaled)

# The coordinates are plain arrays, so they are assigned by position: this
# assumes projections still has one row per row of df2_scaled, in the same order.
projections['tsne_x'] = tsne_coords[:,0]
projections['tsne_y'] = tsne_coords[:,1]


In [None]:
projections.head()

In [None]:
# Scatter of the t-SNE embedding coloured by continent (title corrected from
# the misspelled "TNSE").
fig = px.scatter(projections, x="tsne_x", y="tsne_y", hover_name="iso_alpha", color="continent",
                 width=800, height=800,
                 title="t-SNE projection of the country trajectories")

# Anchor the y axis to the x axis at a 1:1 ratio so both axes share one scale.
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

fig.show()

In [None]:
df.gdpPercap.min()

In [None]:
df.lifeExp.min()

In [None]:
projections.to_clipboard()

### Select a number of countries from different portions of the MDS chart and visualize their paths

In [None]:
# tmp holds only the rows of df for the countries in the list below.
tmp = df.query("iso_alpha in ['GHA', 'MRT',  'AUS', 'DEU',  'LKA', 'JPN', 'ITA', 'SAU']")

fig = px.line(tmp, x="gdpPercap", y="lifeExp", color="country",
              title="selected country trajectories from different portions of the MDS and TSNE charts")  # text="year")
# textposition only matters when point labels are drawn (see the commented-out
# text="year" argument above).
fig.update_traces(textposition="bottom right")

fig.update_xaxes(
    range=[0, 50000],  # sets the range of the x axis
)
fig.update_yaxes(
    range=[0, 85],  # sets the range of the y axis
)

fig.show()

In [None]:
# tmp holds only the rows of df for the countries in the list below.
tmp = df.query("iso_alpha in ['GHA', 'MRT',  'AUS', 'DEU',  'LKA', 'JPN', 'ITA', 'SAU', 'CAN', 'DNK']")

# Same kind of trajectory chart, but with GDP per capita on a log-scaled x axis.
fig = px.line(tmp, x="gdpPercap", y="lifeExp", color="country", log_x=True,
              title="country trajectories using a logarithmic x axis")  # text="year")
# textposition only matters when point labels are drawn (see the commented-out
# text="year" argument above).
fig.update_traces(textposition="bottom right")

fig.show()

## Try Clustering 

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering

The optimal number of clusters is something to investigate. Here we will just try clustering with some plausible values 

#### Applying the spectral clustering algorithm

In [None]:
# Six-cluster spectral clustering of the standardised data; random_state is
# fixed so the labels are repeatable.
clustering = SpectralClustering(
    n_clusters=6,
    assign_labels='discretize',
    random_state=0,
)
clustering.fit(df2_scaled)
clustering.labels_

The algorithm attaches a cluster label to each country

In [None]:
projections['scluster6'] = clustering.labels_
#projections['scluster4'] = clustering.labels_

In [None]:
projections['scluster6'] =  projections['scluster6'].astype('str')
#projections['scluster4'] =  projections['scluster4'].astype('str')

In [None]:
projections

In [None]:
projections[projections.scluster6 == "0"]

#### Let's try KMeans with seven clusters


In [None]:
# Seven-cluster KMeans on the standardised data, seeded for repeatability.
kmeans = KMeans(n_clusters=7, random_state=0)
# fit() hands back the fitted estimator, which is kept under the name km2.
km2 = kmeans.fit(df2_scaled)
km2.labels_

In [None]:
projections['kmeans7'] = km2.labels_

In [None]:
projections.head()

In [None]:
projections['kmeans7'] =  projections['kmeans7'].astype('str')

In [None]:
# MDS layout coloured by the spectral-clustering label. The title identifies
# which clustering is shown so this figure can be told apart from the next one.
fig = px.scatter(projections, x="MDS_x", y="MDS_y",
                 hover_name="iso_alpha", color="scluster6",
                 width=800, height=800,
                 title="MDS projection coloured by spectral cluster (6 clusters)")

# Anchor the y axis to the x axis at a 1:1 ratio so both axes share one scale.
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

fig.show()

In [None]:
# MDS layout coloured by the KMeans label. The original cell was a verbatim
# copy of the spectral-cluster chart, which left the kmeans7 column computed
# above unused; colour by kmeans7 so the two clusterings can be compared.
fig = px.scatter(projections, x="MDS_x", y="MDS_y",
                 hover_name="iso_alpha", color="kmeans7",
                 width=800, height=800,
                 title="MDS projection coloured by KMeans cluster (7 clusters)")

# Anchor the y axis to the x axis at a 1:1 ratio so both axes share one scale.
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

fig.show()

In [None]:
projections[projections.scluster6 == "4"]

In [None]:
tmp = df.query("iso_alpha in ['MLI', 'BOL', 'COL', 'MNE',  'IRL', 'CAN']")

fig = px.line(tmp, x="gdpPercap", y="lifeExp", color="country",  log_x=True, width=800, height=400)

fig.update_traces(textposition="bottom right")

fig.show()

### Calculate Cluster Centroids 

#### Add cluster information to the initial dataset

In [None]:
df.head(20)

In [None]:
df3 = df.merge(projections[['country', 'scluster6']], on="country")

In [None]:
df3.head(20)

#### Calculate the average life expectancy and GDP per capita per year

In [None]:
# Per-cluster, per-year averages of life expectancy and GDP per capita; the
# group keys are turned back into ordinary columns for plotting.
cluster_centroids = (
    df3.groupby(['scluster6', 'year'])[['lifeExp', 'gdpPercap']]
    .mean()
    .reset_index()
)

In [None]:
cluster_centroids

In [None]:
# One line per spectral cluster, tracing its yearly average position in
# (GDP per capita, life expectancy) space on a log-scaled x axis.
fig = px.line(cluster_centroids, x="gdpPercap", y="lifeExp", color="scluster6", log_x=True,
              title="Cluster Trajectories. Each line represents the average GDP per capita and life expectancy by year",
              width=800, height=600)

# textposition only matters when point labels are drawn; none are set here.
fig.update_traces(textposition="bottom right")

fig.show()

In [None]:
projections

In [None]:
# Persist the projections and cluster labels. Create the target directory first
# so the export does not fail when model/ does not exist yet.
output_path = Path("model/gapminder_after1952_projections.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
projections.to_csv(output_path)