<a href="https://colab.research.google.com/github/jaya-shankar/education-impact/blob/master/All_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cloning Repo & installing libs

In [1]:
!rm -rf education-impact

In [2]:
!git clone https://github.com/jaya-shankar/education-impact.git


Cloning into 'education-impact'...
remote: Enumerating objects: 422, done.[K
remote: Counting objects: 100% (422/422), done.[K
remote: Compressing objects: 100% (361/361), done.[K
remote: Total 422 (delta 213), reused 192 (delta 58), pack-reused 0[K
Receiving objects: 100% (422/422), 6.29 MiB | 17.88 MiB/s, done.
Resolving deltas: 100% (213/213), done.


In [3]:
!pip install tensorflow_decision_forests
!pip install wurlitzer
!pip install seaborn

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-0.2.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (17.7 MB)
[K     |████████████████████████████████| 17.7 MB 728 kB/s 
[?25hCollecting wurlitzer
  Downloading wurlitzer-3.0.2-py3-none-any.whl (7.3 kB)
Installing collected packages: wurlitzer, tensorflow-decision-forests
Successfully installed tensorflow-decision-forests-0.2.2 wurlitzer-3.0.2


In [4]:
import pandas as pd
import os
import numpy as np
import math
import seaborn as sns
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from wurlitzer import sys_pipes



# Setting paths

In [17]:
root = "education-impact/datasets/" 
datasets_path = {
                    "infant_mortality"                :  root+ "Infant_Mortality_Rate.csv",
                    "child_mortality"                 :  root+ "child_mortality_0_5_year_olds_dying_per_1000_born.csv",
                    "children_per_woman"              :  root+ "children_per_woman_total_fertility.csv",
                    "co2_emissions_percapita"         :  root+ "co2_emissions_tonnes_per_person.csv",
                    "population"                      :  root+ "converted_pop.csv",
                    "population_density"              :  root+ "population_per_area.csv",
                    "gdp_growth"                      :  root+ "gdp_per_capita_yearly_growth.csv",
                    "Avg_daily_income_ppp"            :  root+ "mincpcap_cppp.csv",
                    "gdppercapita_us_infla_adjust"    :  root+ "gdppercapita_us_inflation_adjusted.csv",
                    "gini_index"                      :  root+ "gini.csv",
                    "life_expectancy"                 :  root+ "life_expectancy_years.csv",
                    "poverty_index"                   :  root+ "mincpcap_cppp.csv",
                    "people_in_poverty"               :  root+ "number_of_people_in_poverty.csv",
                    "ratio_b/g_in_primary"            :  root+ "ratio_of_girls_to_boys_in_primary_and_secondary_education_perc.csv",
                    "wcde-25--34"                     :  root+ "wcde-25--34.csv",
                    "20-24-In_Primary_OL"             :  root+ "In_Primary_OL.csv",
                    "20-24-Primary_OL"                :  root+ "Primary_OL.csv",
                    "20-24-Lower_Secondary_OL"        :  root+ "Lower_Secondary_OL.csv",
                    "20-24_female-In_Primary_OL"      :  root+ "female_In_Primary_OL.csv",
                    "20-24_female-Primary_OL"         :  root+ "female_Primary_OL.csv",
                    "20-24_female-Lower_Secondary_OL" :  root+ "female_Lower_Secondary_OL.csv",
                    "20-24-In_Primary_OL_comp"        :  root+ "In_Primary_OL_complete.csv",
                    "20-24-Primary_OL_comp"           :  root+ "Primary_OL_complete.csv",
                    "20-24-Lower_Secondary_OL_comp"   :  root+ "Lower_Secondary_OL_complete.csv",
                }

created_datasets = []

# Defining common functions

In [5]:
def get_countries_count(datasets):
  for dataset in datasets:
    df = pd.read_csv(datasets_path[dataset])
    count = len(set(df.Country.unique()))
    print(f"{'Factor: ' + dataset:<40} count: {count}")


In [6]:
def find_common_countries(datasets):
  common_countries = set()
  for dataset in datasets:
    countries_list = list(pd.read_csv(datasets_path[dataset]).Country)
    countries_list = set(map(lambda x: x.lower(), countries_list))
    if common_countries == set():
      common_countries = countries_list
    else:
      common_countries = common_countries.intersection(countries_list)
  return list(common_countries)

In [7]:
def generate_indices(countries,years):
  keys=[]
  for y in years:
    for c in countries:
      keys.append((c,str(y)))
  return keys

In [8]:
def load_datasets_to_pd(datasets,keys,include_output=True):
  combined_df = pd.DataFrame(keys,columns=['country','year'])
  for dataset in datasets:
    combined_df = add_dataset(combined_df,dataset)
    

  if include_output:
    combined_df = add_dataset(combined_df,dataset,output = True)
  return combined_df

  

In [9]:
def add_dataset(input_df,dataset,output = False):
  label = dataset
  if output : 
    label = "o_"+OUTPUT
  input_df[label] = [math.nan]*len(input_df)
  df = pd.read_csv(datasets_path[dataset])
  df["Country"] = df["Country"].str.lower()
  df.set_index("Country", inplace=True)
  for e in range(len(input_df)):
    country = input_df.iloc[e].country
    year    = input_df.iloc[e].year
    if output:
      input_df.at[e,label] = df.loc[country][str( int(year) + PREDICT_FUTURE )]
    else:
      input_df.at[e,label] = df.loc[country][year]
  return input_df

In [177]:
def create_n_yrs_old_csv(dataset,n):
    table_name = str(n)+"_yrs_old_"+dataset
    # if table_name in created_datasets:
    #   return table_name
    df = pd.read_csv(datasets_path[dataset])
    years = list(range(1960, 2016))
    countries = list(df['Country'])
    new_df = pd.DataFrame(countries,columns=['Country'])
    for i in range(1960,2016):
      new_df[i] = np.nan
    new_df.set_index('Country',inplace=True)
    df.set_index('Country',inplace=True)
    for c in countries:
      for y in range(1960+n,2016):
        new_df[y].at[c] = df.loc[c][str(y-n)]
    
    
    datasets_path[table_name] = root+table_name+".csv"
    new_df.to_csv(root+table_name+".csv",encoding='utf-8', index=True)

    created_datasets.append(table_name)
    return table_name

In [19]:
def create_n_dropout_csv(dataset,n,s_n = 90):
    df = pd.read_csv(datasets_path[dataset])
    years = list(range(1875, 2016))
    countries = list(df['Country'])
    drop_out_years = []
    for i in range(len(df)):
      started = False
      s_year  = 1875
      for year in years:
        if not started and df.iloc[i][str(year)] < s_n :
          started = True
          s_year  = year
        elif started and df.iloc[i][str(year)] < n :
          drop_out_years.append((df.iloc[i]['Country'],(year-s_year)))
          break
      else:
        if started:
          drop_out_years.append((df.iloc[i]['Country'], 100))
        else:
          drop_out_years.append((df.iloc[i]['Country'], year-s_year))
    new_df = pd.DataFrame(countries,columns=['Country'])
    for i in years:
      new_df[i] = np.nan
    new_df.set_index('Country',inplace=True)
    for c,y in drop_out_years:
      for i in years:
        new_df.at[c,i] = y
    table_name = str(n)+"%_dropout_"+dataset
    datasets_path[table_name] = root+table_name+".csv"
    new_df = new_df[(new_df.T != 1).any()]
    new_df.to_csv(root+table_name+".csv",encoding='utf-8', index=True)

    created_datasets.append(table_name)
    return table_name

In [12]:
def combine_dfs(X,y):
  label = y.columns[0]
  X[label] = y
  X.dropna(subset=[label],inplace=True)
  return X

In [13]:
def extract_variable_imp(variable):
  v_list = inspector.variable_importances()[variable]
  v = variable + "\n"

  for i in range(1,len(v_list)):
    v += str(i)+ "  "
    v += f"{v_list[i][0][0]:<50}" 
    v += str(v_list[i][1]) + "\n"
  return v

# Plotting Data

In [None]:
datasets_to_plot = [
            "infant_mortality",
            "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            "20-24-Lower_Secondary_OL",
            "population",
            "population_density",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL" ,
            "20-24_female-Lower_Secondary_OL",
            "life_expectancy"
            ]

# creating a list of all countries & years
countries   = find_common_countries(datasets_to_plot)
years       = [y for y in range(1960,2016)]
keys        = generate_indices(countries, years)

combined_df = load_datasets_to_pd(datasets_to_plot,keys,include_output=False)

## Life Expectancy

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = 'life_expectancy'
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

## Total Fertility Rate

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = 'children_per_woman
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

## Primary education OL

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = '20-24-Primary_OL'
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

## GDP per capita

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = 'gdppercapita_us_infla_adjust'
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

# Building Model

## Life Expectancy

### Preparing the Data

In [14]:
PREDICT_FUTURE  = 0
OUTPUT         = 'life_expectancy'

In [30]:
datasets = [
            # "infant_mortality",
            # "life_expectancy",
            # "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            create_n_yrs_old_csv("gdppercapita_us_infla_adjust",n = 20),
            create_n_dropout_csv("20-24-Primary_OL_comp",50,95),
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL",
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

Factor: children_per_woman               count: 202
Factor: co2_emissions_percapita          count: 194
Factor: gini_index                       count: 195
Factor: gdppercapita_us_infla_adjust     count: 207
Factor: 20_yrs_old_gdppercapita_us_infla_adjust count: 207
Factor: 50%_dropout_20-24-Primary_OL_comp count: 180
Factor: 20-24-In_Primary_OL              count: 202
Factor: 20-24-Primary_OL                 count: 202
Factor: population                       count: 197
Factor: 20-24_female-In_Primary_OL       count: 202
Factor: 20-24_female-Primary_OL          count: 202


In [31]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [32]:
input_df            = load_datasets_to_pd(datasets,keys)
input_df.set_index(["country","year"], inplace=True)

In [33]:
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [34]:
input_df

Unnamed: 0_level_0,Unnamed: 1_level_0,children_per_woman,co2_emissions_percapita,gini_index,gdppercapita_us_infla_adjust,20_yrs_old_gdppercapita_us_infla_adjust,50%_dropout_20-24-Primary_OL_comp,20-24-In_Primary_OL,20-24-Primary_OL,population,20-24_female-In_Primary_OL,20-24_female-Primary_OL
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
mongolia,1960,6.95,1.350,32.0,,,29.0,14.0,51.7,956000.0,17.1,60.8
greece,1960,2.33,1.140,46.8,5030.0,,83.0,18.8,68.3,8270000.0,25.4,74.5
romania,1960,2.34,2.870,19.3,,,50.0,3.4,36.4,18600000.0,4.6,44.3
sri lanka,1960,5.54,0.229,36.4,,,94.0,44.9,62.4,9870000.0,49.0,65.5
thailand,1960,6.15,0.136,41.5,581.0,,40.0,16.1,89.2,27400000.0,18.8,93.2
...,...,...,...,...,...,...,...,...,...,...,...,...
morocco,2015,2.53,1.760,39.6,3220.0,1730.0,49.0,29.9,46.2,34700000.0,34.5,48.2
myanmar,2015,2.23,0.421,38.1,1140.0,208.0,68.0,27.1,46.1,52700000.0,29.0,48.4
oman,2015,2.74,15.100,40.0,16000.0,15400.0,35.0,7.3,25.5,4270000.0,7.8,23.5
belarus,2015,1.69,6.290,26.9,5950.0,1890.0,30.0,0.2,0.6,9440000.0,0.1,0.4


In [None]:
output_df

In [35]:
input_df.isna().sum()

children_per_woman                            0
co2_emissions_percapita                     143
gini_index                                    0
gdppercapita_us_infla_adjust               1706
20_yrs_old_gdppercapita_us_infla_adjust    4552
50%_dropout_20-24-Primary_OL_comp             0
20-24-In_Primary_OL                           0
20-24-Primary_OL                              0
population                                    0
20-24_female-In_Primary_OL                    0
20-24_female-Primary_OL                       0
dtype: int64

In [None]:
input_df.shape

In [36]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

### Random Forest Model

In [37]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmpzexdudqb as temporary training directory
Starting reading the dataset
1/6 [====>.........................] - ETA: 1s
Dataset read in 0:00:00.250049
Training model
Model trained in 0:00:05.368916
Compiling model
LIFE_EXPECTANCY
{'loss': 0.0, 'mse': 0.03206025809049606}

MSE: 0.03206025809049606
RMSE: 0.17905378546821082



In [28]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [38]:
# %set_cell_height 300
model.summary()

Model: "random_forest_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (11):
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	50__dropout_20-24-Primary_OL_comp
	children_per_woman
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL" 10.795130 ################
    2. "20_yrs_old_gdppercapita_us_infla_adjust" 10.305558 ###############
    3.       "50__dropout_20-24-Primary_OL_comp"  9.958144 ##############
    4.                              "population"  9.935802 ##############
    5.           

### Gradient Tree

In [39]:
# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmp6jh4zmcv as temporary training directory
Starting reading the dataset
1/6 [====>.........................] - ETA: 1s
Dataset read in 0:00:00.246450
Training model
Model trained in 0:00:02.530178
Compiling model
LIFE_EXPECTANCY
{'loss': 0.0, 'mse': 0.001707651885226369}

MSE: 0.001707651885226369
RMSE: 0.041323744811262796



In [73]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [40]:
model.summary()

Model: "gradient_boosted_trees_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: REGRESSION
Label: "__LABEL"

Input Features (11):
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	50__dropout_20-24-Primary_OL_comp
	children_per_woman
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL"  4.872849 ################
    2.       "50__dropout_20-24-Primary_OL_comp"  4.788965 ###############
    3.            "gdppercapita_us_infla_adjust"  4.737890 ###############
    4. "20_yrs_old_gdppercapita_us_infla_adjust"  4.724926 ###############

## Total Fertility Rate

In [49]:
PREDICT_FUTURE  = 0
OUTPUT         = 'children_per_woman'

In [52]:
datasets = [
            # "infant_mortality",
            "life_expectancy",
            # "child_mortality",
            # "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            create_n_yrs_old_csv("gdppercapita_us_infla_adjust",n = 20),
            create_n_yrs_old_csv("children_per_woman",n = 20),
            create_n_dropout_csv("20-24-Primary_OL_comp",50,95),
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL",
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

Factor: life_expectancy                  count: 195
Factor: co2_emissions_percapita          count: 194
Factor: gini_index                       count: 195
Factor: gdppercapita_us_infla_adjust     count: 207
Factor: 20_yrs_old_gdppercapita_us_infla_adjust count: 207
Factor: 20_yrs_old_children_per_woman    count: 202
Factor: 50%_dropout_20-24-Primary_OL_comp count: 180
Factor: 20-24-In_Primary_OL              count: 202
Factor: 20-24-Primary_OL                 count: 202
Factor: population                       count: 197
Factor: 20-24_female-In_Primary_OL       count: 202
Factor: 20-24_female-Primary_OL          count: 202


In [80]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [81]:
input_df            = load_datasets_to_pd(datasets,keys)
# input_df            = add_n_yrs_old_dataset(input_df,"gdppercapita_us_infla_adjust",20)
input_df.set_index(["country","year"], inplace=True)
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [82]:
input_df

Unnamed: 0_level_0,Unnamed: 1_level_0,life_expectancy,co2_emissions_percapita,gini_index,gdppercapita_us_infla_adjust,20_yrs_old_gdppercapita_us_infla_adjust,20_yrs_old_children_per_woman,50%_dropout_20-24-Primary_OL_comp,20-24-In_Primary_OL,20-24-Primary_OL,population,20-24_female-In_Primary_OL,20-24_female-Primary_OL
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
greece,1980,75.3,5.290,37.5,15300.0,5030.0,2.33,83.0,2.0,28.4,9630000.0,2.0,30.9
thailand,1980,66.5,0.844,44.8,1430.0,581.0,6.15,40.0,4.3,67.9,47400000.0,4.9,72.2
senegal,1980,52.9,0.598,54.6,1060.0,1200.0,7.00,100.0,76.4,88.5,5580000.0,82.5,92.9
costa rica,1980,74.7,1.030,46.9,6210.0,3600.0,6.45,75.0,16.1,55.7,2390000.0,16.2,55.1
zimbabwe,1980,60.4,1.300,38.5,1490.0,1160.0,7.16,43.0,31.2,49.6,7410000.0,39.1,58.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
morocco,2015,72.5,1.760,39.6,3220.0,1730.0,3.30,49.0,29.9,46.2,34700000.0,34.5,48.2
myanmar,2015,67.7,0.421,38.1,1140.0,208.0,3.04,68.0,27.1,46.1,52700000.0,29.0,48.4
oman,2015,72.4,15.100,40.0,16000.0,15400.0,5.35,35.0,7.3,25.5,4270000.0,7.8,23.5
belarus,2015,73.7,6.290,26.9,5950.0,1890.0,1.47,30.0,0.2,0.6,9440000.0,0.1,0.4


In [83]:
input_df.isna().sum()

life_expectancy                            0
co2_emissions_percapita                    0
gini_index                                 0
gdppercapita_us_infla_adjust               0
20_yrs_old_gdppercapita_us_infla_adjust    0
20_yrs_old_children_per_woman              0
50%_dropout_20-24-Primary_OL_comp          0
20-24-In_Primary_OL                        0
20-24-Primary_OL                           0
population                                 0
20-24_female-In_Primary_OL                 0
20-24_female-Primary_OL                    0
dtype: int64

In [84]:
input_df.shape

(3558, 12)

In [85]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

### Random Forest

In [86]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmp8e1b_w4f as temporary training directory
Starting reading the dataset
Dataset read in 0:00:00.228079
Training model
Model trained in 0:00:02.122216
Compiling model




CHILDREN_PER_WOMAN
{'loss': 0.0, 'mse': 0.06606526672840118}

MSE: 0.06606526672840118
RMSE: 0.2570316453832119



In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [87]:
# %set_cell_height 300

model.summary()

Model: "random_forest_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (12):
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_children_per_woman
	20_yrs_old_gdppercapita_us_infla_adjust
	50__dropout_20-24-Primary_OL_comp
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	life_expectancy
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL"  9.403145 ################
    2.       "50__dropout_20-24-Primary_OL_comp"  8.705904 ##############
    3.                              "population"  8.702443 ##############
    4. "20_yrs_old_gdppercapita_us_infla_adjust"  8.670487 #####

### Gradient Tree

In [None]:
# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [None]:
model.summary()

## Primary education OL

In [174]:
PREDICT_FUTURE = 0
OUTPUT         = '20-24-Primary_OL'


In [194]:
datasets = [
            "infant_mortality",
            "life_expectancy",
            "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            create_n_yrs_old_csv("gdppercapita_us_infla_adjust",n = 20),
            "population",
            # "20-24_female-In_Primary_OL",
            # "20-24_female-Primary_OL",
            # "20-24_female-Lower_Secondary_OL",
            
            ]
get_countries_count(datasets)

Factor: infant_mortality                 count: 266
Factor: life_expectancy                  count: 195
Factor: child_mortality                  count: 197
Factor: children_per_woman               count: 202
Factor: co2_emissions_percapita          count: 194
Factor: gini_index                       count: 195
Factor: gdppercapita_us_infla_adjust     count: 207
Factor: 20_yrs_old_gdppercapita_us_infla_adjust count: 207
Factor: population                       count: 197


In [195]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [196]:
input_df            = load_datasets_to_pd(datasets,keys)
# input_df.dropna(subset=["gdppercapita_us_infla_adjust"],inplace=True)
input_df.set_index(["country","year"], inplace=True)

In [197]:
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [198]:
input_df

Unnamed: 0_level_0,Unnamed: 1_level_0,infant_mortality,life_expectancy,child_mortality,children_per_woman,co2_emissions_percapita,gini_index,gdppercapita_us_infla_adjust,20_yrs_old_gdppercapita_us_infla_adjust,population
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
mongolia,1960,,43.9,228.00,6.95,1.350,32.0,,,956000.0
greece,1960,39.5,72.4,46.30,2.33,1.140,46.8,5030.0,,8270000.0
romania,1960,73.3,65.8,95.20,2.34,2.870,19.3,,,18600000.0
sri lanka,1960,68.1,64.0,97.80,5.54,0.229,36.4,,,9870000.0
thailand,1960,101.3,60.9,146.00,6.15,0.136,41.5,581.0,,27400000.0
...,...,...,...,...,...,...,...,...,...,...
belarus,2015,3.0,73.7,4.03,1.69,6.290,26.9,5950.0,1890.0,9440000.0
netherlands,2015,3.4,81.7,4.03,1.75,9.830,28.3,45200.0,33700.0,16900000.0
grenada,2015,14.2,73.1,15.10,2.13,2.380,40.0,9100.0,5210.0,110000.0
suriname,2015,18.0,72.5,20.90,2.40,3.060,61.0,9170.0,6010.0,559000.0


In [None]:
input_df.isna().sum()

In [None]:
input_df.shape

In [199]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

In [200]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmp96irv_7w as temporary training directory
Starting reading the dataset
1/7 [===>..........................] - ETA: 1s
Dataset read in 0:00:00.234163
Training model
Model trained in 0:00:05.974334
Compiling model
20-24-PRIMARY_OL
{'loss': 0.0, 'mse': 14514752847872.0}

MSE: 14514752847872.0
RMSE: 3809823.2042802195



In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [201]:
# %set_cell_height 300

model.summary()

Model: "random_forest_model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (9):
	20_yrs_old_gdppercapita_us_infla_adjust
	child_mortality
	children_per_woman
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	infant_mortality
	life_expectancy
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL" 12.654467 ################
    2.                         "child_mortality" 10.775153 #############
    3.                        "infant_mortality" 10.485424 ############
    4.                         "life_expectancy" 10.031618 ############
    5.                 "co2_emissions_percapita"  8.635455 ##########
    6.                      

## GDP per capita

In [88]:
PREDICT_FUTURE  = 0
OUTPUT         = 'gdppercapita_us_infla_adjust'


In [143]:
datasets = [
            "infant_mortality",
            "life_expectancy",
            "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            create_n_yrs_old_csv("gdppercapita_us_infla_adjust",n = 20),
            create_n_dropout_csv("20-24-Primary_OL_comp",50,95),
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL",
            create_n_yrs_old_csv("20-24-Primary_OL",n = 10),
            create_n_yrs_old_csv("20-24-Primary_OL",n = 30),
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

Factor: infant_mortality                 count: 266
Factor: life_expectancy                  count: 195
Factor: child_mortality                  count: 197
Factor: children_per_woman               count: 202
Factor: co2_emissions_percapita          count: 194
Factor: gini_index                       count: 195
Factor: 20_yrs_old_gdppercapita_us_infla_adjust count: 207
Factor: 50%_dropout_20-24-Primary_OL_comp count: 180
Factor: 20-24-In_Primary_OL              count: 202
Factor: 20-24-Primary_OL                 count: 202
Factor: population                       count: 197
Factor: 20-24_female-In_Primary_OL       count: 202
Factor: 20-24_female-Primary_OL          count: 202
Factor: 10_yrs_old_20-24-Primary_OL      count: 202
Factor: 30_yrs_old_20-24-Primary_OL      count: 202


In [144]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [145]:
input_df            = load_datasets_to_pd(datasets,keys)
input_df.set_index(["country","year"], inplace=True)
# input_df['o_gdppercapita_us_infla_adjust'] = np.log(input_df['o_gdppercapita_us_infla_adjust'])
# input_df['20_yrs_old_gdppercapita_us_infla_adjust'] = np.log(input_df['20_yrs_old_gdppercapita_us_infla_adjust'])

In [146]:
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [147]:
print(input_df.shape)
input_df.isna().sum()

(7840, 15)


infant_mortality                            681
life_expectancy                               0
child_mortality                               0
children_per_woman                            0
co2_emissions_percapita                     143
gini_index                                    0
20_yrs_old_gdppercapita_us_infla_adjust    4396
50%_dropout_20-24-Primary_OL_comp             0
20-24-In_Primary_OL                           0
20-24-Primary_OL                              0
population                                    0
20-24_female-In_Primary_OL                    0
20-24_female-Primary_OL                       0
10_yrs_old_20-24-Primary_OL                1400
30_yrs_old_20-24-Primary_OL                4200
dtype: int64

In [97]:
input_df.describe()

Unnamed: 0,infant_mortality,life_expectancy,child_mortality,children_per_woman,co2_emissions_percapita,gini_index,20_yrs_old_gdppercapita_us_infla_adjust,50%_dropout_20-24-Primary_OL_comp,20-24-In_Primary_OL,20-24-Primary_OL,population,20-24_female-In_Primary_OL,20-24_female-Primary_OL
count,7159.0,7840.0,7840.0,7840.0,7697.0,7840.0,3444.0,7840.0,7840.0,7840.0,7840.0,7840.0,7840.0
mean,56.293463,64.354694,91.103636,4.233316,4.367401,39.993597,7.834533,64.228571,32.061505,47.110485,29596890.0,35.416569,49.490179
std,47.84211,10.419046,86.000612,2.053817,7.727844,10.065395,1.396819,29.141753,31.418196,33.431005,121332100.0,34.776651,35.628778
min,1.8,9.5,2.18,1.14,0.0,16.0,4.820282,9.0,0.0,0.0,41200.0,0.0,0.0
25%,16.0,56.3,20.7,2.2575,0.309,32.4,6.72022,40.0,1.8,11.89,2040000.0,1.8,11.36
50%,42.9,66.5,60.6,4.025,1.52,39.8,7.811973,62.5,23.1,49.21,6035000.0,23.4,51.62
75%,87.4,72.4,142.0,6.18,5.94,46.1,8.731094,100.0,59.545,78.5,15700000.0,68.7,85.08
max,232.0,84.3,423.0,8.46,101.0,77.0,11.626254,128.0,98.9,99.9,1410000000.0,100.0,100.0


In [127]:
input_df

Unnamed: 0_level_0,Unnamed: 1_level_0,infant_mortality,life_expectancy,child_mortality,children_per_woman,co2_emissions_percapita,gini_index,20_yrs_old_gdppercapita_us_infla_adjust,50%_dropout_20-24-Primary_OL_comp,20-24-In_Primary_OL,20-24-Primary_OL,population,20-24_female-In_Primary_OL,20-24_female-Primary_OL,20_yrs_old_20-24-Primary_OL
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
mongolia,1960,,43.9,228.00,6.95,1.350,32.0,,29.0,14.0,51.7,956000.0,17.1,60.8,
greece,1960,39.5,72.4,46.30,2.33,1.140,46.8,,83.0,18.8,68.3,8270000.0,25.4,74.5,
romania,1960,73.3,65.8,95.20,2.34,2.870,19.3,,50.0,3.4,36.4,18600000.0,4.6,44.3,
sri lanka,1960,68.1,64.0,97.80,5.54,0.229,36.4,,94.0,44.9,62.4,9870000.0,49.0,65.5,
thailand,1960,101.3,60.9,146.00,6.15,0.136,41.5,,40.0,16.1,89.2,27400000.0,18.8,93.2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
morocco,2015,21.7,72.5,25.50,2.53,1.760,39.6,1730.0,49.0,29.9,46.2,34700000.0,34.5,48.2,72.7
myanmar,2015,40.7,67.7,51.60,2.23,0.421,38.1,208.0,68.0,27.1,46.1,52700000.0,29.0,48.4,62.1
oman,2015,9.6,72.4,11.20,2.74,15.100,40.0,15400.0,35.0,7.3,25.5,4270000.0,7.8,23.5,53.8
belarus,2015,3.0,73.7,4.03,1.69,6.290,26.9,1890.0,30.0,0.2,0.6,9440000.0,0.1,0.4,0.6


In [151]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

In [None]:
X_train.isna().sum()

In [None]:
y_train.isna().sum()

In [152]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmpomvpgn0r as temporary training directory
Starting reading the dataset
Dataset read in 0:00:00.322430
Training model
Model trained in 0:00:02.556905
Compiling model
GDPPERCAPITA_US_INFLA_ADJUST
{'loss': 0.0, 'mse': 0.17563371360301971}

MSE: 0.17563371360301971
RMSE: 0.41908676142658063



In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [153]:
# %set_cell_height 300

model.summary()


Model: "random_forest_model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (15):
	10_yrs_old_20-24-Primary_OL
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	30_yrs_old_20-24-Primary_OL
	50__dropout_20-24-Primary_OL_comp
	child_mortality
	children_per_woman
	co2_emissions_percapita
	gini_index
	infant_mortality
	life_expectancy
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL"  9.774058 ################
    2.                 "co2_emissions_percapita"  9.029549 ##############
    3.                      "children_per_woman"  8.924762 ##############
    4. "20_y

### Gradient Tree

In [162]:
# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmp9clzskjs as temporary training directory
Starting reading the dataset
Dataset read in 0:00:00.290885
Training model
Model trained in 0:00:01.571320
Compiling model
GDPPERCAPITA_US_INFLA_ADJUST
{'loss': 0.0, 'mse': 0.0015414361841976643}

MSE: 0.0015414361841976643
RMSE: 0.039261128157474845



In [163]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [164]:
model.summary()

Model: "gradient_boosted_trees_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: REGRESSION
Label: "__LABEL"

Input Features (15):
	10_yrs_old_20-24-Primary_OL
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	30_yrs_old_20-24-Primary_OL
	50__dropout_20-24-Primary_OL_comp
	child_mortality
	children_per_woman
	co2_emissions_percapita
	gini_index
	infant_mortality
	life_expectancy
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL"  4.938178 ################
    2.       "50__dropout_20-24-Primary_OL_comp"  4.894394 ###############
    3.                        "infant_mortality"  4.892383 ########

In [165]:
for f in created_datasets:
  if os.path.isfile(datasets_path[f]): # this makes the code more robust
        os.remove(datasets_path[f])

# Save the summary

In [202]:
#@markdown What changes you made to datasets & why ?


approach = ''  #@param {type: "string"}


In [203]:
inspector = model.make_inspector()

In [204]:
model_name = inspector.model_type()
num_trees  = inspector.num_trees()
objective  = inspector.objective()
eval = inspector.evaluation()

In [205]:

input_features_list = inspector.features()
input_features = ""
for i in range(len(input_features_list)):
  input_features += str(i+1)+ "  "
  input_features += input_features_list[i][0] + "\n"

In [206]:
variable_imp = []
variable_imp.append(extract_variable_imp(variable = 'MEAN_MIN_DEPTH'))
variable_imp.append(extract_variable_imp(variable = 'SUM_SCORE'))


In [207]:
#@ Analysis
#@markdown Observations made from the output ?


anaylsis = ''  #@param {type: "string"}


In [208]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [209]:
with open('/content/drive/My Drive/education-impact/document.txt', 'a') as f:
  f.write("Model Trained : " + model_name+ "\n")
  f.write("Predicting : " + OUTPUT + "\n")
  f.write("Approach : " + approach + "\n")
  f.write("Num Examples: " + str(eval.num_examples) + "\n\n")
  f.write("Input Features\n")
  f.write(input_features+"\n\n")
  f.write("Model Performance\n")
  f.write("RMSE Score : "  + str(eval.rmse)+ "\n\n")
  for v in variable_imp:
    f.write(v + "\n\n")
  f.write("Analysis : " + anaylsis + "\n")
  f.write("\n\n\n")

  

# Models to try
 - RandomForestModel
 - GradientBoostedTreesModel
 - CartModel
 - DistributedGradientBoostedTreesModel

 - KNN Regression
 - Support Vector Regression
 - Locally Weighted Scatterplot Smoothing
 - Multivariate Adaptive Regression Splines
  