<a href="https://colab.research.google.com/github/jaya-shankar/education-impact/blob/master/All_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cloning Repo & installing libs

In [1]:
!rm -rf education-impact

In [2]:
!git clone https://github.com/jaya-shankar/education-impact.git


Cloning into 'education-impact'...
remote: Enumerating objects: 425, done.[K
remote: Counting objects: 100% (425/425), done.[K
remote: Compressing objects: 100% (364/364), done.[K
remote: Total 425 (delta 214), reused 192 (delta 58), pack-reused 0[K
Receiving objects: 100% (425/425), 6.33 MiB | 12.86 MiB/s, done.
Resolving deltas: 100% (214/214), done.


In [3]:
!pip install tensorflow_decision_forests
!pip install wurlitzer
!pip install seaborn

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-0.2.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (17.7 MB)
[K     |████████████████████████████████| 17.7 MB 2.7 MB/s 
Collecting wurlitzer
  Downloading wurlitzer-3.0.2-py3-none-any.whl (7.3 kB)
Installing collected packages: wurlitzer, tensorflow-decision-forests
Successfully installed tensorflow-decision-forests-0.2.2 wurlitzer-3.0.2


In [4]:
import pandas as pd
import os
import numpy as np
import math
import seaborn as sns
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from wurlitzer import sys_pipes



# Setting paths

In [5]:
root = "education-impact/datasets/" 
datasets_path = {
                    "infant_mortality"                :  root+ "Infant_Mortality_Rate.csv",
                    "child_mortality"                 :  root+ "child_mortality_0_5_year_olds_dying_per_1000_born.csv",
                    "children_per_woman"              :  root+ "children_per_woman_total_fertility.csv",
                    "co2_emissions_percapita"         :  root+ "co2_emissions_tonnes_per_person.csv",
                    "population"                      :  root+ "converted_pop.csv",
                    "population_density"              :  root+ "population_per_area.csv",
                    "gdp_growth"                      :  root+ "gdp_per_capita_yearly_growth.csv",
                    "Avg_daily_income_ppp"            :  root+ "mincpcap_cppp.csv",
                    "gdppercapita_us_infla_adjust"    :  root+ "gdppercapita_us_inflation_adjusted.csv",
                    "gini_index"                      :  root+ "gini.csv",
                    "life_expectancy"                 :  root+ "life_expectancy_years.csv",
                    "poverty_index"                   :  root+ "mincpcap_cppp.csv",
                    "people_in_poverty"               :  root+ "number_of_people_in_poverty.csv",
                    "ratio_b/g_in_primary"            :  root+ "ratio_of_girls_to_boys_in_primary_and_secondary_education_perc.csv",
                    "wcde-25--34"                     :  root+ "wcde-25--34.csv",
                    "20-24-In_Primary_OL"             :  root+ "In_Primary_OL.csv",
                    "20-24-Primary_OL"                :  root+ "Primary_OL.csv",
                    "20-24-Lower_Secondary_OL"        :  root+ "Lower_Secondary_OL.csv",
                    "20-24_female-In_Primary_OL"      :  root+ "female_In_Primary_OL.csv",
                    "20-24_female-Primary_OL"         :  root+ "female_Primary_OL.csv",
                    "20-24_female-Lower_Secondary_OL" :  root+ "female_Lower_Secondary_OL.csv",
                    "20-24-In_Primary_OL_comp"        :  root+ "In_Primary_OL_complete.csv",
                    "20-24-Primary_OL_comp"           :  root+ "Primary_OL_complete.csv",
                    "20-24-Lower_Secondary_OL_comp"   :  root+ "Lower_Secondary_OL_complete.csv",
                }

created_datasets = []

# Defining common functions

In [6]:
def get_countries_count(datasets):
  for dataset in datasets:
    df = pd.read_csv(datasets_path[dataset])
    count = len(set(df.Country.unique()))
    print(f"{'Factor: ' + dataset:<40} count: {count}")


In [7]:
def find_common_countries(datasets):
  common_countries = set()
  for dataset in datasets:
    countries_list = list(pd.read_csv(datasets_path[dataset]).Country)
    countries_list = set(map(lambda x: x.lower(), countries_list))
    if common_countries == set():
      common_countries = countries_list
    else:
      common_countries = common_countries.intersection(countries_list)
  return list(common_countries)

In [8]:
def generate_indices(countries,years):
  keys=[]
  for y in years:
    for c in countries:
      keys.append((c,str(y)))
  return keys

In [9]:
def load_datasets_to_pd(datasets,keys,include_output=True):
  combined_df = pd.DataFrame(keys,columns=['country','year'])
  for dataset in datasets:
    combined_df = add_dataset(combined_df,dataset)
    

  if include_output:
    combined_df = add_dataset(combined_df,dataset,output = True)
  return combined_df

  

In [10]:
def add_dataset(input_df,dataset,output = False):
  label = dataset
  if output : 
    label = "o_"+OUTPUT
  input_df[label] = [math.nan]*len(input_df)
  df = pd.read_csv(datasets_path[dataset])
  df["Country"] = df["Country"].str.lower()
  df.set_index("Country", inplace=True)
  for e in range(len(input_df)):
    country = input_df.iloc[e].country
    year    = input_df.iloc[e].year
    if output:
      input_df.at[e,label] = df.loc[country][str( int(year) + PREDICT_FUTURE )]
    else:
      input_df.at[e,label] = df.loc[country][year]
  return input_df

In [11]:
def create_n_yrs_old_csv(dataset,n):
    table_name = str(n)+"_yrs_old_"+dataset
    # if table_name in created_datasets:
    #   return table_name
    df = pd.read_csv(datasets_path[dataset])
    years = list(range(1960, 2016))
    countries = list(df['Country'])
    new_df = pd.DataFrame(countries,columns=['Country'])
    for i in range(1960,2016):
      new_df[i] = np.nan
    new_df.set_index('Country',inplace=True)
    df.set_index('Country',inplace=True)
    for c in countries:
      for y in range(1960+n,2016):
        new_df[y].at[c] = df.loc[c][str(y-n)]
    
    
    datasets_path[table_name] = root+table_name+".csv"
    new_df.to_csv(root+table_name+".csv",encoding='utf-8', index=True)

    created_datasets.append(table_name)
    return table_name

In [12]:
def create_n_dropout_csv(dataset,n,s_n = 90):
    df = pd.read_csv(datasets_path[dataset])
    years = list(range(1875, 2016))
    countries = list(df['Country'])
    drop_out_years = []
    for i in range(len(df)):
      started = False
      s_year  = 1875
      for year in years:
        if not started and df.iloc[i][str(year)] < s_n :
          started = True
          s_year  = year
        elif started and df.iloc[i][str(year)] < n :
          drop_out_years.append((df.iloc[i]['Country'],(year-s_year)))
          break
      else:
        if started:
          drop_out_years.append((df.iloc[i]['Country'], 100))
        else:
          drop_out_years.append((df.iloc[i]['Country'], year-s_year))
    new_df = pd.DataFrame(countries,columns=['Country'])
    for i in years:
      new_df[i] = np.nan
    new_df.set_index('Country',inplace=True)
    for c,y in drop_out_years:
      for i in years:
        new_df.at[c,i] = y
    table_name = str(n)+"%_dropout_"+dataset
    datasets_path[table_name] = root+table_name+".csv"
    new_df = new_df[(new_df.T != 1).any()]
    new_df.to_csv(root+table_name+".csv",encoding='utf-8', index=True)

    created_datasets.append(table_name)
    return table_name

In [13]:
def combine_dfs(X,y):
  label = y.columns[0]
  X[label] = y
  X.dropna(subset=[label],inplace=True)
  return X

In [14]:
def extract_variable_imp(variable):
  v_list = inspector.variable_importances()[variable]
  v = variable + "\n"

  for i in range(1,len(v_list)):
    v += str(i)+ "  "
    v += f"{v_list[i][0][0]:<50}" 
    v += str(v_list[i][1]) + "\n"
  return v

In [15]:
def rate_of_dropout(dataset,n):
  table_name = str(n)+"_yr_period_rate_of_change_of_"+dataset

  years  = [y for y in range(1960,2016)]
  edu_df = pd.read_csv(datasets_path[dataset])
  countries = list(edu_df['Country'])
  new_df = pd.DataFrame(countries,columns=['Country'])
  for i in years:
    new_df[i] = np.nan
  new_df.set_index('Country',inplace=True)
  edu_df.set_index('Country',inplace=True)
  for c in countries:
    for y in range(1960+n,2016):
      new_df[y].at[c] = (edu_df.loc[c][str(y-n)] - edu_df.loc[c][str(y)])/n

  datasets_path[table_name] = root+table_name+".csv"
  
  new_df.to_csv(root+table_name+".csv",encoding='utf-8', index=True)

  created_datasets.append(table_name)
  return table_name




In [16]:
rate_of_dropout("20-24-Primary_OL",5)

'5_yr_period_rate_of_change_of_20-24-Primary_OL'

# Plotting Data

In [None]:
datasets_to_plot = [
            "infant_mortality",
            "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            "20-24-Lower_Secondary_OL",
            "population",
            "population_density",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL" ,
            "20-24_female-Lower_Secondary_OL",
            "life_expectancy"
            ]

# creating a list of all countries & years
countries   = find_common_countries(datasets_to_plot)
years       = [y for y in range(1960,2016)]
keys        = generate_indices(countries, years)

combined_df = load_datasets_to_pd(datasets_to_plot,keys,include_output=False)

## Life Expectancy

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = 'life_expectancy'
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

## Total Fertility Rate

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = 'children_per_woman
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

## Primary education OL

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = '20-24-Primary_OL'
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

## GDP per capita

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = 'gdppercapita_us_infla_adjust'
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

# Building Model

## Life Expectancy

### Preparing the Data

In [15]:
PREDICT_FUTURE  = 0
OUTPUT         = 'life_expectancy'

In [16]:
datasets = [
            # "infant_mortality",
            # "life_expectancy",
            # "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            create_n_yrs_old_csv("gdppercapita_us_infla_adjust",n = 20),
            create_n_dropout_csv("20-24-Primary_OL_comp",50,95),
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL",
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

Factor: children_per_woman               count: 202
Factor: co2_emissions_percapita          count: 194
Factor: gini_index                       count: 195
Factor: gdppercapita_us_infla_adjust     count: 207
Factor: 20_yrs_old_gdppercapita_us_infla_adjust count: 207
Factor: 50%_dropout_20-24-Primary_OL_comp count: 180
Factor: 20-24-In_Primary_OL              count: 202
Factor: 20-24-Primary_OL                 count: 202
Factor: population                       count: 197
Factor: 20-24_female-In_Primary_OL       count: 202
Factor: 20-24_female-Primary_OL          count: 202


In [17]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [18]:
input_df            = load_datasets_to_pd(datasets,keys)
input_df.set_index(["country","year"], inplace=True)

In [19]:
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [20]:
input_df

Unnamed: 0,year,children_per_woman,co2_emissions_percapita,gini_index,gdppercapita_us_infla_adjust,20_yrs_old_gdppercapita_us_infla_adjust,50%_dropout_20-24-Primary_OL_comp,20-24-In_Primary_OL,20-24-Primary_OL,population,20-24_female-In_Primary_OL,20-24_female-Primary_OL
0,1960,6.37,,35.9,,,128.0,97.4,97.8,475000.0,98.3,98.3
1,1960,2.94,2.7600,28.9,,,19.0,3.9,13.0,4010000.0,4.5,13.3
2,1960,6.55,1.9500,31.5,,,16.0,9.9,25.1,2089999.0,14.7,34.4
3,1960,5.84,0.0586,65.7,653.0,,100.0,96.1,98.9,1500000.0,100.0,100.0
4,1960,7.02,,41.3,,,25.0,85.7,97.0,89900.0,87.0,97.3
...,...,...,...,...,...,...,...,...,...,...,...,...
8115,2015,2.59,0.5450,42.0,1160.0,388.0,100.0,31.8,56.7,15500000.0,36.0,61.9
8116,2015,1.72,3.6900,31.8,7640.0,6240.0,32.0,1.2,13.3,6530000.0,0.9,11.4
8117,2015,2.93,12.6000,46.4,6430.0,1970.0,21.0,0.1,0.4,5570000.0,0.2,0.5
8118,2015,1.25,5.0400,35.7,19200.0,15800.0,50.0,2.5,10.9,10400000.0,2.1,8.7


In [21]:
output_df

Unnamed: 0,o_life_expectancy
0,98.3
1,13.3
2,34.4
3,100.0
4,97.3
...,...
8115,61.9
8116,11.4
8117,0.5
8118,8.7


In [35]:
input_df.isna().sum()

children_per_woman                            0
co2_emissions_percapita                     143
gini_index                                    0
gdppercapita_us_infla_adjust               1706
20_yrs_old_gdppercapita_us_infla_adjust    4552
50%_dropout_20-24-Primary_OL_comp             0
20-24-In_Primary_OL                           0
20-24-Primary_OL                              0
population                                    0
20-24_female-In_Primary_OL                    0
20-24_female-Primary_OL                       0
dtype: int64

In [None]:
input_df.shape

In [22]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

### Random Forest Model

In [23]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmpfmqygf7q as temporary training directory
Starting reading the dataset
1/6 [====>.........................] - ETA: 31s
Dataset read in 0:00:06.405007
Training model
Model trained in 0:00:05.858180
Compiling model
LIFE_EXPECTANCY
{'loss': 0.0, 'mse': 0.0687904804944992}

MSE: 0.0687904804944992
RMSE: 0.2622793939570915



In [24]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [25]:
# %set_cell_height 300
model.summary()

Model: "random_forest_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (12):
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	50__dropout_20-24-Primary_OL_comp
	children_per_woman
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	population
	year

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL" 10.728899 ################
    2. "20_yrs_old_gdppercapita_us_infla_adjust" 10.380274 ###############
    3.                              "population" 10.109561 ##############
    4.       "50__dropout_20-24-Primary_OL_comp"  9.967797 ##############
    5.       

### Gradient Tree

In [26]:
# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmp5wabnd_r as temporary training directory
Starting reading the dataset
1/6 [====>.........................] - ETA: 0s
Dataset read in 0:00:00.214795
Training model
Model trained in 0:00:02.402261
Compiling model
LIFE_EXPECTANCY
{'loss': 0.0, 'mse': 0.0013651149347424507}

MSE: 0.0013651149347424507
RMSE: 0.03694746181732178



In [27]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [28]:
model.summary()

Model: "gradient_boosted_trees_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: REGRESSION
Label: "__LABEL"

Input Features (12):
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	50__dropout_20-24-Primary_OL_comp
	children_per_woman
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	population
	year

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL"  4.906737 ################
    2.       "50__dropout_20-24-Primary_OL_comp"  4.857166 ###############
    3.                              "gini_index"  4.811431 ###############
    4.                              "population"  4.803850 #########

## Total Fertility Rate

In [49]:
PREDICT_FUTURE  = 0
OUTPUT         = 'children_per_woman'

In [52]:
datasets = [
            # "infant_mortality",
            "life_expectancy",
            # "child_mortality",
            # "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            create_n_yrs_old_csv("gdppercapita_us_infla_adjust",n = 20),
            create_n_yrs_old_csv("children_per_woman",n = 20),
            create_n_dropout_csv("20-24-Primary_OL_comp",50,95),
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL",
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

Factor: life_expectancy                  count: 195
Factor: co2_emissions_percapita          count: 194
Factor: gini_index                       count: 195
Factor: gdppercapita_us_infla_adjust     count: 207
Factor: 20_yrs_old_gdppercapita_us_infla_adjust count: 207
Factor: 20_yrs_old_children_per_woman    count: 202
Factor: 50%_dropout_20-24-Primary_OL_comp count: 180
Factor: 20-24-In_Primary_OL              count: 202
Factor: 20-24-Primary_OL                 count: 202
Factor: population                       count: 197
Factor: 20-24_female-In_Primary_OL       count: 202
Factor: 20-24_female-Primary_OL          count: 202


In [80]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [81]:
input_df            = load_datasets_to_pd(datasets,keys)
# input_df            = add_n_yrs_old_dataset(input_df,"gdppercapita_us_infla_adjust",20)
input_df.set_index(["country","year"], inplace=True)
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [82]:
input_df

Unnamed: 0_level_0,Unnamed: 1_level_0,life_expectancy,co2_emissions_percapita,gini_index,gdppercapita_us_infla_adjust,20_yrs_old_gdppercapita_us_infla_adjust,20_yrs_old_children_per_woman,50%_dropout_20-24-Primary_OL_comp,20-24-In_Primary_OL,20-24-Primary_OL,population,20-24_female-In_Primary_OL,20-24_female-Primary_OL
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
greece,1980,75.3,5.290,37.5,15300.0,5030.0,2.33,83.0,2.0,28.4,9630000.0,2.0,30.9
thailand,1980,66.5,0.844,44.8,1430.0,581.0,6.15,40.0,4.3,67.9,47400000.0,4.9,72.2
senegal,1980,52.9,0.598,54.6,1060.0,1200.0,7.00,100.0,76.4,88.5,5580000.0,82.5,92.9
costa rica,1980,74.7,1.030,46.9,6210.0,3600.0,6.45,75.0,16.1,55.7,2390000.0,16.2,55.1
zimbabwe,1980,60.4,1.300,38.5,1490.0,1160.0,7.16,43.0,31.2,49.6,7410000.0,39.1,58.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
morocco,2015,72.5,1.760,39.6,3220.0,1730.0,3.30,49.0,29.9,46.2,34700000.0,34.5,48.2
myanmar,2015,67.7,0.421,38.1,1140.0,208.0,3.04,68.0,27.1,46.1,52700000.0,29.0,48.4
oman,2015,72.4,15.100,40.0,16000.0,15400.0,5.35,35.0,7.3,25.5,4270000.0,7.8,23.5
belarus,2015,73.7,6.290,26.9,5950.0,1890.0,1.47,30.0,0.2,0.6,9440000.0,0.1,0.4


In [83]:
input_df.isna().sum()

life_expectancy                            0
co2_emissions_percapita                    0
gini_index                                 0
gdppercapita_us_infla_adjust               0
20_yrs_old_gdppercapita_us_infla_adjust    0
20_yrs_old_children_per_woman              0
50%_dropout_20-24-Primary_OL_comp          0
20-24-In_Primary_OL                        0
20-24-Primary_OL                           0
population                                 0
20-24_female-In_Primary_OL                 0
20-24_female-Primary_OL                    0
dtype: int64

In [84]:
input_df.shape

(3558, 12)

In [85]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

### Random Forest

In [86]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmp8e1b_w4f as temporary training directory
Starting reading the dataset
Dataset read in 0:00:00.228079
Training model
Model trained in 0:00:02.122216
Compiling model




CHILDREN_PER_WOMAN
{'loss': 0.0, 'mse': 0.06606526672840118}

MSE: 0.06606526672840118
RMSE: 0.2570316453832119



In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [87]:
# %set_cell_height 300

model.summary()

Model: "random_forest_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (12):
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_children_per_woman
	20_yrs_old_gdppercapita_us_infla_adjust
	50__dropout_20-24-Primary_OL_comp
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	life_expectancy
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL"  9.403145 ################
    2.       "50__dropout_20-24-Primary_OL_comp"  8.705904 ##############
    3.                              "population"  8.702443 ##############
    4. "20_yrs_old_gdppercapita_us_infla_adjust"  8.670487 #####

### Gradient Tree

In [None]:
# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [None]:
model.summary()

## Primary education OL

In [19]:
PREDICT_FUTURE = 0
OUTPUT         = '20-24-Primary_OL'


In [32]:
datasets = [
            "infant_mortality",
            "life_expectancy",
            "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            create_n_yrs_old_csv("gdppercapita_us_infla_adjust",n = 20),
            "population",
            create_n_yrs_old_csv('20-24-Primary_OL',n = 20),
            # "20-24_female-In_Primary_OL",
            # "20-24_female-Primary_OL",
            # "20-24_female-Lower_Secondary_OL",
            
            ]
get_countries_count(datasets)

Factor: infant_mortality                 count: 266
Factor: life_expectancy                  count: 195
Factor: child_mortality                  count: 197
Factor: children_per_woman               count: 202
Factor: co2_emissions_percapita          count: 194
Factor: gini_index                       count: 195
Factor: gdppercapita_us_infla_adjust     count: 207
Factor: 20_yrs_old_gdppercapita_us_infla_adjust count: 207
Factor: population                       count: 197
Factor: 20_yrs_old_20-24-Primary_OL      count: 202


In [33]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [34]:
input_df            = load_datasets_to_pd(datasets,keys)
# input_df.dropna(subset=["gdppercapita_us_infla_adjust"],inplace=True)
input_df.set_index(["country","year"], inplace=True)

In [35]:
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [198]:
input_df

Unnamed: 0_level_0,Unnamed: 1_level_0,infant_mortality,life_expectancy,child_mortality,children_per_woman,co2_emissions_percapita,gini_index,gdppercapita_us_infla_adjust,20_yrs_old_gdppercapita_us_infla_adjust,population
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
mongolia,1960,,43.9,228.00,6.95,1.350,32.0,,,956000.0
greece,1960,39.5,72.4,46.30,2.33,1.140,46.8,5030.0,,8270000.0
romania,1960,73.3,65.8,95.20,2.34,2.870,19.3,,,18600000.0
sri lanka,1960,68.1,64.0,97.80,5.54,0.229,36.4,,,9870000.0
thailand,1960,101.3,60.9,146.00,6.15,0.136,41.5,581.0,,27400000.0
...,...,...,...,...,...,...,...,...,...,...
belarus,2015,3.0,73.7,4.03,1.69,6.290,26.9,5950.0,1890.0,9440000.0
netherlands,2015,3.4,81.7,4.03,1.75,9.830,28.3,45200.0,33700.0,16900000.0
grenada,2015,14.2,73.1,15.10,2.13,2.380,40.0,9100.0,5210.0,110000.0
suriname,2015,18.0,72.5,20.90,2.40,3.060,61.0,9170.0,6010.0,559000.0


In [None]:
input_df.isna().sum()

In [None]:
input_df.shape

In [36]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

In [37]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmpvplcpbeh as temporary training directory
Starting reading the dataset
Dataset read in 0:00:00.209966
Training model
Model trained in 0:00:03.680719
Compiling model
20-24-PRIMARY_OL
{'loss': 0.0, 'mse': 0.08771263808012009}

MSE: 0.08771263808012009
RMSE: 0.2961631950126823



In [42]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [41]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=123)

In [31]:
popu = input_df["population">537500000]
popu


TypeError: ignored

In [40]:
# %set_cell_height 300

model.summary()

Model: "random_forest_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (10):
	20_yrs_old_20-24-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	child_mortality
	children_per_woman
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	infant_mortality
	life_expectancy
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL" 10.487812 ################
    2. "20_yrs_old_gdppercapita_us_infla_adjust"  8.756432 #############
    3.            "gdppercapita_us_infla_adjust"  8.754435 #############
    4.                              "population"  8.388881 ############
    5.                         "life_expectancy"  8.240425 #########

### Gradient Tree

In [43]:
# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmptop5ziez as temporary training directory
Starting reading the dataset
Dataset read in 0:00:00.237410
Training model
Model trained in 0:00:01.681829
Compiling model
20-24-PRIMARY_OL
{'loss': 0.0, 'mse': 0.001658075489103794}

MSE: 0.001658075489103794
RMSE: 0.04071947309462383



In [44]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [45]:
model.summary()

Model: "gradient_boosted_trees_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: REGRESSION
Label: "__LABEL"

Input Features (10):
	20_yrs_old_20-24-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	child_mortality
	children_per_woman
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	infant_mortality
	life_expectancy
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL"  4.903201 ################
    2.                         "child_mortality"  4.835685 ###############
    3. "20_yrs_old_gdppercapita_us_infla_adjust"  4.804822 ###############
    4.            "gdppercapita_us_infla_adjust"  4.797126 ###############
    5.                      "children_per_wom

In [None]:
for f in created_datasets:
  if os.path.isfile(datasets_path[f]): # this makes the code more robust
        os.remove(datasets_path[f])

## GDP per capita

### Preparing the data

In [14]:
PREDICT_FUTURE  = 0
OUTPUT         = 'gdppercapita_us_infla_adjust'


In [62]:
datasets = [
            "infant_mortality",
            "life_expectancy",
            "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            create_n_yrs_old_csv("gdppercapita_us_infla_adjust",n = 20),
            "gdp_growth",
            # create_n_dropout_csv("20-24-Primary_OL_comp",50,95),
            rate_of_dropout("20-24-Primary_OL",5),
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            # "20-24_female-In_Primary_OL",
            # "20-24_female-Primary_OL",
            create_n_yrs_old_csv("20-24-Primary_OL",n = 20),
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

Factor: infant_mortality                 count: 266
Factor: life_expectancy                  count: 195
Factor: child_mortality                  count: 197
Factor: children_per_woman               count: 202
Factor: co2_emissions_percapita          count: 194
Factor: gini_index                       count: 195
Factor: 20_yrs_old_gdppercapita_us_infla_adjust count: 207
Factor: gdp_growth                       count: 221
Factor: 5_yr_period_rate_of_change_of_20-24-Primary_OL count: 202
Factor: 20-24-In_Primary_OL              count: 202
Factor: 20-24-Primary_OL                 count: 202
Factor: population                       count: 197
Factor: 20_yrs_old_20-24-Primary_OL      count: 202


In [63]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [64]:
input_df            = load_datasets_to_pd(datasets,keys)
input_df.set_index(["country","year"], inplace=True)
# input_df['o_gdppercapita_us_infla_adjust'] = np.log(input_df['o_gdppercapita_us_infla_adjust'])
# input_df['20_yrs_old_gdppercapita_us_infla_adjust'] = np.log(input_df['20_yrs_old_gdppercapita_us_infla_adjust'])

In [65]:
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [66]:
print(input_df.shape)
input_df.isna().sum()

(8568, 13)


infant_mortality                                   768
life_expectancy                                      0
child_mortality                                      0
children_per_woman                                   0
co2_emissions_percapita                            146
gini_index                                           0
20_yrs_old_gdppercapita_us_infla_adjust           4828
gdp_growth                                           7
5_yr_period_rate_of_change_of_20-24-Primary_OL     765
20-24-In_Primary_OL                                  0
20-24-Primary_OL                                     0
population                                           0
20_yrs_old_20-24-Primary_OL                       3060
dtype: int64

In [32]:
input_df.describe()

Unnamed: 0,infant_mortality,life_expectancy,child_mortality,children_per_woman,co2_emissions_percapita,gini_index,20_yrs_old_gdppercapita_us_infla_adjust,5_yr_period_rate_of_change_of_20-24-Primary_OL,20-24-In_Primary_OL,20-24-Primary_OL,population,20-24_female-In_Primary_OL,20-24_female-Primary_OL,10_yrs_old_20-24-Primary_OL,30_yrs_old_20-24-Primary_OL
count,7800.0,8568.0,8568.0,8568.0,8422.0,8568.0,3740.0,7803.0,8568.0,8568.0,8568.0,8568.0,8568.0,7038.0,3978.0
mean,54.539269,64.778093,88.092704,4.162458,4.431884,39.858625,7148.180214,0.654328,29.464099,43.541375,27923130.0,32.556081,45.731396,46.414066,53.131322
std,47.116938,10.244332,84.434388,2.045186,7.5144,10.035381,11577.835465,0.809398,31.277497,34.265188,116350500.0,34.610299,36.458871,34.773838,35.019483
min,1.8,9.5,2.18,1.12,0.0,16.0,124.0,-14.32,0.0,0.0,41200.0,0.0,0.0,0.0,0.0
25%,15.0,57.1,19.7,2.22,0.351,32.3,919.5,0.1,1.08,6.655,1770000.0,1.12,6.5,7.86,15.825
50%,40.85,67.0,57.6,3.91,1.64,39.9,2655.0,0.56,17.0,43.14,5580000.0,17.2,43.64,49.0,61.72
75%,84.7,72.6,137.0,6.11,6.15,45.9,7092.5,1.074,56.045,75.9,14800000.0,64.505,82.665,79.915,86.0
max,232.0,84.3,423.0,8.46,101.0,77.0,112000.0,5.18,98.9,99.9,1410000000.0,100.0,100.0,99.9,99.9


In [69]:
input_df

Unnamed: 0_level_0,Unnamed: 1_level_0,infant_mortality,life_expectancy,child_mortality,children_per_woman,co2_emissions_percapita,gini_index,20_yrs_old_gdppercapita_us_infla_adjust,gdp_growth,5_yr_period_rate_of_change_of_20-24-Primary_OL,20-24-In_Primary_OL,20-24-Primary_OL,population,20_yrs_old_20-24-Primary_OL
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
comoros,1960,,47.1,325.00,6.79,0.0576,44.6,,6.120,,94.3,96.5,191000.0,
iceland,1960,17.3,73.4,21.40,4.13,6.9100,25.8,,1.240,,0.0,0.1,176000.0,
sierra leone,1960,221.2,42.7,393.00,6.13,0.3090,52.5,,1.760,,91.1,93.8,2320000.0,
malawi,1960,,37.7,367.00,6.94,0.1200,47.1,,0.295,,85.1,91.7,3660000.0,
ghana,1960,123.9,51.0,209.00,6.75,0.2210,39.3,,4.490,,70.7,75.9,6640000.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
iraq,2015,24.6,70.7,29.50,4.43,4.6600,29.5,2030.0,3.540,1.70,27.0,54.9,35600000.0,65.5
jamaica,2015,13.4,76.1,15.90,2.03,2.6200,45.5,5160.0,1.180,0.18,0.6,3.4,2890000.0,11.4
sao tome and principe,2015,26.8,70.0,35.10,4.52,0.5700,30.8,,3.450,1.32,33.3,69.4,199000.0,84.2
estonia,2015,2.5,77.8,3.18,1.62,12.1000,33.5,7130.0,3.250,0.72,0.1,7.8,1320000.0,1.7


In [67]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

In [70]:
X_train.isna().sum()

infant_mortality                                    22
life_expectancy                                      0
child_mortality                                      0
children_per_woman                                   0
co2_emissions_percapita                             34
gini_index                                           0
20_yrs_old_gdppercapita_us_infla_adjust           1272
gdp_growth                                           5
5_yr_period_rate_of_change_of_20-24-Primary_OL       0
20-24-In_Primary_OL                                  0
20-24-Primary_OL                                     0
population                                           0
20_yrs_old_20-24-Primary_OL                          0
o_gdppercapita_us_infla_adjust                       0
dtype: int64

In [None]:
y_train.isna().sum()

### Random Forest

In [68]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmpssr7c6vp as temporary training directory
Starting reading the dataset
Dataset read in 0:00:00.216816
Training model
Model trained in 0:00:03.890596
Compiling model
GDPPERCAPITA_US_INFLA_ADJUST
{'loss': 0.0, 'mse': 0.21468329429626465}

MSE: 0.21468329429626465
RMSE: 0.463339286372594



In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [71]:
# %set_cell_height 300

model.summary()


Model: "random_forest_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (13):
	20-24-In_Primary_OL
	20-24-Primary_OL
	20_yrs_old_20-24-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	5_yr_period_rate_of_change_of_20-24-Primary_OL
	child_mortality
	children_per_woman
	co2_emissions_percapita
	gdp_growth
	gini_index
	infant_mortality
	life_expectancy
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                        "__LABEL" 10.386118 ################
    2.                                     "gdp_growth"  9.687724 ##############
    3.        "20_yrs_old_gdppercapita_us_infla_adjust"  9.487857 ##############
    4.                                     "gini

### Gradient Tree

In [None]:
# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

In [163]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [164]:
model.summary()

Model: "gradient_boosted_trees_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: REGRESSION
Label: "__LABEL"

Input Features (15):
	10_yrs_old_20-24-Primary_OL
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	30_yrs_old_20-24-Primary_OL
	50__dropout_20-24-Primary_OL_comp
	child_mortality
	children_per_woman
	co2_emissions_percapita
	gini_index
	infant_mortality
	life_expectancy
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL"  4.938178 ################
    2.       "50__dropout_20-24-Primary_OL_comp"  4.894394 ###############
    3.                        "infant_mortality"  4.892383 ########

In [53]:
for f in created_datasets:
  if os.path.isfile(datasets_path[f]): # this makes the code more robust
        os.remove(datasets_path[f])

# Save the summary

In [48]:
#@markdown What changes you made to datasets & why ?


approach = 'gradient tree for edu model'  #@param {type: "string"}


In [49]:
inspector = model.make_inspector()

In [50]:
model_name = inspector.model_type()
num_trees  = inspector.num_trees()
objective  = inspector.objective()
eval = inspector.evaluation()

In [51]:

input_features_list = inspector.features()
input_features = ""
for i in range(len(input_features_list)):
  input_features += str(i+1)+ "  "
  input_features += input_features_list[i][0] + "\n"

In [52]:
variable_imp = []
variable_imp.append(extract_variable_imp(variable = 'MEAN_MIN_DEPTH'))
variable_imp.append(extract_variable_imp(variable = 'SUM_SCORE'))


In [53]:
#@ Analysis
#@markdown Observations made from the output ?


anaylsis = 'the current gdp growth of a country doesnot effect its gdp per capita'  #@param {type: "string"}


In [54]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [55]:
with open('/content/drive/My Drive/education-impact/document.txt', 'a') as f:
  f.write("Model Trained : " + model_name+ "\n")
  f.write("Predicting : " + OUTPUT + "\n")
  f.write("Approach : " + approach + "\n")
  f.write("Num Examples: " + str(eval.num_examples) + "\n\n")
  f.write("Input Features\n")
  f.write(input_features+"\n\n")
  f.write("Model Performance\n")
  f.write("RMSE Score : "  + str(eval.rmse)+ "\n\n")
  for v in variable_imp:
    f.write(v + "\n\n")
  f.write("Analysis : " + anaylsis + "\n")
  f.write("\n\n\n")

  

# Models to try
 - RandomForestModel
 - GradientBoostedTreesModel
 - CartModel
 - DistributedGradientBoostedTreesModel

 - KNN Regression
 - Support Vector Regression
 - Locally Weighted Scatterplot Smoothing
 - Multivariate Adaptive Regression Splines
  