<a href="https://colab.research.google.com/github/jaya-shankar/education-impact/blob/master/All_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cloning Repo & installing libs

In [1]:
!rm -rf education-impact

In [2]:
!git clone https://github.com/jaya-shankar/education-impact.git


Cloning into 'education-impact'...
remote: Enumerating objects: 416, done.[K
remote: Counting objects: 100% (416/416), done.[K
remote: Compressing objects: 100% (356/356), done.[K
remote: Total 416 (delta 209), reused 190 (delta 57), pack-reused 0[K
Receiving objects: 100% (416/416), 6.27 MiB | 15.01 MiB/s, done.
Resolving deltas: 100% (209/209), done.


In [3]:
!pip install tensorflow_decision_forests
!pip install wurlitzer
!pip install seaborn

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-0.2.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (17.7 MB)
[K     |████████████████████████████████| 17.7 MB 13.1 MB/s 
[?25hCollecting wurlitzer
  Downloading wurlitzer-3.0.2-py3-none-any.whl (7.3 kB)
Installing collected packages: wurlitzer, tensorflow-decision-forests
Successfully installed tensorflow-decision-forests-0.2.2 wurlitzer-3.0.2


In [4]:
import pandas as pd
import os
import numpy as np
import math
import seaborn as sns
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from wurlitzer import sys_pipes



# Setting paths

In [5]:
root = "education-impact/datasets/" 
datasets_path = {
                    "infant_mortality"                :  root+ "Infant_Mortality_Rate.csv",
                    "child_mortality"                 :  root+ "child_mortality_0_5_year_olds_dying_per_1000_born.csv",
                    "children_per_woman"              :  root+ "children_per_woman_total_fertility.csv",
                    "co2_emissions_percapita"         :  root+ "co2_emissions_tonnes_per_person.csv",
                    "population"                      :  root+ "converted_pop.csv",
                    "population_density"              :  root+ "population_per_area.csv",
                    "gdp_growth"                      :  root+ "gdp_per_capita_yearly_growth.csv",
                    "Avg_daily_income_ppp"            :  root+ "mincpcap_cppp.csv",
                    "gdppercapita_us_infla_adjust"    :  root+ "gdppercapita_us_inflation_adjusted.csv",
                    "gini_index"                      :  root+ "gini.csv",
                    "life_expectancy"                 :  root+ "life_expectancy_years.csv",
                    "poverty_index"                   :  root+ "mincpcap_cppp.csv",
                    "people_in_poverty"               :  root+ "number_of_people_in_poverty.csv",
                    "ratio_b/g_in_primary"            :  root+ "ratio_of_girls_to_boys_in_primary_and_secondary_education_perc.csv",
                    "wcde-25--34"                     :  root+ "wcde-25--34.csv",
                    "20-24-In_Primary_OL"             :  root+ "In_Primary_OL.csv",
                    "20-24-Primary_OL"                :  root+ "Primary_OL.csv",
                    "20-24-Lower_Secondary_OL"        :  root+ "Lower_Secondary_OL.csv",
                    "20-24_female-In_Primary_OL"      :  root+ "female_In_Primary_OL.csv",
                    "20-24_female-Primary_OL"         :  root+ "female_Primary_OL.csv",
                    "20-24_female-Lower_Secondary_OL" :  root+ "female_Lower_Secondary_OL.csv",
                    "20-24-In_Primary_OL_comp"        :  root+ "In_Primary_OL_complete.csv",
                    "20-24-Primary_OL_comp"           :  root+ "Primary_OL_complete.csv",
                    "20-24-Lower_Secondary_OL_comp"   :  root+ "Lower_Secondary_OL_complete.csv",
                }

created_datasets = []

# Defining common functions

In [6]:
def get_countries_count(datasets):
  for dataset in datasets:
    df = pd.read_csv(datasets_path[dataset])
    count = len(set(df.Country.unique()))
    print(f"{'Factor: ' + dataset:<40} count: {count}")


In [7]:
def find_common_countries(datasets):
  common_countries = set()
  for dataset in datasets:
    countries_list = list(pd.read_csv(datasets_path[dataset]).Country)
    countries_list = set(map(lambda x: x.lower(), countries_list))
    if common_countries == set():
      common_countries = countries_list
    else:
      common_countries = common_countries.intersection(countries_list)
  return list(common_countries)

In [8]:
def generate_indices(countries,years):
  keys=[]
  for y in years:
    for c in countries:
      keys.append((c,str(y)))
  return keys

In [9]:
def load_datasets_to_pd(datasets,keys,include_output=True):
  combined_df = pd.DataFrame(keys,columns=['country','year'])
  for dataset in datasets:
    combined_df = add_dataset(combined_df,dataset)
    

  if include_output:
    combined_df = add_dataset(combined_df,dataset,output = True)
  return combined_df

  

In [10]:
def add_dataset(input_df,dataset,output = False):
  label = dataset
  if output : 
    label = "o_"+OUTPUT
  input_df[label] = [math.nan]*len(input_df)
  df = pd.read_csv(datasets_path[dataset])
  df["Country"] = df["Country"].str.lower()
  df.set_index("Country", inplace=True)
  for e in range(len(input_df)):
    country = input_df.iloc[e].country
    year    = input_df.iloc[e].year
    if output:
      input_df.at[e,label] = df.loc[country][str( int(year) + PREDICT_FUTURE )]
    else:
      input_df.at[e,label] = df.loc[country][year]
  return input_df

In [11]:
def create_n_yrs_old_csv(dataset,n):
    df = pd.read_csv(datasets_path[dataset])
    years = list(range(1960, 2016))
    countries = list(df['Country'])
    new_df = pd.DataFrame(countries,columns=['Country'])
    for i in range(1960,2016):
      new_df[i] = np.nan
    new_df.set_index('Country',inplace=True)
    df.set_index('Country',inplace=True)
    for c in countries:
      for y in range(1960+n,2016):
        new_df[y].at[c] = df.loc[c][str(y-n)]
    
    table_name = str(n)+"_yrs_old_"+dataset
    datasets_path[table_name] = root+table_name+".csv"
    new_df.to_csv(root+table_name+".csv",encoding='utf-8', index=True)

    created_datasets.append(table_name)
    return table_name

In [17]:
def create_n_dropout_csv(dataset,n,s_n = 90):
    df = pd.read_csv(datasets_path[dataset])
    years = list(range(1875, 2016))
    countries = list(df['Country'])
    drop_out_years = []
    for i in range(len(df)):
      started = False
      s_year  = 1875
      for year in years:
        if not started and df.iloc[i][str(year)] < s_n :
          started = True
          s_year  = year
        elif started and df.iloc[i][str(year)] < n :
          drop_out_years.append((df.iloc[i]['Country'],(year-s_year)))
          break
      else:
        drop_out_years.append((df.iloc[i]['Country'], year-s_year))

    new_df = pd.DataFrame(countries,columns=['Country'])
    for i in years:
      new_df[i] = np.nan
    new_df.set_index('Country',inplace=True)
    for c,y in drop_out_years:
      for i in years:
        new_df.at[c,i] = y
    table_name = str(n)+"%_dropout_"+dataset
    datasets_path[table_name] = root+table_name+".csv"
    new_df = new_df[(new_df.T != 1).any()]
    new_df.to_csv(root+table_name+".csv",encoding='utf-8', index=True)

    created_datasets.append(table_name)
    return table_name

In [13]:
def combine_dfs(X,y):
  label = y.columns[0]
  X[label] = y
  X.dropna(subset=[label],inplace=True)
  return X

In [14]:
def extract_variable_imp(variable):
  v_list = inspector.variable_importances()[variable]
  v = variable + "\n"

  for i in range(1,len(v_list)):
    v += str(i)+ "  "
    v += f"{v_list[i][0][0]:<50}" 
    v += str(v_list[i][1]) + "\n"
  return v

# Plotting Data

In [None]:
datasets_to_plot = [
            "infant_mortality",
            "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            "20-24-Lower_Secondary_OL",
            "population",
            "population_density",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL" ,
            "20-24_female-Lower_Secondary_OL",
            "life_expectancy"
            ]

# creating a list of all countries & years
countries   = find_common_countries(datasets_to_plot)
years       = [y for y in range(1960,2016)]
keys        = generate_indices(countries, years)

combined_df = load_datasets_to_pd(datasets_to_plot,keys,include_output=False)

## Life Expectancy

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = 'life_expectancy'
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

## Total Fertility Rate

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = 'children_per_woman
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

## Primary education OL

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = '20-24-Primary_OL'
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

## GDP per capita

In [None]:
PREDICT_FUTURE  = 0
OUTPUT          = 'gdppercapita_us_infla_adjust'
n = len(datasets_to_plot)
r = math.ceil(math.sqrt(n))
i=0
for m in range(r):
  plt = sns.pairplot(combined_df,diag_kind="kde", y_vars=[OUTPUT], x_vars=[datasets_to_plot[i] for i in range(r*m,min(n,r*(m+1)))], height=4,dropna=True)
  plt

# Building Model

## Life Expectancy

### Preparing the Data

In [15]:
PREDICT_FUTURE  = 0
OUTPUT         = 'life_expectancy'

In [18]:
datasets = [
            # "infant_mortality",
            # "life_expectancy",
            # "child_mortality",
            "children_per_woman",
            "co2_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            #  create_n_yrs_old_csv("gdppercapita_us_infla_adjust",n = 20),
            create_n_dropout_csv("20-24-Primary_OL_comp",70),
            create_n_dropout_csv("20-24-Primary_OL_comp",50),
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL",
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

Factor: children_per_woman               count: 202
Factor: co2_emissions_percapita          count: 194
Factor: gini_index                       count: 195
Factor: gdppercapita_us_infla_adjust     count: 207
Factor: 70%_dropout_20-24-Primary_OL_comp count: 179
Factor: 50%_dropout_20-24-Primary_OL_comp count: 180
Factor: 20-24-In_Primary_OL              count: 202
Factor: 20-24-Primary_OL                 count: 202
Factor: population                       count: 197
Factor: 20-24_female-In_Primary_OL       count: 202
Factor: 20-24_female-Primary_OL          count: 202


In [53]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [54]:
input_df            = load_datasets_to_pd(datasets,keys)
input_df.set_index(["country","year"], inplace=True)

In [55]:
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [56]:
input_df

Unnamed: 0_level_0,Unnamed: 1_level_0,children_per_woman,co2_emissions_percapita,gini_index,gdppercapita_us_infla_adjust,20_yrs_old_gdppercapita_us_infla_adjust,70%_dropout_20-24-Primary_OL_comp,20-24-In_Primary_OL,20-24-Primary_OL,population,20-24_female-In_Primary_OL,20-24_female-Primary_OL
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
angola,1960,7.48,0.101,55.8,,,140.0,91.0,97.9,5450000.0,96.8,98.7
belgium,1960,2.60,9.920,30.5,11700.0,,44.0,8.4,37.9,9170000.0,8.0,36.1
peru,1960,6.97,0.805,57.1,2710.0,,82.0,58.4,65.9,10200000.0,65.3,71.9
sierra leone,1960,6.13,0.309,52.5,497.0,,138.0,91.1,93.8,2320000.0,95.9,97.5
jordan,1960,7.69,0.797,36.9,,,85.0,51.1,69.7,933000.0,50.2,68.6
...,...,...,...,...,...,...,...,...,...,...,...,...
spain,2015,1.35,5.810,36.2,25700.0,20000.0,81.0,1.5,9.6,46700000.0,1.4,7.6
indonesia,2015,2.39,1.960,39.1,3330.0,1930.0,106.0,3.4,21.4,258000000.0,3.3,21.5
pakistan,2015,3.55,0.926,32.1,1360.0,1010.0,123.0,38.0,49.0,199000000.0,46.0,55.7
belize,2015,2.54,1.610,53.3,4770.0,3820.0,105.0,13.1,36.8,361000.0,12.3,34.7


In [57]:
output_df

Unnamed: 0_level_0,Unnamed: 1_level_0,o_life_expectancy
country,year,Unnamed: 2_level_1
angola,1960,98.7
belgium,1960,36.1
peru,1960,71.9
sierra leone,1960,97.5
jordan,1960,68.6
...,...,...
spain,2015,7.6
indonesia,2015,21.5
pakistan,2015,55.7
belize,2015,34.7


In [58]:
input_df.isna().sum()

children_per_woman                            0
co2_emissions_percapita                     143
gini_index                                    0
gdppercapita_us_infla_adjust               1671
20_yrs_old_gdppercapita_us_infla_adjust    4497
70%_dropout_20-24-Primary_OL_comp             0
20-24-In_Primary_OL                           0
20-24-Primary_OL                              0
population                                    0
20-24_female-In_Primary_OL                    0
20-24_female-Primary_OL                       0
dtype: int64

In [59]:
input_df.shape

(8064, 11)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

### Random Forest Model

In [61]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmpsd6q64w2 as temporary training directory
Starting reading the dataset
1/6 [====>.........................] - ETA: 1s
Dataset read in 0:00:00.245027
Training model
Model trained in 0:00:05.464445
Compiling model
LIFE_EXPECTANCY
{'loss': 0.0, 'mse': 0.0311956238001585}

MSE: 0.0311956238001585
RMSE: 0.1766228292157005



In [62]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [63]:
# %set_cell_height 300
model.summary()

Model: "random_forest_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (11):
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	70__dropout_20-24-Primary_OL_comp
	children_per_woman
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL" 10.781031 ################
    2. "20_yrs_old_gdppercapita_us_infla_adjust" 10.297628 ###############
    3.                              "population"  9.905709 ##############
    4.            "gdppercapita_us_infla_adjust"  9.842118 ##############
    5.           

### Gradient Tree

In [72]:
# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

Use /tmp/tmp5e2l3ym6 as temporary training directory
Starting reading the dataset
1/6 [====>.........................] - ETA: 0s
Dataset read in 0:00:00.248100
Training model
Model trained in 0:00:02.115837
Compiling model
LIFE_EXPECTANCY
{'loss': 0.0, 'mse': 0.0018499698489904404}

MSE: 0.0018499698489904404
RMSE: 0.043011275835418326



In [73]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [74]:
model.summary()

Model: "gradient_boosted_trees_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: REGRESSION
Label: "__LABEL"

Input Features (11):
	20-24-In_Primary_OL
	20-24-Primary_OL
	20-24_female-In_Primary_OL
	20-24_female-Primary_OL
	20_yrs_old_gdppercapita_us_infla_adjust
	70__dropout_20-24-Primary_OL_comp
	children_per_woman
	co2_emissions_percapita
	gdppercapita_us_infla_adjust
	gini_index
	population

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.                                 "__LABEL"  4.957533 ################
    2.                              "gini_index"  4.886836 ###############
    3. "20_yrs_old_gdppercapita_us_infla_adjust"  4.885517 ###############
    4.                      "children_per_woman"  4.885280 #############

## Total Fertility Rate

In [None]:
PREDICT_FUTURE  = 0
OUTPUT         = 'children_per_woman'

In [None]:
datasets = [
            # "infant_mortality",
            # "life_expectancy",
            # "child_mortality",
            "children_per_woman",
            "co_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            "gdppercapita_us_infla_adjust",
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL", ,
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

In [None]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [None]:
input_df            = load_datasets_to_pd(datasets,keys)
# input_df            = add_n_yrs_old_dataset(input_df,"gdppercapita_us_infla_adjust",20)
input_df.set_index(["country","year"], inplace=True)
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [None]:
input_df

In [None]:
input_df.isna().sum()

In [None]:
input_df.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

In [None]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [None]:
# %set_cell_height 300

model.summary()

## Primary education OL

In [None]:
PREDICT_FUTURE = 0
OUTPUT         = '20-24-Primary_OL'


Factor: infant_mortality                 count: 266
Factor: life_expectancy                  count: 195
Factor: child_mortality                  count: 197
Factor: children_per_woman               count: 202
Factor: co2_emissions_percapita          count: 194
Factor: gdppercapita_us_infla_adjust     count: 207


In [None]:
datasets = [
            # "infant_mortality",
            # "life_expectancy",
            # "child_mortality",
            "children_per_woman",
            "co_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            "gdppercapita_us_infla_adjust",
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL", ,
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

In [None]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [None]:
input_df            = load_datasets_to_pd(datasets,keys)
input_df            = add_n_yrs_old_dataset(input_df,"gdppercapita_us_infla_adjust",20)
input_df.dropna(subset=["gdppercapita_us_infla_adjust"],inplace=True)
input_df.set_index(["country","year"], inplace=True)

In [None]:
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [None]:
input_df

In [None]:
input_df.isna().sum()

In [None]:
input_df.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

In [None]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [None]:
# %set_cell_height 300

model.summary()

## GDP per capita

In [None]:
created_datasets = []
created_datasets.append(create_n_yrs_gst_old_csv("gdppercapita_us_infla_adjust",20))

In [None]:
PREDICT_FUTURE  = 0
OUTPUT         = 'gdppercapita_us_infla_adjust'


In [None]:
datasets = [
            # "infant_mortality",
            # "life_expectancy",
            # "child_mortality",
            "children_per_woman",
            "co_emissions_percapita",
            "gini_index",
            "gdppercapita_us_infla_adjust",
            "gdppercapita_us_infla_adjust",
            "20-24-In_Primary_OL",
            "20-24-Primary_OL",
            # "20-24-Lower_Secondary_OL",
            "population",
            "20-24_female-In_Primary_OL",
            "20-24_female-Primary_OL", ,
            # "20-24_female-Lower_Secondary_OL",
            ]
get_countries_count(datasets)

In [None]:
# creating a list of all countries & years
countries = find_common_countries(datasets)
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]
keys      = generate_indices(countries, years)

In [None]:
input_df            = load_datasets_to_pd(datasets,keys)
input_df            = add_n_yrs_old_dataset(input_df,"gdppercapita_us_infla_adjust",20)
input_df.set_index(["country","year"], inplace=True)
input_df['o_gdppercapita_us_infla_adjust'] = np.log(input_df['o_gdppercapita_us_infla_adjust'])
input_df['gdppercapita_us_infla_adjust'+"_"+str(20)+"years_before"] = np.log(input_df['gdppercapita_us_infla_adjust'+"_"+str(20)+"years_before"])

In [None]:
output_df           = input_df[["o_" + OUTPUT]]
input_df.drop(labels=["o_" + OUTPUT], axis = 1, inplace=True)

In [None]:
print(input_df.shape)
input_df.isna().sum()

(8624, 11)


infant_mortality                                782
life_expectancy                                   0
child_mortality                                   0
children_per_woman                                0
20-24-In_Primary_OL                               0
20-24-Primary_OL                                  0
20-24-Lower_Secondary_OL                          0
20-24_female-In_Primary_OL                        0
20-24_female-Primary_OL                           0
20-24_female-Lower_Secondary_OL                   0
gdppercapita_us_infla_adjust_20years_before    4884
dtype: int64

In [None]:
input_df.describe()

In [None]:
input_df

In [None]:
X_train, X_test, y_train, y_test = train_test_split(input_df, output_df, test_size=0.30, random_state=43)

In [None]:
X_train.isna().sum()

In [None]:
y_train.isna().sum()

In [None]:

# converting pandas to tensorFlow dataset
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

model.fit(x=train_ds)
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combine_dfs(X_train,y_train), label="o_"+OUTPUT, task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)
print(OUTPUT.upper())
print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")
print()

In [None]:
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [None]:
# %set_cell_height 300

model.summary()

In [None]:
for f in created_datasets:
  if os.path.isfile(datasets_path[f]): # this makes the code more robust
        os.remove(datasets_path[f])

In [None]:
#@markdown What changes you made to datasets & why ?


approach = ''  #@param {type: "string"}


In [None]:
inspector = model.make_inspector()

In [None]:
model_name = inspector.model_type()
num_trees  = inspector.num_trees()
objective  = inspector.objective()
eval = inspector.evaluation()

In [None]:

input_features_list = inspector.features()
input_features = ""
for i in range(len(input_features_list)):
  input_features += str(i+1)+ "  "
  input_features += input_features_list[i][0] + "\n"

'1  20-24-In_Primary_OL\n2  20-24-Primary_OL\n3  20-24_female-In_Primary_OL\n4  20-24_female-Primary_OL\n5  20_yrs_old_gdppercapita_us_infla_adjust\n6  children_per_woman\n7  co2_emissions_percapita\n8  gdppercapita_us_infla_adjust\n9  gini_index\n10  population\n'

In [None]:
variable_imp = []
variable_imp.append(extract_variable_imp(variable = 'MEAN_MIN_DEPTH'))
variable_imp.append(extract_variable_imp(variable = 'SUM_SCORE'))


In [None]:
#@ Analysis
#@markdown Observations made from the output ?


anaylsis = ''  #@param {type: "string"}


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
with open('/content/drive/My Drive/document.txt', 'a') as f:
  f.write("Model Trained : " + model_name+ "\n")
  f.write("Predicting : " + OUTPUT + "\n")
  f.write("Approach : " + approach + "\n")
  f.write("Num Examples: " + str(eval.num_examples) + "\n\n")
  f.write("Input Features\n")
  f.write(input_features+"\n\n")
  f.write("Model Performance\n")
  f.write("RMSE Score : "  + str(eval.rmse)+ "\n\n")
  for v in variable_imp:
    f.write(v + "\n\n")
  f.write("Analysis : " + approach + "\n")
  f.write("\n\n\n")

  

# Save the summary

In [75]:
#@markdown What changes you made to datasets & why ?


approach = 'Added dataset containing how many yrs each country took to achieve <70% drop out for primary education '  #@param {type: "string"}


In [76]:
inspector = model.make_inspector()

In [77]:
model_name = inspector.model_type()
num_trees  = inspector.num_trees()
objective  = inspector.objective()
eval = inspector.evaluation()

In [78]:

input_features_list = inspector.features()
input_features = ""
for i in range(len(input_features_list)):
  input_features += str(i+1)+ "  "
  input_features += input_features_list[i][0] + "\n"

In [79]:
variable_imp = []
variable_imp.append(extract_variable_imp(variable = 'MEAN_MIN_DEPTH'))
variable_imp.append(extract_variable_imp(variable = 'SUM_SCORE'))


In [80]:
#@ Analysis
#@markdown Observations made from the output ?


anaylsis = ''  #@param {type: "string"}


In [81]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
with open('/content/drive/My Drive/education-impact/document.txt', 'a') as f:
  f.write("Model Trained : " + model_name+ "\n")
  f.write("Predicting : " + OUTPUT + "\n")
  f.write("Approach : " + approach + "\n")
  f.write("Num Examples: " + str(eval.num_examples) + "\n\n")
  f.write("Input Features\n")
  f.write(input_features+"\n\n")
  f.write("Model Performance\n")
  f.write("RMSE Score : "  + str(eval.rmse)+ "\n\n")
  for v in variable_imp:
    f.write(v + "\n\n")
  f.write("Analysis : " + approach + "\n")
  f.write("\n\n\n")

  

# Models to try
 - RandomForestModel
 - GradientBoostedTreesModel
 - CartModel
 - DistributedGradientBoostedTreesModel

 - KNN Regression
 - Support Vector Regression
 - Locally Weighted Scatterplot Smoothing
 - Multivariate Adaptive Regression Splines
  