<a href="https://colab.research.google.com/github/jaya-shankar/education-impact/blob/master/randomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!rm -rf education-impact
!rm education-impact

rm: cannot remove 'education-impact': No such file or directory


In [None]:
# Fetch the project repository; the dataset paths defined further down point into this checkout.
!git clone https://github.com/jaya-shankar/education-impact.git


In [7]:
!pip install tensorflow_decision_forests
!pip install wurlitzer

Collecting wurlitzer
  Downloading wurlitzer-3.0.2-py3-none-any.whl (7.3 kB)
Installing collected packages: wurlitzer
Successfully installed wurlitzer-3.0.2


In [8]:
# Directory of the repository checkout; every dataset path is built on top of it.
root = "education-impact/"

# (factor name, path inside the checkout) for the single-file indicators.
# The order is significant: it becomes the column order of the assembled table.
_indicator_files = [
    ("infant_mortality",     "datasets/Infant_Mortality_Rate.csv"),
    ("child_mortality",      "datasets/child_mortality_0_5_year_olds_dying_per_1000_born.csv"),
    ("children_per_woman",   "datasets/children_per_woman_total_fertility.csv"),
    ("co2_emissions",        "datasets/co2_emissions_tonnes_per_person.csv"),
    ("population",           "datasets/converted_pop.csv"),
    ("food_supply",          "datasets/food_supply_kilocalories_per_person_and_day.csv"),
    ("gdp_per_captia",       "datasets/gdp_per_capita_yearly_growth.csv"),
    ("gini_index",           "datasets/gini.csv"),
    ("life_expectancy",      "datasets/life_expectancy_years.csv"),
    ("malnutrition",         "datasets/malnutrition_weight_for_age_percent_of_children_under_5.csv"),
    ("poverty_index",        "datasets/mincpcap_cppp.csv"),
    ("maternal_mortality",   "datasets/mmr_who.csv"),
    ("people_in_poverty",    "datasets/number_of_people_in_poverty.csv"),
    ("primary_completion",   "datasets/primary_school_completion_percent_of_girls.csv"),
    ("ratio_b/g_in_primary", "datasets/ratio_of_girls_to_boys_in_primary_and_secondary_education_perc.csv"),
]

# Suffixes of the "wcde-<band>" files; key and file name share the same suffix.
_wcde_bands = [
    "15--24", "25--34", "35--44", "45--54", "55--64",
    "65--74", "75--84", "85--94", "95--",
]

# Factor name -> CSV path, single-file indicators first, then the wcde bands.
datasets_path = {name: root + rel_path for name, rel_path in _indicator_files}
datasets_path.update(
    {"wcde-" + band: root + "datasets/wcde-" + band + ".csv" for band in _wcde_bands}
)

In [9]:
import pandas as pd
import numpy as np
import math
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from wurlitzer import sys_pipes

In [10]:
# Report, for every dataset, how many distinct values its Country column holds.
countries_arr = []  # kept from the original cell; not referenced in the visible notebook
for factor, csv_path in datasets_path.items():
    factor_df = pd.read_csv(csv_path)
    n_countries = len(factor_df["Country"].unique())
    print(f"{'Factor: ' + factor:<30} count: {n_countries}")
  

Factor: infant_mortality       count: 266
Factor: child_mortality        count: 197
Factor: children_per_woman     count: 202
Factor: co2_emissions          count: 194
Factor: population             count: 197
Factor: food_supply            count: 179
Factor: gdp_per_captia         count: 221
Factor: gini_index             count: 195
Factor: life_expectancy        count: 195
Factor: malnutrition           count: 156
Factor: poverty_index          count: 195
Factor: maternal_mortality     count: 184
Factor: people_in_poverty      count: 145
Factor: primary_completion     count: 195
Factor: ratio_b/g_in_primary   count: 200
Factor: wcde-15--24            count: 202
Factor: wcde-25--34            count: 202
Factor: wcde-35--44            count: 202
Factor: wcde-45--54            count: 202
Factor: wcde-55--64            count: 202
Factor: wcde-65--74            count: 202
Factor: wcde-75--84            count: 202
Factor: wcde-85--94            count: 202
Factor: wcde-95--              cou

From the above output:
- **malnutrition & people in poverty** cover the fewest countries
- **infant mortality & GDP per capita** cover the most countries

*Doubt:* Will having more data for one factor bias the decision tree?


### Steps
1. Create a CSV file in which each row contains all the values available for a particular year & country.
2. The output for each row is the corresponding value 40 years after that row's year.
    1. **Outputs** - life expectancy, education level, GDP




In [11]:
# Forecast horizon in years: targets are read this many years after a row's input year.
PREDICT_FUTURE = 40

# Factors whose future value is used as a prediction target (keys of datasets_path).
OUTPUTS = [
    'life_expectancy',
    'gdp_per_captia',
    'primary_completion',
]

In [12]:
# creating a list of all countries & years
countries = list(pd.read_csv('education-impact/datasets/Infant_Mortality_Rate.csv').Country.unique())
years     = [y for y in range(1960,2015-PREDICT_FUTURE+1)]

In [13]:
# One (country, year) key per table row, ordered by year first and country second.
# The year is stored as a string because it is later used as a column label.
keys = [(country, str(year)) for year in years for country in countries]

In [14]:
# For every (country, year) key collect one value per dataset, in datasets_path order.
# A country or year that a dataset does not contain is recorded as NaN.
big_dic = {k: [] for k in keys}
for path in datasets_path:
    df = pd.read_csv(datasets_path[path]).set_index("Country")
    for country, year in keys:
        try:
            value = df.loc[country][year]
        except KeyError:
            # Only a missing row/column label means "no data". Anything else
            # (including a keyboard interrupt) must surface, not be swallowed.
            value = np.nan
        big_dic[(country, year)].append(value)
 

In [15]:
# Append the targets: for each factor in OUTPUTS, the value PREDICT_FUTURE years
# after the row's own year (NaN when that country/year is not in the dataset).

# Each row holds one value per input dataset before targets are added. Trimming
# back to that width makes the cell safe to re-run (no-op on the first run);
# without it every re-run would append another set of targets to each row.
n_inputs = len(datasets_path)
for row_values in big_dic.values():
    del row_values[n_inputs:]

for output_path in OUTPUTS:
    df = pd.read_csv(datasets_path[output_path]).set_index("Country")
    for country, year in keys:
        target_year = str(int(year) + PREDICT_FUTURE)
        try:
            value = df.loc[country][target_year]
        except KeyError:
            # Only a missing row/column label means "no data"; other errors
            # are left to propagate.
            value = np.nan
        big_dic[(country, year)].append(value)

In [16]:
# Column labels of the assembled table: the input factors in datasets_path
# order, followed by the targets, which carry an "o_" prefix.
output_columns = ["o_" + name for name in OUTPUTS]
columns = [*datasets_path, *output_columns]

In [17]:
# Turn the collected rows into a table (one row per key), then split it
# column-wise into the feature frame and the target frame.
label_columns = ["o_" + name for name in OUTPUTS]
input_df = pd.DataFrame.from_dict(big_dic, orient='index', columns=columns)
output_df = input_df[label_columns]
input_df = input_df.drop(columns=label_columns)

From the above output:
- if we don't drop any rows, the table has 4256 entries
- if we drop only the rows in which all of the outputs are missing, the table has 3039 entries
- if we drop the rows in which any one of the outputs is missing, the table has 1745 entries

So I think it is better to go with the second choice and build different models, but I am not sure whether this will affect the performance of the model.


Now we have the dataframe containing both inputs and outputs. Our next steps are:
1. Split the data into train & test data
  1. Try to split the data based on continents to reduce bias
2. Build a DF (decision forest) model using TensorFlow
3. Check the accuracy of the model

In [18]:
# Hold out 30% of the rows for evaluation; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    input_df,
    output_df,
    test_size=0.30,
    random_state=43,
)

In [19]:
# Training table for the life-expectancy model: all features plus that one
# target, restricted to rows where the target is present.
frames = [X_train, y_train['o_life_expectancy']]
le_model_df = pd.concat(frames, axis=1).dropna(subset=['o_life_expectancy'])
le_model_df

Unnamed: 0,infant_mortality,child_mortality,children_per_woman,co2_emissions,population,food_supply,gdp_per_captia,gini_index,life_expectancy,malnutrition,poverty_index,maternal_mortality,people_in_poverty,primary_completion,ratio_b/g_in_primary,wcde-15--24,wcde-25--34,wcde-35--44,wcde-45--54,wcde-55--64,wcde-65--74,wcde-75--84,wcde-85--94,wcde-95--,o_life_expectancy
"(Senegal, 1975)",112.2,252.0,7.29,0.5230,4930000.0,2200.0,5.010,54.7,49.9,,2.63,,,,,4.18,3.71,1.88,1.10,0.78,0.63,0.51,0.39,0.29,67.0
"(Andorra, 1962)",,32.6,,,15400.0,,3.800,40.0,75.9,,22.10,,,,,,,,,,,,,,81.1
"(Comoros, 1966)",,255.0,7.03,0.0867,211000.0,,8.340,46.5,49.8,,5.53,,,,,,,,,,,,,,64.4
"(Ukraine, 1970)",,34.9,2.05,9.2900,47100000.0,,6.690,31.4,70.9,,6.11,,,,,18.58,19.42,14.88,7.92,3.56,1.69,0.97,0.49,0.18,70.8
"(Lithuania, 1970)",17.9,23.1,2.31,6.5100,3140000.0,,6.690,26.3,71.3,,6.59,,,,,19.69,22.26,16.58,13.14,10.21,7.69,5.54,3.76,2.44,73.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(Senegal, 1974)",115.6,261.0,7.30,0.3860,4800000.0,2260.0,1.090,54.7,49.4,,2.50,,,,,,,,,,,,,,66.8
"(St. Kitts and Nevis, 1962)",,114.0,,,51000.0,1780.0,1.200,40.0,60.0,,2.63,,,,,,,,,,,,,,70.7
"(Cote d'Ivoire, 1961)",206.7,309.0,7.43,0.1520,3630000.0,2330.0,9.530,47.7,46.1,,6.79,,,,,,,,,,,,,,51.5
"(Nicaragua, 1968)",120.2,180.0,6.94,0.5440,2260000.0,2130.0,-1.810,49.2,56.5,,8.43,,,,,,,,,,,,,,74.0


In [23]:
# Wrap the training table in a TensorFlow dataset with o_life_expectancy as the regression label.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    le_model_df,
    label='o_life_expectancy',
    task=tfdf.keras.Task.REGRESSION,
)

In [24]:
# Random-forest regressor trained on the life-expectancy training dataset.
model = tfdf.keras.RandomForestModel(
    task=tfdf.keras.Task.REGRESSION,
)
# Fit inside wurlitzer's sys_pipes context (presumably what brings the native
# training log into the cell output — confirm).
with sys_pipes():
    model.fit(x=train_ds)



[INFO kernel.cc:736] Start Yggdrasil model training
[INFO kernel.cc:737] Collect training examples
[INFO kernel.cc:392] Number of batches: 32
[INFO kernel.cc:393] Number of examples: 2000
[INFO kernel.cc:759] Dataset:
Number of records: 2000
Number of columns: 25

Number of columns by type:
	NUMERICAL: 25 (100%)

Columns:

NUMERICAL: 25 (100%)
	0: "child_mortality" NUMERICAL mean:134.307 min:9.54 max:423 sd:95.2927
	1: "children_per_woman" NUMERICAL num-nas:103 (5.15%) mean:5.29917 min:1.5 max:8.37 sd:1.8935
	2: "co2_emissions" NUMERICAL num-nas:175 (8.75%) mean:3.91127 min:0 max:101 sd:8.67813
	3: "food_supply" NUMERICAL num-nas:638 (31.9%) mean:2355.13 min:1310 max:3580 sd:495.106
	4: "gdp_per_captia" NUMERICAL mean:3.05844 min:-31.3 max:145 sd:6.70481
	5: "gini_index" NUMERICAL mean:40.1922 min:16 max:77 sd:10.5302
	6: "infant_mortality" NUMERICAL num-nas:589 (29.45%) mean:86.4636 min:8.7 max:227.8 sd:54.1238
	7: "life_expectancy" NUMERICAL mean:59.5651 min:17.1 max:77.7 sd:10.0422


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported


Cause: while/else statement not yet supported


Cause: while/else statement not yet supported




In [25]:
# Held-out table for the life-expectancy model: all features plus that one
# target, restricted to rows where the target is present.
frames = [X_test, y_test['o_life_expectancy']]
le_model_test_df = pd.concat(frames, axis=1).dropna(subset=['o_life_expectancy'])
le_model_test_df

Unnamed: 0,infant_mortality,child_mortality,children_per_woman,co2_emissions,population,food_supply,gdp_per_captia,gini_index,life_expectancy,malnutrition,poverty_index,maternal_mortality,people_in_poverty,primary_completion,ratio_b/g_in_primary,wcde-15--24,wcde-25--34,wcde-35--44,wcde-45--54,wcde-55--64,wcde-65--74,wcde-75--84,wcde-85--94,wcde-95--,o_life_expectancy
"(Ukraine, 1971)",27.9,34.2,2.06,9.7900,47400000.0,,1.65,31.3,71.0,,6.24,,,,,,,,,,,,,,71.5
"(St. Kitts and Nevis, 1960)",,127.0,,,51200.0,,1.20,40.0,58.3,,2.57,,,,,,,,,,,,,,69.9
"(Portugal, 1967)",61.5,77.6,3.13,1.4200,8750000.0,2950.0,7.62,45.8,66.3,,8.24,,,,,,,,,,,,,,79.4
"(Mauritania, 1971)",103.5,191.0,6.77,0.3380,1180000.0,1960.0,-0.68,40.3,55.2,,2.30,,,,,,,,,,,,,,68.4
"(Uganda, 1960)",131.4,223.0,7.00,0.0623,6770000.0,,1.84,53.5,49.0,,3.14,,,,,5.78,4.23,3.22,2.84,1.39,0.42,0.12,0.03,0.01,49.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(Mexico, 1969)",78.8,112.0,6.83,2.0500,49900000.0,2510.0,3.08,50.0,61.2,,5.55,,,,,,,,,,,,,,75.0
"(Rwanda, 1969)",129.0,218.0,8.22,0.0161,3640000.0,2260.0,7.71,37.3,43.5,,1.68,,,,,,,,,,,,,,63.4
"(India, 1967)",147.6,222.0,5.75,0.3310,520000000.0,1990.0,5.90,31.4,46.4,,2.01,,,,,,,,,,,,,,65.8
"(Nepal, 1974)",161.9,243.0,5.85,0.0324,13100000.0,1760.0,3.96,31.6,50.3,,1.42,,,,,,,,,,,,,,69.9


In [28]:
# Convert it to a TensorFlow dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(le_model_test_df, label='o_life_expectancy', task=tfdf.keras.Task.REGRESSION)

# Evaluate the model
model.compile(metrics=["mse"])
# Evaluate the model on the test dataset.
evaluation = model.evaluate(test_ds, return_dict=True)

print(evaluation)
print()
print(f"MSE: {evaluation['mse']}")
print(f"RMSE: {math.sqrt(evaluation['mse'])}")

{'loss': 0.0, 'mse': 5.545248985290527}

MSE: 5.545248985290527
RMSE: 2.354835235274546


In [29]:
# Render the tree at index 0 of the trained forest using the Colab plotter.
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)