In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


In [54]:
# these are the independent variables
df = pd.read_csv("../input/trends-assessment-prediction/loading.csv")
features = list(df.columns[1:])

# these are the dependent variables
labels_df = pd.read_csv("../input/trends-assessment-prediction/train_scores.csv")

df = df.merge(labels_df, on="Id", how="left")
df.dropna()


Unnamed: 0,Id,IC_01,IC_07,IC_05,IC_16,IC_26,IC_06,IC_10,IC_09,IC_18,...,IC_20,IC_30,IC_22,IC_29,IC_14,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
0,10001,0.006070,0.014466,0.004136,0.000658,-0.002742,0.005033,0.016720,0.003484,0.001797,...,0.010496,0.002892,-0.023235,0.022177,0.017192,57.436077,30.571975,62.553736,53.325130,51.427998
1,10002,0.009087,0.009291,0.007049,-0.002076,-0.002227,0.004605,0.012277,0.002946,0.004086,...,0.005739,0.002880,-0.016609,0.025543,0.014524,59.580851,50.969456,67.470628,60.651856,58.311361
3,10004,0.004675,0.000957,0.006154,-0.000429,-0.001222,0.011755,0.013010,0.000193,0.008075,...,-0.000319,0.005866,-0.015182,0.024476,0.014760,71.413018,53.152498,58.012103,52.418389,62.536641
6,10007,0.005192,0.010585,0.012160,-0.000920,-0.002255,0.011416,0.013838,0.001929,0.003051,...,0.003731,0.000733,-0.008462,0.026733,0.014358,38.617381,49.197021,65.674285,40.151376,34.096421
7,10008,0.007745,0.009748,0.009356,-0.004219,-0.003852,0.012024,0.010205,0.002903,0.000870,...,0.004483,0.000688,-0.013822,0.029328,0.010936,35.326582,15.769168,65.782269,44.643805,50.448485
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11745,21746,-0.001115,0.007108,0.008652,0.003596,0.000950,0.016314,0.017090,0.003513,0.004217,...,0.006943,0.003312,-0.011562,0.032932,0.011053,14.257265,21.358872,61.165998,51.778483,54.640179
11746,21747,0.007263,0.016489,0.012704,0.004357,-0.005044,0.013909,0.019284,-0.006267,-0.000456,...,0.001316,0.003792,-0.022357,0.031624,0.016982,55.456978,68.169675,29.907995,55.349257,54.019517
11749,21750,0.005996,0.003873,0.012353,0.000242,-0.002159,0.020201,0.020931,0.003684,-0.002458,...,0.004942,0.007751,-0.020226,0.028821,0.017492,48.948756,55.114811,60.878271,38.617246,50.679885
11751,21752,0.000627,0.011407,0.010957,0.000534,-0.000347,0.013499,0.010541,0.001867,0.007447,...,0.002026,0.001876,-0.014612,0.021665,0.019592,66.532630,59.844808,72.303110,55.458281,46.870235


In [55]:
# How a regression tree works: create a step function by learning the optimal boundary between steps
# by minimizing sum of square residuals against every possible boundary in the data
# then use the average value (or a linear regression?) for each step, to calculate the prediction of y
# it's typical to truncate step splitting at steps of size <=20 to prevent overfitting

# How a random forest works: a random forest is a collection of decision trees where the boundaries
# are learned from only a randomly selected subset of k of the possible regressors  at each step,
# on top of the bootstrapped data set. Then take an average of all the trees' predictions.

train_df, test_df = train_test_split(df, test_size=0.33, shuffle=True)

# Create the random forest regressor
# a bootstrapped dataset is sample that was randomly sampled with replacement from some source dataset
model = RandomForestRegressor(n_estimators=100, criterion="mse", bootstrap = True)
model.fit(train_df[features], train_df["age"])

print("Accuracy score of Random Forest Regressor on age in training set")
print(model.score(train_df[features], train_df["age"]))



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Now we can create predictions and evaluate our model

y_predicted = model.predict(test_df[features])

print("Accuracy score of Random Forest Regressor on predicting age in test set")
print(model.score(test_df[features], test_df["age"]))

In [None]:
# It would be nice to have a visual of how well we predicted ages, so convert our prediction to a
# classification by decade of age, and plot of confusion matrix to show the accuracy of our prediction
cm = confusion_matrix(test_df["age"].round(-1).astype(int), y_predicted.round(-1).astype(int))

plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt = "d")
plt.xlabel('Predicted (Age in decades)')
plt.ylabel('Actual (Age in decades)')