In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

df = pd.read_csv('ch6_cell28_dev_feat1_filtered.tsv',sep='\t')
features,label = df.drop(['name','population'],axis=1), df.iloc[:,-1]
feature_names = list(features.columns)
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.30, random_state=42)
rf_model = RandomForestRegressor()
rf_model.fit(X_train,y_train)
pred_y = rf_model.predict(X_test)
mse_score_old = mean_squared_error(y_test,pred_y)
print("MSE value of base model is",mse_score_old)

print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf_model.feature_importances_),feature_names),reverse=True))



MSE value of base model is 0.10613839386116722
Features sorted by their score:
[(0.2365, 'http://dbpedia.org/ontology/birthPlace?inv#count'), (0.117, 'http://dbpedia.org/ontology/areaTotal#1'), (0.1009, 'http://dbpedia.org/ontology/country#1@OTHER'), (0.0639, 'http://dbpedia.org/ontology/country#1@<http://dbpedia.org/resource/India>'), (0.0458, 'http://dbpedia.org/ontology/timeZone#1@<http://dbpedia.org/resource/China_Standard_Time>'), (0.0428, 'rel#count'), (0.0391, 'http://dbpedia.org/ontology/populationDensity#1'), (0.0329, 'http://dbpedia.org/ontology/elevation#1'), (0.0197, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type#1@<http://dbpedia.org/ontology/City>'), (0.0191, 'http://dbpedia.org/ontology/utcOffset#count'), (0.017, 'http://dbpedia.org/ontology/country#1@<http://dbpedia.org/resource/Brazil>'), (0.0125, 'http://dbpedia.org/ontology/areaLand#1'), (0.0121, 'http://dbpedia.org/ontology/isPartOf#count'), (0.0115, 'http://dbpedia.org/ontology/city?inv#count'), (0.0095, 'http://

In [2]:
import skimage.filters

#Calculating a threshold value using Otsu's method used in image thresholding
f_new = np.array(features['http://dbpedia.org/ontology/birthPlace?inv#count'])
t = skimage.filters.threshold_otsu(f_new) #Otsu's method

#Data binning into 0's and 1's
c = (f_new >= t).astype(int) 
features['f_new'] = c

#Running model on new set of features
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.30, random_state=42)
rf_model.fit(X_train,y_train)
pred_y = rf_model.predict(X_test)
mse_score = mean_squared_error(y_test,pred_y)
print("MSE value of base model is",mse_score_old)
print("Updated MSE value is",mse_score)

MSE value of base model is 0.10613839386116722
Updated MSE value is 0.1044993194622291
