In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('ch6_cell28_dev_feat1_filtered.tsv.gz', sep='\t')
data = data.select_dtypes(include=['float64'])

In [3]:
labels = np.array(data['population'])
features_df = data.drop('population', axis = 1)
feature_list = list(features_df.columns)
features = np.array(features_df)

In [4]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [5]:
rf = RandomForestRegressor(n_estimators = 5)
rf.fit(train_features, train_labels);

In [6]:
predictions = rf.predict(test_features)
mape = 100 * (abs(predictions - test_labels) / test_labels)
accuracy = 100 - np.mean(mape)
print('Accuracy:', accuracy, '%')

Accuracy: 93.99720969951022 %


In [7]:
# Get  feature importances
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:10} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: http://dbpedia.org/ontology/birthPlace?inv#count Importance: 0.23
Variable: http://dbpedia.org/ontology/areaTotal#1 Importance: 0.12
Variable: http://dbpedia.org/ontology/country#1@OTHER Importance: 0.1
Variable: http://dbpedia.org/ontology/country#1@<http://dbpedia.org/resource/India> Importance: 0.06
Variable: http://dbpedia.org/ontology/timeZone#1@<http://dbpedia.org/resource/China_Standard_Time> Importance: 0.05
Variable: rel#count  Importance: 0.04
Variable: http://dbpedia.org/ontology/populationDensity#1 Importance: 0.04
Variable: http://dbpedia.org/ontology/elevation#1 Importance: 0.03
Variable: http://dbpedia.org/ontology/areaLand#1 Importance: 0.02
Variable: http://dbpedia.org/ontology/utcOffset#count Importance: 0.02
Variable: http://www.w3.org/1999/02/22-rdf-syntax-ns#type#1@<http://dbpedia.org/ontology/City> Importance: 0.02
Variable: http://dbpedia.org/ontology/city?inv#count Importance: 0.01
Variable: http://dbpedia.org/ontology/country#1@<http://dbpedia.org/res

In [8]:
data2 = data
data2['new_Feature'] = np.where(data2['http://dbpedia.org/ontology/birthPlace?inv#count']<=0.5, 0, 1)
# replace the feature
#data2 = data2.drop('http://dbpedia.org/ontology/country#1@<http://dbpedia.org/resource/Brazil>', axis=1)  
labels = np.array(data2['population'])
features_df = data2.drop('population', axis = 1)
feature_list = list(features_df.columns)
features = np.array(features_df)
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)
rf = RandomForestRegressor(n_estimators = 5, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels)
# predict method on the test data
predictions = rf.predict(test_features)
# Calculate mean absolute percentage error 
mape = 100 * (abs(predictions - test_labels) / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')


Accuracy: 93.98 %
