In [1]:
# Evaluate importances of different features

# In the metrics evaluation, random forest was chosen as
# the model to be used.
# So, the feature importances are evaluated for the
# random forest only.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from configparser import ConfigParser

#import warnings
print("Libraries imported")

Libraries imported


In [2]:
# Open the importances file.
# ConfigParser.read() silently skips a file it cannot open and leaves the
# parser empty, so open the required file explicitly: a missing or unreadable
# file then raises OSError here instead of a KeyError in a later cell.

config = ConfigParser(allow_no_value=True)
filename = "output_all/importance.conf"
with open(filename) as conf_file:
    config.read_file(conf_file)
config.sections()

['polynomial', 'sgd', 'svr', 'forest', 'knn']

In [3]:
# Also read in the names of the features used.
# ConfigParser.read() silently skips a file it cannot open and leaves the
# parser empty, so open the required file explicitly: a missing or unreadable
# file then raises OSError here instead of a KeyError in a later cell.

config_feature = ConfigParser(allow_no_value=True)
filename = "output_all/features.conf"
with open(filename) as conf_file:
    config_feature.read_file(conf_file)
config_feature.sections()

['features', 'label']

In [4]:
# Collect the names of the features that were selected (flag is 'True')

feature_flags = config_feature['features']

# Echo every feature together with its flag
for k, v in feature_flags.items():
    print(f"{k} = {v}")

# Keep only the features whose flag is exactly the string 'True'
feature_names = [name for name, flag in feature_flags.items() if flag == 'True']

print(feature_names)

direct_admission = True
cca = True
learning_style = True
tuition = True
hours_per_week = True
attendance_rate = True
sleep_duration = True
sleep_enough = True
attendance_enough = True
number_of_siblings_0 = True
number_of_siblings_1 = True
number_of_siblings_2 = True
['direct_admission', 'cca', 'learning_style', 'tuition', 'hours_per_week', 'attendance_rate', 'sleep_duration', 'sleep_enough', 'attendance_enough', 'number_of_siblings_0', 'number_of_siblings_1', 'number_of_siblings_2']


In [5]:
# ConfigParser returns every value as a string, so the stored importances
# come back as the printed text of an array rather than as numbers
s = config['forest']['importances_mean']
s


'[2.82414419 1.62504549 2.46429073 2.04280771 4.12932336 2.51049243\n0.19465569 0.09617319 0.48906955 2.6646175  1.18457411 0.44382001]'

In [6]:
# Remove the enclosing square brackets from the string
s = s.strip("[]")
s

'2.82414419 1.62504549 2.46429073 2.04280771 4.12932336 2.51049243\n0.19465569 0.09617319 0.48906955 2.6646175  1.18457411 0.44382001'

In [7]:
# Replace the embedded newline with a space so all values sit on one line
s = s.replace("\n", " ")
s

'2.82414419 1.62504549 2.46429073 2.04280771 4.12932336 2.51049243 0.19465569 0.09617319 0.48906955 2.6646175  1.18457411 0.44382001'

In [8]:
# Convert the whitespace-separated text into a list of float importances
value_list = s.split()
print(value_list)

impt = [float(token) for token in value_list]

print(impt)

['2.82414419', '1.62504549', '2.46429073', '2.04280771', '4.12932336', '2.51049243', '0.19465569', '0.09617319', '0.48906955', '2.6646175', '1.18457411', '0.44382001']
[2.82414419, 1.62504549, 2.46429073, 2.04280771, 4.12932336, 2.51049243, 0.19465569, 0.09617319, 0.48906955, 2.6646175, 1.18457411, 0.44382001]


In [9]:
# Pair each feature name with its mean importance in one frame for plotting
impt_df = pd.DataFrame({
    'feature_names': feature_names,
    'importances_mean': impt,
})

In [10]:
# Rank the features from the most to the least important

(
    impt_df
    .sort_values('importances_mean', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,feature_names,importances_mean
0,hours_per_week,4.129323
1,direct_admission,2.824144
2,number_of_siblings_0,2.664617
3,attendance_rate,2.510492
4,learning_style,2.464291
5,tuition,2.042808
6,cca,1.625045
7,number_of_siblings_1,1.184574
8,attendance_enough,0.48907
9,number_of_siblings_2,0.44382


From the ranking, the revision `hours_per_week` has the highest importance.

`direct_admission` ranks second in the importances. The boxplot in the EDA also shows that its mean differs from the mean of those who are not `direct_admission`.

The `number_of_siblings` feature was encoded with `get_dummies`, hence the `_0`, `_1` and `_2` suffixes. Having 0 siblings gives the highest mean `final_test` score, which is also reflected in its importance for the regression.

`direct_admission` and `number_of_siblings_0` rank second and third respectively. It is surprising that `number_of_siblings_0` can be so important to the random forest.

For `sleep_duration` and `sleep_enough`, the interquartile range captures most of the samples, so these features are general and do not contribute much to the score regression. Thus, their rankings are rather low.
