In [1]:
!pip3 install -U ucimlrepo



In [2]:
import pandas as pd
from ucimlrepo import fetch_ucirepo, list_available_datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
nursery = fetch_ucirepo(id=76) 

In [4]:
X = nursery.data.features 
y = nursery.data.targets 

In [5]:
df = pd.DataFrame(data=X, columns=nursery.data.feature_names)
df['target'] = y
df

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,target
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
...,...,...,...,...,...,...,...,...,...
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority,spec_prior
12956,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom,not_recom
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended,spec_prior
12958,great_pret,very_crit,foster,more,critical,inconv,problematic,priority,spec_prior


In [6]:
df['target'].value_counts()

target
not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: count, dtype: int64

In [7]:
binary_df = df.copy()

In [8]:
def isRecommend(val):
    if val == 'not_recom':
        return val
    return 'recommend'

binary_df['target'] = df['target'].apply(isRecommend) 

In [9]:
binary_df['target'].value_counts()

target
recommend    8640
not_recom    4320
Name: count, dtype: int64

In [10]:
binary_df['health'].value_counts()

health
recommended    4320
priority       4320
not_recom      4320
Name: count, dtype: int64

In [11]:
df_encoded = pd.get_dummies(binary_df, drop_first=True, dtype='int')
df_encoded

Unnamed: 0,parents_pretentious,parents_usual,has_nurs_improper,has_nurs_less_proper,has_nurs_proper,has_nurs_very_crit,form_completed,form_foster,form_incomplete,children_2,children_3,children_more,housing_critical,housing_less_conv,finance_inconv,social_problematic,social_slightly_prob,health_priority,health_recommended,target_recommend
0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12955,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,0,1,1,0,1
12956,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,0,1,0,0,0
12957,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,1,0,0,1,1
12958,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,1,0,1,0,1


In [12]:
X_encoded = df_encoded.drop('target_recommend', axis=1)
y_encoded = df_encoded['target_recommend']
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, random_state=1)

In [13]:
model = RandomForestClassifier(random_state=1)

In [14]:
model.fit(X_train, y_train)

In [15]:
model.score(X_test, y_test)

1.0

In [16]:
accuracy_score(y_test, model.predict(X_test))

1.0

In [17]:
sum(model.feature_importances_)


1.0

In [18]:
model.feature_importances_

array([0.00299835, 0.00278493, 0.0020042 , 0.00181644, 0.00227344,
       0.00214519, 0.00252027, 0.00239395, 0.00234079, 0.00220763,
       0.00216981, 0.00212305, 0.00292467, 0.00291862, 0.00390672,
       0.00368118, 0.00324679, 0.48769236, 0.46785162])

In [19]:
column_names = X_train.columns
# Create a dictionary to map column names to feature importances
importance_dict = dict(zip(column_names, model.feature_importances_))

# Print the feature importances with column names
for feature, importance in importance_dict.items():
    print(f"{feature}: {importance}")

parents_pretentious: 0.0029983500316205945
parents_usual: 0.0027849337445849435
has_nurs_improper: 0.002004200226921632
has_nurs_less_proper: 0.0018164371676570059
has_nurs_proper: 0.002273440976867203
has_nurs_very_crit: 0.002145190510661406
form_completed: 0.002520269516170771
form_foster: 0.0023939490871104185
form_incomplete: 0.002340786816413227
children_2: 0.0022076282769873568
children_3: 0.002169806650946779
children_more: 0.002123045118406424
housing_critical: 0.0029246735476370672
housing_less_conv: 0.002918615361181348
finance_inconv: 0.003906718213445249
social_problematic: 0.003681184866731239
social_slightly_prob: 0.0032467915987206466
health_priority: 0.4876923621002604
health_recommended: 0.4678516161876763


In [20]:
multi_df = df.copy()
multi_df['target'].value_counts()

target
not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: count, dtype: int64

In [21]:
indices = (multi_df['target'] == 'very_recom') | (multi_df['target'] == 'recommend')
multi_df = multi_df.drop(multi_df[indices].index)

In [22]:
multi_df['target'].value_counts()

target
not_recom     4320
priority      4266
spec_prior    4044
Name: count, dtype: int64

In [39]:
X = pd.get_dummies(multi_df.drop('target', axis=1), drop_first=True, dtype='int')
X

Unnamed: 0,parents_pretentious,parents_usual,has_nurs_improper,has_nurs_less_proper,has_nurs_proper,has_nurs_very_crit,form_completed,form_foster,form_incomplete,children_2,children_3,children_more,housing_critical,housing_less_conv,finance_inconv,social_problematic,social_slightly_prob,health_priority,health_recommended
1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0
5,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
6,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12955,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,0,1,1,0
12956,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,0,1,0,0
12957,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,1,0,0,1
12958,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,1,0,1,0


In [40]:
y = pd.get_dummies(multi_df['target'], dtype='int')
y

Unnamed: 0,not_recom,priority,spec_prior
1,0,1,0
2,1,0,0
4,0,1,0
5,1,0,0
6,0,1,0
...,...,...,...
12955,0,0,1
12956,1,0,0
12957,0,0,1
12958,0,0,1


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [42]:
model = RandomForestClassifier(random_state=1, n_estimators=512)
model.fit(X_train, y_train)

In [43]:
model.score(X_test, y_test)

0.9851171627612413

In [33]:
column_names = X_train.columns
# Create a dictionary to map column names to feature importances
importance_dict = dict(zip(column_names, model.feature_importances_))
importance_list = []

# Print the feature importances with column names
for feature, importance in importance_dict.items():
    print(f"{feature}: {importance}")
    importance_list.append({feature: importance})
sorted_list = sorted(importance_list, key=lambda x: list(x.values())[0], reverse=True)
sorted_list

parents_pretentious: 0.0388657101176512
parents_usual: 0.05719843649437825
has_nurs_improper: 0.028497609253430374
has_nurs_less_proper: 0.056568907772925774
has_nurs_proper: 0.05391377873163199
has_nurs_very_crit: 0.04698578171249936
form_completed: 0.00943540770629846
form_foster: 0.013768926123685216
form_incomplete: 0.011239977507190581
children_2: 0.011072950781608526
children_3: 0.016165401217203303
children_more: 0.015792227461563293
housing_critical: 0.021783698547180645
housing_less_conv: 0.018616109172604547
finance_inconv: 0.018687286535716115
social_problematic: 0.024244538885219918
social_slightly_prob: 0.0086256677754716
health_priority: 0.2764701435749043
health_recommended: 0.27206744062883664


[{'health_priority': 0.2764701435749043},
 {'health_recommended': 0.27206744062883664},
 {'parents_usual': 0.05719843649437825},
 {'has_nurs_less_proper': 0.056568907772925774},
 {'has_nurs_proper': 0.05391377873163199},
 {'has_nurs_very_crit': 0.04698578171249936},
 {'parents_pretentious': 0.0388657101176512},
 {'has_nurs_improper': 0.028497609253430374},
 {'social_problematic': 0.024244538885219918},
 {'housing_critical': 0.021783698547180645},
 {'finance_inconv': 0.018687286535716115},
 {'housing_less_conv': 0.018616109172604547},
 {'children_3': 0.016165401217203303},
 {'children_more': 0.015792227461563293},
 {'form_foster': 0.013768926123685216},
 {'form_incomplete': 0.011239977507190581},
 {'children_2': 0.011072950781608526},
 {'form_completed': 0.00943540770629846},
 {'social_slightly_prob': 0.0086256677754716}]

In [29]:
y_pred = model.predict(X_test)
y_pred

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       ...,
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]])

In [30]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

ValueError: multilabel-indicator is not supported