In [100]:
# This notebook compares three classifiers: kNN, logistic regression and a decision tree.
# Load the modules needed for the task.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import os

In [101]:
# Load the mushroom data directly from its URL for simplicity. The column
# names are passed in explicitly; 'class' is the target and the remaining
# 22 entries are the descriptive attributes.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
names = [
    'class',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]
mushrooms = pd.read_csv(url, names=names)

# Show the first rows as a quick sanity check of the parsed frame.
mushrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [102]:
# Summarise the dataset (column names, non-null counts, dtypes).
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
mushrooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [103]:
# One-hot encode every categorical attribute; only the 'class' target column
# is left untouched.
# NOTE(review): this overwrites `mushrooms`, so re-running the cell without
# reloading the data will likely fail because the original columns are gone.
mushrooms = pd.get_dummies(
    data=mushrooms,
    columns=[col for col in names if col != 'class'],
)

In [104]:
# Remove 'veil-type': per the author's note it has a single value in this
# dataset, so it cannot help separate edible from poisonous mushrooms.
# The match covers both the raw column ('veil-type') and its one-hot encoded
# form ('veil-type_<value>'), so the cell works regardless of whether the
# encoding has been applied, and it is a no-op when re-run.
mushrooms = mushrooms.drop(
    columns=mushrooms.filter(regex=r'^veil-type(_|$)').columns
)

# Preview the frame to confirm that no veil-type column remains.
mushrooms.head()

Unnamed: 0,class,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,p,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,e,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,p,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [105]:
# Split the encoded frame into the feature matrix and the target labels,
# then hold out 20% of the rows as a test set (seeded for repeatability).
y = mushrooms['class']
X = mushrooms.drop(columns=['class'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [106]:
# Fit the k-nearest-neighbours classifier on the training split.
k = 15  # number of neighbours consulted per prediction
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X=X_train, y=y_train)

In [107]:
# Predict class labels for the held-out test set with the fitted kNN model.
y_pred = knn.predict(X=X_test)

In [108]:
# Accuracy of the kNN model on the held-out test set.
accuracy = knn.score(X=X_test, y=y_test)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [109]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1-score for the most recent predictions
# stored in `y_pred`, compared against the true test labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           e       1.00      1.00      1.00       852
           p       1.00      1.00      1.00       773

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [110]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the kNN classifier over the full dataset.
scores = cross_val_score(estimator=knn, X=X, y=y, cv=10)

# Summarise the fold scores by their mean and standard deviation.
mean_score = scores.mean()
std_score = scores.std()
print('Mean accuracy:', mean_score)
print('Standard deviation:', std_score)

Mean accuracy: 0.9560731099921836
Standard deviation: 0.09523823524218636


In [126]:
from sklearn.feature_selection import SelectKBest, chi2

# Pick the 10 features with the highest chi-squared score on the training split.
# NOTE(review): the selector is fitted on X_train/y_train only and never uses
# `knn`, so the "kNN" label printed below does not reflect a model-specific
# ranking. The selected names are listed in column order.
selector = SelectKBest(score_func=chi2, k=10).fit(X_train, y_train)
viktigste_features = selector.get_support(indices=True)
feature_names = X.columns[viktigste_features]

print('De 10 viktigste features - kNN:')
for feature in feature_names:
    print(feature)

De 10 viktigste features - kNN:
bruises_t
odor_f
odor_n
gill-size_n
gill-color_b
stalk-surface-above-ring_k
stalk-surface-below-ring_k
ring-type_l
ring-type_p
spore-print-color_h


In [111]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression model (library defaults) on the training split.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

In [112]:
# Predict class labels for the held-out test set with the logistic regression
# model; this replaces the predictions previously stored in `y_pred`.
y_pred = lr.predict(X=X_test)

In [113]:
# Accuracy of the logistic regression model on the held-out test set.
accuracy = lr.score(X=X_test, y=y_test)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [114]:
# 10-fold cross-validation of the logistic regression model over the full dataset.
scores_lr = cross_val_score(estimator=lr, X=X, y=y, cv=10)

In [115]:
# Mean and standard deviation of the logistic regression cross-validation scores.
mean_score_lr = scores_lr.mean()
std_score_lr = scores_lr.std()
print('Mean accuracy:', mean_score_lr)
print('Standard deviation:', std_score_lr)

Mean accuracy: 0.9677588327607414
Standard deviation: 0.06633124514708309


In [124]:
# Rank features for logistic regression by the absolute size of their
# coefficients (first row of `coef_`) and list the 10 largest.
importances = np.abs(lr.coef_[0])
sorted_idx = np.argsort(importances)[::-1]  # indices in descending order of magnitude
viktigste_features = sorted_idx[:10]
feature_names = X.columns[viktigste_features]

print('De 10 viktigste features - LR:')
for navn in feature_names:
    print(navn)

De 10 viktigste features:
odor_n
spore-print-color_r
odor_l
odor_a
odor_c
odor_f
stalk-root_b
odor_p
gill-size_b
gill-size_n


In [116]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split.
# The tree is seeded with the same value as the train/test split: scikit-learn
# permutes candidate features at each split, so without a fixed random_state
# equally good splits can be chosen differently between runs, which makes the
# fitted tree, its feature importances and its cross-validation scores
# non-reproducible.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)

In [117]:
# Predict class labels for the held-out test set with the decision tree;
# this replaces the predictions previously stored in `y_pred`.
y_pred = dtc.predict(X=X_test)

In [118]:
# Accuracy of the decision tree model on the held-out test set.
accuracy = dtc.score(X=X_test, y=y_test)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [119]:
# 10-fold cross-validation of the decision tree model over the full dataset.
scores_dtc = cross_val_score(estimator=dtc, X=X, y=y, cv=10)

In [120]:
# Mean and standard deviation of the decision tree cross-validation scores.
mean_score_dtc = scores_dtc.mean()
std_score_dtc = scores_dtc.std()
print('Mean accuracy:', mean_score_dtc)
print('Standard deviation:', std_score_dtc)

Mean accuracy: 0.968265379698132
Standard deviation: 0.09438569955079258


In [123]:
# Rank features for the decision tree by the fitted model's
# `feature_importances_` and list the 10 largest.
importances = dtc.feature_importances_
sorted_idx = np.argsort(importances)[::-1]  # indices in descending order of importance
viktigste_features = sorted_idx[:10]
feature_names = X.columns[viktigste_features]

print('De 10 viktigste features - DTC:')
for navn in feature_names:
    print(navn)

De 10 viktigste features:
odor_n
stalk-root_c
stalk-root_r
spore-print-color_r
odor_l
odor_a
habitat_d
stalk-surface-below-ring_y
ring-number_o
cap-surface_g
