In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import pandas as pd

In [None]:
# Fetch the latest version of the dataset through kagglehub; `path` is the local
# folder the files were extracted to and is reused when loading the CSV.
dataset_handle = "uciml/mushroom-classification"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/mushroom-classification?dataset_version_number=1...


100%|██████████| 34.2k/34.2k [00:00<00:00, 10.6MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/uciml/mushroom-classification/versions/1





# Exploració de les dades


Aquest conjunt de dades inclou descripcions de mostres hipotètiques corresponents a 23 espècies de bolets amb làmines de les famílies Agaricus i Lepiota, extretes de The Audubon Society Field Guide to North American Mushrooms. Cada espècie s'identifica com a definitivament comestible, definitivament verinosa o de comestibilitat desconeguda i no recomanada. Aquesta darrera classe es va combinar amb la verinosa.

La Guia diu clarament que no hi ha una regla senzilla per determinar la comestibilitat d'un bolet, per tant, intentarem trobar el millor model que pugui classificar els bolets en comestibles o no comestibles a partir de les seves característiques.

In [None]:
# Read the raw CSV from the download folder. `df_orig` is kept untouched as the
# reference copy; `df` is the working frame that later cells modify.
ruta = f"{path}/mushrooms.csv"
df_orig = pd.read_csv(ruta)
df = df_orig.copy()

A primera vista, tenim un dataset amb 8124 exemplars i 22 característiques categòriques, el target "class" ens indica si el bolet és comestible o no.

De les 22 característiques categòriques 4 són del tipus ordinal, i la resta són totes nominals.

Característiques ordinals:

* gill-spacing: espaiament de les làmines  
* gill-size: mida de les làmines  
* ring-number: nombre d'anells  
* population: població

Característiques nominals:

* cap-shape: forma del barret  
* cap-surface: superfície del barret  
* cap-color: color del barret  
* bruises: presència de marques/morellia  
* odor: olor  
* gill-attachment: adhesió de les làmines  
* gill-color: color de les làmines  
* stalk-shape: forma del peu  
* stalk-root: base del peu  
* stalk-surface-above-ring: superfície del peu per sobre de l'anell  
* stalk-surface-below-ring: superfície del peu per sota de l'anell  
* stalk-color-above-ring: color del peu per sobre de l'anell  
* stalk-color-below-ring: color del peu per sota de l'anell  
* veil-type: tipus de vel  
* veil-color: color del vel  
* ring-type: tipus d'anell  
* spore-print-color: color de l'empremta de les espores  
* habitat: hàbitat


In [None]:
# Total number of missing cells in the frame: per-column counts, summed again.
missing_total = df.isna().sum().sum()
print("NaNs al dataset: ", missing_total)

NaNs al dataset:  0


Aquest és un conjunt de dades sense NaNs, cosa que ens simplifica la feina: no hem d'analitzar el percentatge de NaNs de cada columna, ni eliminar les files que en continguin, ni buscar una manera efectiva d'imputar aquestes dades. Cal tenir en compte, però, que l'absència de NaNs no garanteix que no hi hagi valors desconeguts codificats d'una altra manera: la columna stalk-root, per exemple, conté la categoria '?'.

In [None]:
# Remove veil-type from the working frame. The frame is rebuilt from `df_orig`
# instead of being mutated in place, so this cell can be re-run safely: an in-place
# drop raises KeyError the second time because the column is already gone.
df = df_orig.drop(columns='veil-type')

In [None]:
def encode_nominals(df_orig):
    """One-hot encode every column that is neither ordinal nor the target.

    Columns listed in ``ignored_columns`` are passed through unchanged; all the
    remaining columns are replaced by the indicator columns produced by
    ``OneHotEncoder``, named by ``get_feature_names_out()``.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the passed-through columns first, followed by the
        indicator columns. Row labels are the same as in ``df_orig``.
    """
    ignored_columns = ['gill-spacing', 'gill-size', 'ring-number', 'population', 'class']
    nominal_columns = [col for col in df_orig.columns if col not in ignored_columns]

    # Nothing to encode: the encoder cannot be fitted on zero features.
    if not nominal_columns:
        return df_orig.copy()

    encoder = OneHotEncoder(sparse_output=False)  # dense output is easier to wrap in a DataFrame
    encoded = pd.DataFrame(
        encoder.fit_transform(df_orig[nominal_columns]),
        columns=encoder.get_feature_names_out(),
        # Reuse the caller's row labels: concat(axis=1) aligns on the index, so a
        # fresh RangeIndex would misalign rows (and introduce NaNs) whenever the
        # input frame has been filtered or re-indexed.
        index=df_orig.index,
    )
    passthrough = df_orig.drop(columns=nominal_columns)
    return pd.concat([passthrough, encoded], axis=1)

def encode_ordinals(df):
    """Replace the ordinal columns (and 'class') with OrdinalEncoder codes.

    Each listed column that exists in ``df`` is encoded independently with its
    own ``OrdinalEncoder``. No explicit category order is passed, so the codes
    follow the encoder's default ordering of the observed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the encoded columns overwritten.
    """
    ordinal_columns = ['gill-spacing', 'gill-size', 'ring-number', 'population', 'class']
    result = df.copy()
    # Columns missing from the frame are silently skipped.
    for name in (c for c in ordinal_columns if c in df.columns):
        # fit_transform returns an (n_rows, 1) array; flatten it to assign as a column.
        result[name] = OrdinalEncoder().fit_transform(df[[name]]).ravel()
    return result

def encode_target(df):
    """Return a copy of ``df`` whose 'class' column holds LabelEncoder integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing a 'class' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'class' replaced by its encoded labels.
    """
    labels = LabelEncoder().fit_transform(df['class'])
    result = df.copy()
    result['class'] = labels
    return result

# Build the fully numeric frame by running the three encoders in sequence.
# NOTE(review): the pipeline starts again from `df_orig`, so changes made to the
# previous `df` (for example dropped columns) are not carried into this frame.
df = (
    df_orig
    .pipe(encode_nominals)
    .pipe(encode_ordinals)
    .pipe(encode_target)
)

In [None]:
# Separate the features from the target and hold out 25% of the rows for testing.
X = df.drop(columns='class')
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Baseline classifiers compared in the next cells.
lr_model = LogisticRegression()
svm = SVC(C=1.0, kernel='rbf')
# Seeded like the random forest so that repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=42)
rt = RandomForestClassifier(random_state=42)


In [None]:
# Train every baseline classifier on the same training split.
for estimator in (lr_model, svm, dt):
    estimator.fit(X_train, y_train)

# Fitted last and left as the cell's final expression, so its repr is what the cell displays.
rt.fit(X_train, y_train)

In [None]:
# Score (via each estimator's .score) on the held-out split, one line per model.
fitted_models = [
    ("Regressor logistic", lr_model),
    ("SVC", svm),
    ("Decision tree", dt),
    ("Random forest", rt),
]
for label, estimator in fitted_models:
    print(f"{label}: {estimator.score(X_test, y_test)}")

Regressor logistic: 1.0
SVC: 1.0
Decision tree: 1.0
Random forest: 1.0


In [None]:
from sklearn.model_selection import cross_val_score

# Fresh, unfitted estimators for the cross-validated comparison.
lr_model = LogisticRegression()
svm_model = SVC(C=1.0, kernel='rbf')
# Seeded like the random forest so the cross-validation and test scores are reproducible.
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Display name -> estimator; insertion order defines the order of the report.
models = {
    'Logistic Regression': lr_model,
    'Support Vector Machine': svm_model,
    'Decision Tree': dt_model,
    'Random Forest': rf_model
}

# 10-fold cross-validation on the training split only.
print("Cross-Validation Scores:")
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=10)
    print(f"{model_name}: {cv_scores.mean():.4f} accuracy with a standard deviation of {cv_scores.std():.4f}")

# Fit each estimator on the whole training split before scoring it on the test split.
for model in models.values():
    model.fit(X_train, y_train)

print("\nTest Set Scores:")
for model_name, model in models.items():
    test_score = model.score(X_test, y_test)
    print(f"{model_name}: {test_score:.4f}")

Cross-Validation Scores:
Logistic Regression: 1.0000 accuracy with a standard deviation of 0.0000
Support Vector Machine: 1.0000 accuracy with a standard deviation of 0.0000
Decision Tree: 1.0000 accuracy with a standard deviation of 0.0000
Random Forest: 1.0000 accuracy with a standard deviation of 0.0000

Test Set Scores:
Logistic Regression: 1.0000
Support Vector Machine: 1.0000
Decision Tree: 1.0000
Random Forest: 1.0000


In [None]:
# Rank the encoded columns by the absolute value of their correlation with the
# target (DataFrame.corr with its default method), weakest first.
correlation_matrix = df.corr()
absolute_correlation = correlation_matrix.loc[:, 'class'].abs()
closest_to_zero = absolute_correlation.sort_values(ascending=True)

print("Columns closest to 0 correlation with 'class':")
print(closest_to_zero.iloc[:60])


Columns closest to 0 correlation with 'class':
stalk-surface-above-ring_y    0.016198
stalk-root_b                  0.017712
cap-shape_f                   0.018526
cap-surface_g                 0.023007
cap-shape_c                   0.023007
cap-shape_x                   0.026886
cap-color_c                   0.030910
stalk-color-above-ring_y      0.032545
veil-color_y                  0.032545
cap-color_p                   0.034702
cap-color_r                   0.042854
cap-color_u                   0.042854
cap-color_n                   0.044360
cap-color_g                   0.046456
gill-color_y                  0.046828
gill-color_p                  0.050380
stalk-color-below-ring_y      0.056426
gill-color_r                  0.056426
cap-shape_s                   0.060664
cap-color_b                   0.067544
odor_m                        0.069159
stalk-color-above-ring_c      0.069159
stalk-color-below-ring_c      0.069159
ring-type_n                   0.069159
ring-type_f      

In [None]:
# Keep only the columns whose absolute correlation with the target reaches the threshold.
threshold = 0.3
correlation_matrix = df.corr()

absolute_correlation = correlation_matrix['class'].abs()

# A column with zero variance (e.g. veil-type_p, which only contains 1.0) has an
# undefined correlation. `NaN < threshold` evaluates to False, so such columns
# would slip through a plain comparison and must be selected explicitly.
undefined_correlation = absolute_correlation.isna()
columns_to_drop = absolute_correlation[
    undefined_correlation | (absolute_correlation < threshold)
].index
# The target itself must never be removed, whatever its correlation entry is.
columns_to_drop = columns_to_drop.drop('class', errors='ignore')

df_reduced = df.drop(columns=columns_to_drop)

# Report which columns were removed because their correlation is undefined.
print(
    "Columns with undefined correlation removed:",
    absolute_correlation[undefined_correlation].index.tolist(),
)

[1.]


In [None]:
# Split the reduced feature set, holding out 30% of the rows for testing.
y = df_reduced['class']
X = df_reduced.drop('class', axis=1)
print(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Learn the scaling statistics from the training split only and apply those same
# statistics to the test split. Calling fit_transform on the test split would
# re-fit the scaler on test data, so the two splits would be scaled differently
# and information from the held-out rows would leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Index(['gill-spacing', 'gill-size', 'bruises_f', 'bruises_t', 'odor_f',
       'odor_n', 'gill-color_b', 'stalk-root_?', 'stalk-surface-above-ring_k',
       'stalk-surface-above-ring_s', 'stalk-surface-below-ring_k',
       'stalk-surface-below-ring_s', 'veil-type_p', 'ring-type_l',
       'ring-type_p', 'spore-print-color_h', 'spore-print-color_k',
       'spore-print-color_n', 'spore-print-color_w', 'habitat_p'],
      dtype='object')


In [None]:
from sklearn.model_selection import cross_val_score
import time


# Fresh, unfitted estimators for the comparison on the reduced, scaled features.
lr_model = LogisticRegression()
svm_model = SVC(C=1.0, kernel='rbf')
# Seeded like the random forest so the cross-validation and test scores are reproducible.
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Display name -> estimator; insertion order defines the order of the report.
models = {
    'Logistic Regression': lr_model,
    'Support Vector Machine': svm_model,
    'Decision Tree': dt_model,
    'Random Forest': rf_model
}

# 10-fold cross-validation on the training split, timing each model.
print("Cross-Validation Scores:")
for model_name, model in models.items():
    # perf_counter is the clock intended for measuring intervals; time.time() is
    # wall-clock time and can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    cv_scores = cross_val_score(model, X_train, y_train, cv=10)
    elapsed = time.perf_counter() - start_time
    print(f"{model_name}: {cv_scores.mean():.4f} accuracy with a standard deviation of {cv_scores.std():.4f}, time {elapsed}")

# Fit each estimator on the whole training split before scoring it on the test split.
for model in models.values():
    model.fit(X_train, y_train)

print("\nTest Set Scores:")
for model_name, model in models.items():
    test_score = model.score(X_test, y_test)
    print(f"{model_name}: {test_score:.4f}")

Cross-Validation Scores:
Logistic Regression: 0.9787 accuracy with a standard deviation of 0.0064, time 0.2574739456176758
Support Vector Machine: 0.9981 accuracy with a standard deviation of 0.0023, time 0.5380187034606934
Decision Tree: 0.9984 accuracy with a standard deviation of 0.0020, time 0.07332062721252441
Random Forest: 0.9982 accuracy with a standard deviation of 0.0019, time 3.0759692192077637

Test Set Scores:
Logistic Regression: 0.9750
Support Vector Machine: 0.9996
Decision Tree: 0.9996
Random Forest: 0.9996


In [None]:
# Rows of the correlation matrix ordered by their correlation with 'class':
# the 20 most negative and the 20 most positive.
correlation_matrix = df.corr()

top_n = 20
# NOTE(review): if 'class' is encoded as edible=0 / poisonous=1 (LabelEncoder sorts
# the labels), the most positive correlations point to the poisonous class, which
# would make these two names swapped. Confirm before relying on them.
high_corr_poison = correlation_matrix.sort_values('class').iloc[:top_n]
high_corr_edible = correlation_matrix.sort_values('class', ascending=False).iloc[:top_n]
high_corr_edible

Unnamed: 0,class,gill-spacing,gill-size,ring-number,population,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,...,spore-print-color_u,spore-print-color_w,spore-print-color_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
class,1.0,-0.348387,0.540024,-0.214366,0.298686,-0.182567,0.023007,0.018526,0.163565,-0.060664,...,-0.074371,0.357384,-0.074371,-0.126123,-0.165004,0.15515,-0.138627,0.323346,0.112078,-0.150087
odor_f,0.623842,-0.264112,-0.055394,-0.154142,0.28327,-0.146074,-0.013357,0.083449,-0.025921,-0.037845,...,-0.046396,-0.036033,-0.046396,-0.121806,0.003091,-0.026843,-0.116202,0.256188,0.061839,-0.09363
stalk-surface-above-ring_k,0.587658,-0.175883,0.095225,-0.056607,0.252648,-0.08973,-0.014253,0.019832,0.102231,-0.040383,...,-0.049507,0.225088,-0.049507,-0.072873,-0.031407,0.040251,-0.123995,0.30042,-0.139879,-0.09991
stalk-surface-below-ring_k,0.573524,-0.169265,0.089569,-0.016073,0.273124,-0.095534,-0.013965,0.023582,0.091324,-0.039566,...,-0.048507,0.198267,-0.048507,-0.096854,-0.020547,0.046874,-0.121488,0.310553,-0.137052,-0.09789
gill-size,0.540024,-0.108333,1.0,-0.171362,0.147682,-0.148449,0.033174,-0.075766,0.306333,0.093993,...,0.115232,0.63468,-0.051579,-0.016064,-0.323808,0.336217,-0.129183,0.17019,0.141152,-0.10409
gill-color_b,0.538808,-0.228112,0.776903,-0.133132,0.147788,-0.126163,-0.011536,-0.058296,0.397574,-0.032686,...,-0.040072,0.805573,-0.040072,-0.057785,-0.311623,0.395882,-0.100363,0.287683,-0.11322,-0.080868
bruises_f,0.50153,0.299473,0.369596,-0.056788,-0.088137,-0.115686,-0.003803,-0.045188,0.229578,0.053026,...,-0.091427,0.416877,0.065008,-0.264457,0.131746,0.278238,-0.228986,0.24946,-0.143017,-0.184507
spore-print-color_h,0.490229,-0.220039,-0.303538,-0.12842,0.272121,-0.121699,-0.011128,0.115253,-0.168905,-0.03153,...,-0.038654,-0.323507,-0.038654,-0.096099,0.10066,-0.169359,-0.096811,0.178579,0.103522,-0.078006
ring-type_l,0.451619,-0.191199,-0.291479,-0.111588,0.297862,-0.105748,-0.00967,0.100147,-0.146767,-0.027397,...,-0.033588,-0.281105,-0.033588,-0.048434,0.068096,-0.147161,-0.084122,0.24113,-0.094899,-0.067782
spore-print-color_w,0.357384,-0.024711,0.63468,0.32519,-0.06084,-0.024592,0.034398,-0.125597,0.47924,-0.040575,...,-0.049743,1.0,-0.049743,-0.151596,-0.210385,0.352405,-0.124586,0.217292,-0.140546,0.241127


