In [1]:
import numpy as np
import pandas as pd

In [8]:
import zipfile
import os

# Path to the ZIP file and extraction directory
zip_path = r"C:\Users\jessi\OneDrive\Desktop\ds\mushroom\test.zip"
extract_path = r"C:\Users\jessi\OneDrive\Desktop\ds\mushroom\path"
# Use environment variables or placeholders for paths
# zip_path = os.getenv('ZIP_PATH', 'path/to/your/default/test.zip')
# extract_path = os.getenv('EXTRACT_PATH', 'path/to/your/default/extract/folder')

# Create the extraction directory if it does not exist
if not os.path.exists(extract_path):
    os.makedirs(extract_path)

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [9]:
import pandas as pd

# Path to the extracted files
train_csv_path = os.path.join(extract_path, 'train.csv')
test_csv_path = os.path.join(extract_path, 'test.csv')

# Load the CSV files into pandas DataFrames
df = pd.read_csv(train_csv_path, index_col='id')
df_test = pd.read_csv(test_csv_path)


# EDA(Explanatory Data Analysis)

In [12]:
df.head()

Unnamed: 0_level_0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,e,8.8,f,s,u,f,a,c,w,4.51,...,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,...,,y,o,,,t,z,,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,...,,s,n,,,f,f,,l,w
3,e,3.88,f,y,g,f,s,,g,4.16,...,,,w,,,f,f,,d,u
4,e,5.85,x,l,w,f,d,,w,3.37,...,,,w,,,f,f,,g,a


**Finding any column which is present in the training data but not in the testing data/ vice-versa**

In [62]:
# Get the list of columns from both datasets
train_columns = set(df.columns)
test_columns = set(df_test.columns)

# Find columns in the train set but not in the test set
missing_in_test = train_columns - test_columns

# Find columns in the test set but not in the train set
extra_in_test = test_columns - train_columns

print("Columns in train but not in test:", missing_in_test)
print("Columns in test but not in train:", extra_in_test)

Columns in train but not in test: {'class'}
Columns in test but not in train: {'id'}


In [63]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cap-diameter,3116941.0,6.309848,4.657931,0.03,3.32,5.75,8.24,80.67
stem-height,3116945.0,6.348333,2.699755,0.0,4.67,5.88,7.41,88.72
stem-width,3116945.0,11.153785,8.095477,0.0,4.97,9.65,15.63,102.9


There are three numerical columns, that we have to deal with 

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116945 entries, 0 to 3116944
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 object 
 1   cap-diameter          float64
 2   cap-shape             object 
 3   cap-surface           object 
 4   cap-color             object 
 5   does-bruise-or-bleed  object 
 6   gill-attachment       object 
 7   gill-spacing          object 
 8   gill-color            object 
 9   stem-height           float64
 10  stem-width            float64
 11  stem-root             object 
 12  stem-surface          object 
 13  stem-color            object 
 14  veil-type             object 
 15  veil-color            object 
 16  has-ring              object 
 17  ring-type             object 
 18  spore-print-color     object 
 19  habitat               object 
 20  season                object 
dtypes: float64(3), object(18)
memory usage: 523.2+ MB


**Finding the null values**

In [65]:
null = df.isnull().sum()
null = null[null>0]
missing_percent = (null/len(df))*100
missing_info = pd.DataFrame({'Missing Values': null, 'Percentage': missing_percent})
missing_info = missing_info.sort_values(by='Percentage', ascending=False)
missing_info

Unnamed: 0,Missing Values,Percentage
veil-type,2957493,94.88435
spore-print-color,2849682,91.425482
stem-root,2757023,88.452732
veil-color,2740947,87.93697
stem-surface,1980861,63.551362
gill-spacing,1258435,40.373988
cap-surface,671023,21.528227
gill-attachment,523936,16.80928
ring-type,128880,4.134818
gill-color,57,0.001829


**Dropping the columns where more than 60% of the data is missing**

In [66]:
columns_to_drop = missing_info[missing_info['Percentage'] > 60].index
print(columns_to_drop)

Index(['veil-type', 'spore-print-color', 'stem-root', 'veil-color',
       'stem-surface'],
      dtype='object')


In [67]:
df = df.drop(columns=columns_to_drop)
# dropping the columns from the testing datset too.
df_test = df_test.drop(columns = columns_to_drop)

# Imputing the missing data Categorical data first

In [68]:
df['gill-spacing'].unique()

array(['c', nan, 'd', 'f', 'x', 'b', 'a', '3.61', '2.69', 'k', '4.8', 'e',
       'y', 'class', 's', '9.01', 'p', '3.92', '5.22', '6.67', '4.04',
       't', '0.73', 'i', '3.57', '24.38', 'w', 'h', 'cap-surface', 'l',
       '1', '12.27', '5.42', 'r', '1.6', 'n', 'g', '0', '3.81', '4.09',
       '1.36', '3.24', '5.55', '5.7', '3.62', 'does f', '6.4', '1.88',
       '55.13'], dtype=object)

In [69]:
df['gill-spacing'].value_counts()

gill-spacing
c              1331054
d               407932
f               119380
e                   24
a                   17
s                   16
b                   12
x                    8
t                    8
p                    7
g                    4
k                    4
h                    3
l                    3
y                    2
r                    2
6.67                 2
0                    2
9.01                 1
2.69                 1
3.61                 1
class                1
4.8                  1
4.04                 1
3.57                 1
i                    1
w                    1
24.38                1
cap-surface          1
0.73                 1
5.22                 1
3.92                 1
5.42                 1
12.27                1
1                    1
1.6                  1
n                    1
3.81                 1
4.09                 1
1.36                 1
3.24                 1
5.55                 1
5.7                  

**But the categorical only should have alphabetical values.
Thus, we will remove the other unwanted values and replace them with np.nan**

In [70]:
def keep_chars_only(value):
    if isinstance(value, str) and (len(value) == 1 and value.isalpha()):
        return value
    else:
        return np.nan 
cat_cols = df.select_dtypes(include = 'object')
for col in cat_cols.columns:
    if col != 'class':
        df[col] = df[col].apply(keep_chars_only)
        df_test[col] = df_test[col].apply(keep_chars_only)

In [71]:
df['gill-spacing'].value_counts()

gill-spacing
c    1331054
d     407932
f     119380
e         24
a         17
s         16
b         12
x          8
t          8
p          7
k          4
g          4
l          3
h          3
y          2
r          2
i          1
w          1
n          1
Name: count, dtype: int64

**Imputing the missing values, in the categorical columns with the mode(most frequent value)**

In [72]:
cat_cols = df.select_dtypes(include = 'object')
cat_cols = cat_cols.drop('class', axis = 1)
for col in cat_cols.columns:
    df[col] = df[col].fillna(df[col].mode()[0])
    df_test[col] = df_test[col].fillna(df_test[col].mode()[0])

In [73]:
print("Testing data")
print(df_test.isnull().sum())
print("___________________________")
print("Training data")
df.isnull().sum()

Testing data
id                      0
cap-diameter            7
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             1
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64
___________________________
Training data


class                   0
cap-diameter            4
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64

**Dealing with the missing numerical data**

In [74]:
df['cap-diameter'] = df['cap-diameter'].fillna(df['cap-diameter'].mode()[0])
df_test['cap-diameter'] = df_test['cap-diameter'].fillna(df_test['cap-diameter'].mode()[0])
df['stem-height'] = df['stem-height'].fillna(df['stem-height'].mode()[0])
df_test['stem-height'] = df_test['stem-height'].fillna(df_test['stem-height'].mode()[0])

In [75]:
df.isnull().sum()

class                   0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64

In [76]:
df_test.isnull().sum()

id                      0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64

**Now that we've deal with the missing data,we move forward to encoding the categorical data**

The CatBoostEncoder,  it requires the target variable (y in fit_transform) to be numeric.

In [77]:
df['class'] = df['class'].replace({"p": 1, "e": 0})
df['class']

  df['class'] = df['class'].replace({"p": 1, "e": 0})


id
0          0
1          1
2          0
3          0
4          0
          ..
3116940    0
3116941    0
3116942    1
3116943    0
3116944    1
Name: class, Length: 3116945, dtype: int64

In [79]:
colss = df.select_dtypes(include=['object']).columns
import category_encoders as ce
encoder = ce.CatBoostEncoder(cols=colss)
df[colss] = encoder.fit_transform(df[colss], df["class"]) 

In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116945 entries, 0 to 3116944
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 int64  
 1   cap-diameter          float64
 2   cap-shape             float64
 3   cap-surface           float64
 4   cap-color             float64
 5   does-bruise-or-bleed  float64
 6   gill-attachment       float64
 7   gill-spacing          float64
 8   gill-color            float64
 9   stem-height           float64
 10  stem-width            float64
 11  stem-color            float64
 12  has-ring              float64
 13  ring-type             float64
 14  habitat               float64
 15  season                float64
dtypes: float64(15), int64(1)
memory usage: 404.3 MB


**Now the data is cleaned**

In [81]:
X = df.drop("class", axis=1)
y = df["class"]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

scaler = MinMaxScaler()

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.55, random_state=0)

# pca = PCA(n_components=4)
# X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)


In [85]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

# Define the objective function
def objective(trial):
    # Define hyperparameters to be tuned
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    
    # Train the model
    model = xgb.XGBClassifier(**param, tree_method="gpu_hist")
    model.fit(X_train, y_train)
        # Predict and calculate accuracy
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    return accuracy

# Create a study object
study = optuna.create_study(direction='maximize')

# Optimize the objective function
study.optimize(objective, n_trials=30)

# Print the best trial
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")
# Train the final model with the best parameters
best_params = study.best_params
best_model = xgb.XGBClassifier(**best_params, tree_method="gpu_hist")
best_model.fit(X_train, y_train)

# Predict on the test set and calculate accuracy
final_preds = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)

print(f"Final Model Accuracy: {final_accuracy}")

[I 2024-08-29 06:57:44,604] A new study created in memory with name: no-name-6a2edeae-5998-47a8-b753-1b1493ae9398
[I 2024-08-29 06:57:58,785] Trial 0 finished with value: 0.9897014559708806 and parameters: {'booster': 'gbtree', 'lambda': 0.4783362168515034, 'alpha': 0.004021315108423867, 'subsample': 0.7786327874041084, 'colsample_bytree': 0.8665960161129658, 'max_depth': 9, 'n_estimators': 457, 'learning_rate': 0.051908356279005585, 'min_child_weight': 5}. Best is trial 0 with value: 0.9897014559708806.
[I 2024-08-29 06:58:18,480] Trial 1 finished with value: 0.7061546269074619 and parameters: {'booster': 'gblinear', 'lambda': 1.0277800804360114e-06, 'alpha': 1.2683216617975556e-08, 'subsample': 0.9361507806158429, 'colsample_bytree': 0.6113877188439919, 'max_depth': 5, 'n_estimators': 448, 'learning_rate': 0.0886089782920439, 'min_child_weight': 3}. Best is trial 0 with value: 0.9897014559708806.
[I 2024-08-29 06:58:24,654] Trial 2 finished with value: 0.6155472723878855 and paramete

Best trial:
  Value: 0.9900514489710206
  Params: 
    booster: gbtree
    lambda: 1.7095214210318354e-08
    alpha: 0.000546122100717449
    subsample: 0.8509960335129325
    colsample_bytree: 0.5184077863693032
    max_depth: 9
    n_estimators: 495
    learning_rate: 0.0797316425702096
    min_child_weight: 6



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Final Model Accuracy: 0.9900514489710206
