In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split

In [12]:
# Loading the mushroom dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
df = pd.read_csv(url, header=None)

In [13]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [14]:
# Replacing the class labels 'p' and 'e' with 1 and 0 respectively
df[0] = df[0].map({'p': 1, 'e': 0})



In [15]:
#label encoder
le = LabelEncoder()
for col in df.columns[1:]:
    df[col] = le.fit_transform(df[col])

In [16]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [17]:
# Standardization
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [21]:
# Creating SGDClassifier and DummyClassifier objects
sgd_clf = SGDClassifier(random_state=42)
dummy_clf = DummyClassifier(strategy="most_frequent", random_state=42)

In [24]:
sgd_clf.fit(X, y)

In [27]:
dummy_clf.fit(X, y)

In [25]:
# Evaluating the classifiers using cross-validation
scores_sgd = cross_val_score(sgd_clf, X, y, cv=10)
scores_dummy = cross_val_score(dummy_clf, X, y, cv=10)

In [28]:
# Print the accuracy, precision, recall, and f1 scores of the classifiers
print("SGDClassifier:")
print("Accuracy:", scores_sgd.mean())
print("Precision:", precision_score(y, sgd_clf.predict(X)))
print("Recall:", recall_score(y, sgd_clf.predict(X)))
print("F1 score:", f1_score(y, sgd_clf.predict(X)))

print("DummyClassifier:")
print("Accuracy:", scores_dummy.mean())
print("Precision:", precision_score(y, dummy_clf.predict(X)))
print("Recall:", recall_score(y, dummy_clf.predict(X)))
print("F1 score:", f1_score(y, dummy_clf.predict(X)))

SGDClassifier:
Accuracy: 0.8999350153600059
Precision: 0.9874575385419389
Recall: 0.9650153217568948
F1 score: 0.9761074518920314
DummyClassifier:
Accuracy: 0.5179715097643587
Precision: 0.0
Recall: 0.0
F1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


Conclusion:

Our results showed that the SGDClassifier had an accuracy of 0.8999, precision of 0.9875, recall of 0.9650, and F1 score of 0.9761. In comparison, the DummyClassifier had an accuracy of 0.5180, precision of 0.0, recall of 0.0, and F1 score of 0.0.

Based on these results, we can conclude that the SGDClassifier outperformed the DummyClassifier in terms of all performance metrics, indicating that the former is a more effective classifier for this dataset. Additionally, it is important to note that preprocessing the data using techniques like feature engineering and feature selection may further improve the performance of the classifier.