In [1]:
import pandas as pd

# The raw UCI file ships without a header row, so the attribute names are
# declared here (taken from the UCI data-set description) and attached on load.
columns = [
    # target
    "class",
    # cap
    "cap-shape", "cap-surface", "cap-color",
    # bruising and smell
    "bruises", "odor",
    # gills
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    # stalk
    "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    # veil and ring
    "veil-type", "veil-color", "ring-number", "ring-type",
    # spores and environment
    "spore-print-color", "population", "habitat",
]

# Source location of the data (point this at a local copy if preferred).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

# Read the header-less CSV, then label the columns; set_axis raises if the
# number of names does not match the number of columns in the file.
df = pd.read_csv(url, header=None).set_axis(columns, axis="columns")

# Preview the first 5 rows
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Encode target (class: e=0, p=1).
# The dtype guard keeps this cell idempotent: mapping an already-encoded
# column a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df["class"]):
    df["class"] = df["class"].map({"e": 0, "p": 1})

# Any label other than "e"/"p" (or a previously corrupted column) shows up as NaN.
assert df["class"].notna().all(), "unexpected or missing label in 'class' (expected only 'e'/'p')"

# Select predictors: odor + one other (e.g., gill-color)
predictors = ["odor", "gill-color"]  # Change if needed
X = pd.get_dummies(df[predictors])  # One-hot encoding of the categorical predictors
y = df["class"]  # Target (0 = edible, 1 = poisonous)

# Check encoded features
X.head()

Unnamed: 0,odor_a,odor_c,odor_f,odor_l,odor_m,odor_n,odor_p,odor_s,odor_y,gill-color_b,...,gill-color_g,gill-color_h,gill-color_k,gill-color_n,gill-color_o,gill-color_p,gill-color_r,gill-color_u,gill-color_w,gill-color_y
0,False,False,False,False,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
3,False,False,False,False,False,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression with default settings
# (fit() returns the estimator, so construction and training are chained).
model = LogisticRegression().fit(X_train, y_train)

# Score the fitted model on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.99


In [7]:
def holdout_accuracy(features, target, test_size=0.2, random_state=42):
    """Return the hold-out accuracy of a freshly fitted logistic regression.

    Parameters
    ----------
    features : DataFrame
        Encoded predictor matrix.
    target : Series
        Class labels aligned with ``features``.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    float
        Accuracy on the held-out rows.
    """
    # Local names only: the notebook-level `model` and X_train/X_test/y_train/y_test
    # are left untouched, so they keep describing the model fitted earlier.
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    clf = LogisticRegression().fit(X_tr, y_tr)
    return accuracy_score(y_te, clf.predict(X_te))


# Test odor only
X_odor = pd.get_dummies(df[["odor"]])
accuracy_odor = holdout_accuracy(X_odor, y)

# Test gill-color only (or your chosen column)
X_gill = pd.get_dummies(df[["gill-color"]])
accuracy_gill = holdout_accuracy(X_gill, y)

print(f"Odor accuracy: {accuracy_odor:.2f}")
print(f"Gill-color accuracy: {accuracy_gill:.2f}")

Odor accuracy: 0.98
Gill-color accuracy: 0.81


## Conclusions  
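- **Odor** is a much stronger single predictor of whether a mushroom is poisonous (test accuracy `0.98`) than `gill-color` (test accuracy `0.81`); using both features together reaches `0.99`, only marginally better than odor alone.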
- Recommendations:  
  - Explore other classifiers (e.g., Random Forest).  
  - Include more features to improve robustness.  