Name : KOMMINENI GREESHMA

Number : 2025AA05823

Dataset : Letter Recognition

Link  : https://archive.ics.uci.edu/dataset/59/letter+recognition

In [40]:
#Upload dataset
from google.colab import files
uploaded = files.upload()

Saving letter-recognition.data to letter-recognition (4).data


In [41]:
!pip install pandas scikit-learn xgboost joblib



In [42]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score
from sklearn.preprocessing import label_binarize
import joblib

In [43]:
#Load dataset
columns = ["letter","x-box","y-box","width","height","onpix","x-bar","y-bar",
           "x2bar","y2bar","xybar","x2ybr","xy2br","x-ege","xegvy","y-ege","yegvx"]

df = pd.read_csv("letter-recognition.data", names=columns)
X = df.drop("letter", axis=1)
y = df["letter"]
df.head()

Unnamed: 0,letter,x-box,y-box,width,height,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [44]:
#Preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

le = LabelEncoder()
y_enc = le.fit_transform(y)

X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

In [45]:
# Save the 20% test set for Streamlit app
test_df = pd.DataFrame(X_val, columns=X.columns)
test_df["letter"] = le.inverse_transform(y_val)
test_df.to_csv("test.csv", index=False)

In [46]:
#download test file
from google.colab import files
files.download("test.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [47]:
#Evaluation function
def evaluate(model, Xv, yv, label_encoder):
    y_pred = model.predict(Xv)
    acc = accuracy_score(yv, y_pred)
    prec = precision_score(yv, y_pred, average="macro")
    rec = recall_score(yv, y_pred, average="macro")
    f1 = f1_score(yv, y_pred, average="macro")
    mcc = matthews_corrcoef(yv, y_pred)
    try:
        y_proba = model.predict_proba(Xv)
        classes = list(range(len(label_encoder.classes_)))
        yv_bin = label_binarize(yv, classes=classes)
        auc = roc_auc_score(yv_bin, y_proba, average="macro", multi_class="ovr")
    except Exception:
        auc = None
    return {"Accuracy": acc, "AUC": auc, "Precision": prec, "Recall": rec, "F1": f1, "MCC": mcc}

In [48]:
#Train models
results = {}
models = {}

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
results["Logistic Regression"] = evaluate(log_reg, X_val, y_val, le)
models["log_reg"] = log_reg

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
grid_dt = GridSearchCV(dt, {"max_depth":[None,10,20], "min_samples_split":[2,5]}, cv=3, n_jobs=-1)
grid_dt.fit(X_train, y_train)
best_dt = grid_dt.best_estimator_
results["Decision Tree"] = evaluate(best_dt, X_val, y_val, le)
models["decision_tree"] = best_dt

# KNN
knn = KNeighborsClassifier()
grid_knn = GridSearchCV(knn, {"n_neighbors":[3,5,7], "weights":["uniform","distance"]}, cv=3, n_jobs=-1)
grid_knn.fit(X_train, y_train)
best_knn = grid_knn.best_estimator_
results["KNN"] = evaluate(best_knn, X_val, y_val, le)
models["knn"] = best_knn

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
results["Naive Bayes"] = evaluate(nb, X_val, y_val, le)
models["naive_bayes"] = nb

# Random Forest
rf = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(
    rf,
    {"n_estimators":[50,100], "max_depth":[10,20], "min_samples_leaf":[2,4]},  # âœ… updated params
    cv=3,
    n_jobs=-1
)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
results["Random Forest"] = evaluate(best_rf, X_val, y_val, le)
models["random_forest"] = best_rf

# XGBoost
xgb = XGBClassifier(eval_metric="mlogloss", random_state=42)
xgb.fit(X_train, y_train)
results["XGBoost"] = evaluate(xgb, X_val, y_val, le)
models["xgboost"] = xgb

In [49]:
#save model and metrics
os.makedirs("models", exist_ok=True)
joblib.dump(scaler, "models/scaler.pkl")
joblib.dump(le, "models/label_encoder.pkl")
for name, model in models.items():
    joblib.dump(model, f"models/{name}.pkl")

metrics_df = pd.DataFrame(results).T.round(6)
metrics_df.to_csv("models/metrics_summary.csv", index=True)

print("Training complete. Models and preprocessors saved in /models")
metrics_df

Training complete. Models and preprocessors saved in /models


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1,MCC
Logistic Regression,0.77425,0.980521,0.77501,0.773009,0.772848,0.765306
Decision Tree,0.88325,0.939207,0.883348,0.883084,0.882945,0.878598
KNN,0.9495,0.992721,0.949917,0.949341,0.949382,0.947499
Naive Bayes,0.65225,0.957283,0.664067,0.651197,0.647879,0.639111
Random Forest,0.96175,0.999444,0.962229,0.961411,0.961575,0.960238
XGBoost,0.96425,0.99969,0.964183,0.963996,0.963992,0.962827


In [50]:
import shutil
from google.colab import files

# Zip the folder
shutil.make_archive("models", 'zip', "models")

# Download the zip
files.download("models.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>