In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib

# Load data: both CSV files are read relative to the current working directory.
order_info = pd.read_csv("order_info.csv")
order_line = pd.read_csv("order_line.csv")

# Clean headers: trim leading/trailing whitespace from every column label.
order_info = order_info.rename(columns=str.strip)
order_line = order_line.rename(columns=str.strip)

# Preprocess: cast age to integer and normalise the gender strings
# (trim whitespace, then first letter upper-case / remainder lower-case).
order_info = order_info.assign(**{
    'Customer Age': order_info['Customer Age'].astype(int),
    'Customer Gender': order_info['Customer Gender'].str.strip().str.capitalize(),
})

# Merge: join order header rows with their order lines on the shared key.
df = order_info.merge(order_line, on="Order ID")

# Encode features and target: one encoder per column, kept as separate objects
# because both are persisted later so predictions can be decoded.
le_gender, le_category = LabelEncoder(), LabelEncoder()
for column, encoder in (('Customer Gender', le_gender), ('Category', le_category)):
    df[column] = encoder.fit_transform(df[column])

# Feature matrix (age + encoded gender) and encoded target (category).
feature_columns = ['Customer Age', 'Customer Gender']
X = df[feature_columns]
y = df['Category']

# Hold out 25% of the rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Model training
# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(probability=True, random_state=42)
}

# Track the winner. All three names are initialised up front so the summary
# print below can never hit an undefined (or stale, from an earlier run)
# `best_model_name`. The accuracy sentinel starts below the minimum possible
# score (0.0) so the first fitted model is always selected, even if every
# model scores exactly 0 -- otherwise `best_model` would stay None and None
# would be pickled.
best_model = None
best_model_name = None
best_accuracy = -1.0

# NOTE: the winner is chosen by accuracy on the same held-out split that is
# reported, so the printed "best" accuracy is an optimistic estimate.
for name, model in models.items():
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"{name} Accuracy: {accuracy:.4f}")
    # Strict ">" keeps the earliest model when accuracies tie.
    if accuracy > best_accuracy:
        best_model, best_model_name, best_accuracy = model, name, accuracy

print(f"\nBest Model: {best_model_name} (Accuracy: {best_accuracy:.4f})")

# Save best model and encoders (the encoders are needed to encode inputs and
# decode predicted category labels at inference time).
joblib.dump(best_model, "best_classification_model.pkl")
joblib.dump(le_gender, "le_gender.pkl")
joblib.dump(le_category, "le_category.pkl")

Logistic Regression Accuracy: 0.1389
Decision Tree Accuracy: 0.1265
Random Forest Accuracy: 0.1287
K-Nearest Neighbors Accuracy: 0.1144
Support Vector Machine Accuracy: 0.1401

Best Model: Support Vector Machine (Accuracy: 0.1401)


['le_category.pkl']