In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [20]:
# Load dataset
data = pd.read_csv("train.csv")

# Check shape and first few rows
print("Dataset shape:", data.shape)
data.head()

Dataset shape: (2000, 21)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [21]:
# Features (X) and target (y)
X = data.drop("price_range", axis=1)
y = data["price_range"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (1600, 20)
Test set size: (400, 20)


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression with different solver
log_reg = LogisticRegression(max_iter=5000, solver="saga", random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Predictions
y_pred = log_reg.predict(X_test_scaled)
y_proba = log_reg.predict_proba(X_test_scaled)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="macro"))
print("Recall:", recall_score(y_test, y_pred, average="macro"))
print("F1 Score:", f1_score(y_test, y_pred, average="macro"))
print("MCC:", matthews_corrcoef(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba, multi_class="ovr"))

Accuracy: 0.975
Precision: 0.9751162260148967
Recall: 0.9746570910973085
F1 Score: 0.9744087955693106
MCC: 0.9668750318289149
AUC: 0.999592616116934


In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Initialize and train Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predictions
y_pred_dt = dt_model.predict(X_test)
y_proba_dt = dt_model.predict_proba(X_test)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Precision:", precision_score(y_test, y_pred_dt, average="macro"))
print("Recall:", recall_score(y_test, y_pred_dt, average="macro"))
print("F1 Score:", f1_score(y_test, y_pred_dt, average="macro"))
print("MCC:", matthews_corrcoef(y_test, y_pred_dt))
print("AUC:", roc_auc_score(y_test, y_proba_dt, multi_class="ovr"))

Accuracy: 0.8325
Precision: 0.8294209737715853
Recall: 0.8271982998885173
F1 Score: 0.8267094179840447
MCC: 0.7768889113441807
AUC: 0.8858106422623003


In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Initialize and train kNN (start with k=5)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Predictions
y_pred_knn = knn_model.predict(X_test)
y_proba_knn = knn_model.predict_proba(X_test)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Precision:", precision_score(y_test, y_pred_knn, average="macro"))
print("Recall:", recall_score(y_test, y_pred_knn, average="macro"))
print("F1 Score:", f1_score(y_test, y_pred_knn, average="macro"))
print("MCC:", matthews_corrcoef(y_test, y_pred_knn))
print("AUC:", roc_auc_score(y_test, y_proba_knn, multi_class="ovr"))

Accuracy: 0.9425
Precision: 0.9403626000461565
Recall: 0.9413904483197961
F1 Score: 0.940759081791623
MCC: 0.9232607544947808
AUC: 0.9902284352635169


In [25]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Initialize and train Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predictions
y_pred_nb = nb_model.predict(X_test)
y_proba_nb = nb_model.predict_proba(X_test)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Precision:", precision_score(y_test, y_pred_nb, average="macro"))
print("Recall:", recall_score(y_test, y_pred_nb, average="macro"))
print("F1 Score:", f1_score(y_test, y_pred_nb, average="macro"))
print("MCC:", matthews_corrcoef(y_test, y_pred_nb))
print("AUC:", roc_auc_score(y_test, y_proba_nb, multi_class="ovr"))

Accuracy: 0.7975
Precision: 0.7983321971995111
Recall: 0.7925799291288422
F1 Score: 0.7929277885973955
MCC: 0.7313294409200803
AUC: 0.9559776739015717


In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Initialize and train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf, average="macro"))
print("Recall:", recall_score(y_test, y_pred_rf, average="macro"))
print("F1 Score:", f1_score(y_test, y_pred_rf, average="macro"))
print("MCC:", matthews_corrcoef(y_test, y_pred_rf))
print("AUC:", roc_auc_score(y_test, y_proba_rf, multi_class="ovr"))

Accuracy: 0.8925
Precision: 0.8916329841057944
Recall: 0.8914183189998408
F1 Score: 0.8905478996945861
MCC: 0.8571537479912464
AUC: 0.9826485414411865


In [27]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Initialize and train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train, y_train)

# Predictions
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Precision:", precision_score(y_test, y_pred_xgb, average="macro"))
print("Recall:", recall_score(y_test, y_pred_xgb, average="macro"))
print("F1 Score:", f1_score(y_test, y_pred_xgb, average="macro"))
print("MCC:", matthews_corrcoef(y_test, y_pred_xgb))
print("AUC:", roc_auc_score(y_test, y_proba_xgb, multi_class="ovr"))

Accuracy: 0.905
Precision: 0.90258536013253
Recall: 0.9045802476508997
F1 Score: 0.9029969730791383
MCC: 0.873522488115507
AUC: 0.9913216801002736


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [28]:
import pandas as pd

# Collect results into a dictionary
results = {
    "Model": ["Logistic Regression", "Decision Tree", "kNN", "Naive Bayes", "Random Forest", "XGBoost"],
    "Accuracy": [0.7575, 0.8325, 0.9425, 0.7975, 0.8925, 0.905],
    "Precision": [0.7561, 0.8294, 0.9404, 0.7983, 0.8916, 0.9026],
    "Recall": [0.7513, 0.8272, 0.9414, 0.7926, 0.8914, 0.9046],
    "F1 Score": [0.7531, 0.8267, 0.9408, 0.7929, 0.8905, 0.9030],
    "MCC": [0.6765, 0.7769, 0.9233, 0.7313, 0.8572, 0.8735],
    "AUC": [0.9232, 0.8858, 0.9902, 0.9560, 0.9826, 0.9913]
}

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Display table
print(results_df)

                 Model  Accuracy  Precision  Recall  F1 Score     MCC     AUC
0  Logistic Regression    0.7575     0.7561  0.7513    0.7531  0.6765  0.9232
1        Decision Tree    0.8325     0.8294  0.8272    0.8267  0.7769  0.8858
2                  kNN    0.9425     0.9404  0.9414    0.9408  0.9233  0.9902
3          Naive Bayes    0.7975     0.7983  0.7926    0.7929  0.7313  0.9560
4        Random Forest    0.8925     0.8916  0.8914    0.8905  0.8572  0.9826
5              XGBoost    0.9050     0.9026  0.9046    0.9030  0.8735  0.9913


In [29]:
### Analysis of Model Performance

Across six models tested on the Mobile Price Classification dataset, **k‑Nearest Neighbors (kNN)** achieved the highest overall performance with an accuracy of 94.25%, MCC of 0.92, and AUC of 0.99. This indicates that kNN captured the class boundaries very effectively, likely due to the numeric nature of the features. **XGBoost** also performed exceptionally well, with accuracy of 90.5% and the highest AUC (0.991), showing strong generalization and separation ability. **Random Forest** followed closely with accuracy of 89.25% and MCC of 0.86, confirming the strength of ensemble methods. 

Traditional models like **Logistic Regression** and **Naive Bayes** performed moderately, with accuracies of 75.75% and 79.75% respectively, while **Decision Tree** achieved 83.25% but risked overfitting. Overall, ensemble methods (Random Forest, XGBoost) and instance‑based learning (kNN) clearly outperformed linear and probabilistic approaches, making them the most suitable choices for this dataset.

SyntaxError: invalid character '‑' (U+2011) (698335763.py, line 3)