Jafar Yahia
Hands-on-Assignment II

Classification Model



Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier


dataset

In [None]:
# Location of the raw CSV (Colab-style path); change this one constant to point
# the notebook at a different copy of the dataset.
DATA_PATH = "/content/NY-House-Dataset.csv"

# Raw listings table; later cells derive the features and the target from it.
data = pd.read_csv(DATA_PATH)

Targets for classification

In [None]:
# Binary target: 1 when a listing's PRICE is at or above the median price,
# 0 otherwise.
price_median = data['PRICE'].median()
is_high_price = data['PRICE'] >= price_median
data['PRICE_Category'] = is_high_price.astype(int)

# Features are every column except the raw price and the label derived from it,
# so the target cannot be read straight back out of the inputs.
target_related_columns = ['PRICE', 'PRICE_Category']
y_classification = data['PRICE_Category']
X_classification = data.drop(columns=target_related_columns)


- Use the median house price to split the data into two categories: high (1, price at or above the median) and low (0, price below the median).

Split and scale data for the classification

In [None]:
# Column groups are chosen by dtype: numeric columns are standardized and
# string (object) columns are one-hot encoded.
numeric_features = X_classification.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_classification.select_dtypes(include=['object']).columns

# Split BEFORE fitting any preprocessing. Fitting the scaler/encoder on the full
# table would let test-row statistics and categories leak into training and
# make the reported test scores optimistic. test_size and random_state are the
# same values used previously.
X_train_raw, X_test_raw, y_train_class, y_test_class = train_test_split(
    X_classification, y_classification, test_size=0.3, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # The encoder now only sees training rows, so a category that appears
        # only in the test rows must be tolerated: it is encoded as all zeros
        # instead of raising an error.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# Learn scaling parameters and category vocabularies from the training rows only,
# then apply the already-fitted transformer to the held-out rows.
X_train_class = preprocessor.fit_transform(X_train_raw)
X_test_class = preprocessor.transform(X_test_raw)

# Kept under its original name in case another cell uses the full transformed
# matrix; it is produced by the train-fitted preprocessor (transform only).
X_classification_transformed = preprocessor.transform(X_classification)



1. Logistic Regression


In [None]:
# Logistic regression on the preprocessed training split (seed fixed at 42),
# then scored on the held-out test split.
logistic_reg = LogisticRegression(random_state=42).fit(X_train_class, y_train_class)
y_pred_class_log = logistic_reg.predict(X_test_class)

log_accuracy = accuracy_score(y_test_class, y_pred_class_log)
log_report = classification_report(y_test_class, y_pred_class_log)
print("Logistic Regression Accuracy:", log_accuracy)
print("Classification Report:\n", log_report)


Logistic Regression Accuracy: 0.8750867453157529
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.88       717
           1       0.88      0.87      0.88       724

    accuracy                           0.88      1441
   macro avg       0.88      0.88      0.88      1441
weighted avg       0.88      0.88      0.88      1441



2. Decision tree classifier

In [None]:
# Single decision tree with default settings (seed fixed at 42), trained on the
# training split and scored on the held-out test split.
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train_class, y_train_class)
y_pred_class_tree = decision_tree.predict(X_test_class)

tree_accuracy = accuracy_score(y_test_class, y_pred_class_tree)
tree_report = classification_report(y_test_class, y_pred_class_tree)
print("Decision Tree Accuracy:", tree_accuracy)
print("Classification Report:\n", tree_report)


Decision Tree Accuracy: 0.8653712699514227
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.87      0.86       717
           1       0.87      0.86      0.87       724

    accuracy                           0.87      1441
   macro avg       0.87      0.87      0.87      1441
weighted avg       0.87      0.87      0.87      1441



3. Random Forest classifier

In [None]:
# Random forest with default settings (seed fixed at 42), trained on the
# training split and scored on the held-out test split.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train_class, y_train_class)
y_pred_class_rf = rf_classifier.predict(X_test_class)

rf_accuracy = accuracy_score(y_test_class, y_pred_class_rf)
rf_report = classification_report(y_test_class, y_pred_class_rf)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report:\n", rf_report)


Random Forest Accuracy: 0.8778625954198473
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88       717
           1       0.87      0.89      0.88       724

    accuracy                           0.88      1441
   macro avg       0.88      0.88      0.88      1441
weighted avg       0.88      0.88      0.88      1441



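- Random Forest combines many decision trees into one ensemble; here it gives slightly better classification accuracy than the single decision tree above.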

4. Gradient Boosting Classifier

In [None]:
# Gradient boosting with default settings (seed fixed at 42), trained on the
# training split and scored on the held-out test split.
gboost_classifier = GradientBoostingClassifier(random_state=42).fit(X_train_class, y_train_class)
y_pred_class_gboost = gboost_classifier.predict(X_test_class)

gboost_accuracy = accuracy_score(y_test_class, y_pred_class_gboost)
gboost_report = classification_report(y_test_class, y_pred_class_gboost)
print("Gradient Boosting Accuracy:", gboost_accuracy)
print("Classification Report:\n", gboost_report)


Gradient Boosting Accuracy: 0.8750867453157529
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.88       717
           1       0.88      0.87      0.87       724

    accuracy                           0.88      1441
   macro avg       0.88      0.88      0.88      1441
weighted avg       0.88      0.88      0.88      1441



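- Gradient boosting builds decision trees sequentially, with each new tree trained to correct the errors of the trees before it.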

5. K-nearest neighbor classifier

In [None]:
# k-nearest neighbours voting over the 5 closest training points, scored on the
# held-out test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_class, y_train_class)
y_pred_class_knn = knn.predict(X_test_class)

knn_accuracy = accuracy_score(y_test_class, y_pred_class_knn)
knn_report = classification_report(y_test_class, y_pred_class_knn)
print("K-Nearest Neighbors Accuracy:", knn_accuracy)
print("Classification Report:\n", knn_report)


K-Nearest Neighbors Accuracy: 0.8390006939625261
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.84      0.84       717
           1       0.84      0.83      0.84       724

    accuracy                           0.84      1441
   macro avg       0.84      0.84      0.84      1441
weighted avg       0.84      0.84      0.84      1441



- The number of neighbors is set to 5 initially.
- A KNN model is used for the classification.

Classification Model Conclusion

In [None]:
# Test-set predictions of every model trained above, keyed by the label used in
# the printed summary (dict order is the order the models were trained in).
predictions_by_model = {
    "Logistic Regression": y_pred_class_log,
    "Decision Tree Classifier": y_pred_class_tree,
    "Random Forest Classifier": y_pred_class_rf,
    "Gradient Boosting Classifier": y_pred_class_gboost,
    "K-Nearest Neighbors": y_pred_class_knn,
}

# Compute each accuracy once so the printed table and the winner below are
# guaranteed to come from the same numbers.
accuracy_by_model = {
    model_name: accuracy_score(y_test_class, predictions)
    for model_name, predictions in predictions_by_model.items()
}

print("CONCLUSION:")
for model_name, model_accuracy in accuracy_by_model.items():
    print(f"{model_name} Accuracy:", model_accuracy)

# Name the actual winner instead of printing a fixed placeholder sentence.
# On a tie, max() keeps the first model in the order listed above.
best_model_name = max(accuracy_by_model, key=accuracy_by_model.get)
print(
    f"Best performing model: {best_model_name} "
    f"(test accuracy {accuracy_by_model[best_model_name]:.4f})"
)


CONCLUSION:
Logistic Regression Accuracy: 0.8750867453157529
Decision Tree Classifier Accuracy: 0.8653712699514227
Random Forest Classifier Accuracy: 0.8778625954198473
Gradient Boosting Classifier Accuracy: 0.8750867453157529
K-Nearest Neighbors Accuracy: 0.8390006939625261
Best performing model: Based on accuracy and classification report.
