# Preliminary model for classification

This notebook aims to develop a preliminary version of a classifier for the challenge.

In [None]:
import os
from pathlib import Path

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import pickle

## Access processed data

In [None]:
# Project layout, resolved relative to the current working directory: the
# repository root is taken to be two levels above it.
DIR_REPO = Path.cwd().parent.parent
DIR_DATA_PROCESSED = DIR_REPO.joinpath("data", "processed")
DIR_MODELS = DIR_REPO.joinpath("models")

# List the contents of the processed-data directory.
os.listdir(DIR_DATA_PROCESSED)

In [None]:
# Location of the preprocessed listings table inside the processed-data directory.
FILEPATH_PROCESSED = DIR_DATA_PROCESSED.joinpath("preprocessed_listings.csv")

In [None]:
# Load the preprocessed listings; the first CSV column becomes the index.
df = pd.read_csv(filepath_or_buffer=FILEPATH_PROCESSED, index_col=0)

# Preview the first five rows.
df.head(n=5)

In [None]:
# Discard every row that has a missing value in at least one column.
# NOTE(review): this also drops rows whose only gaps are in columns that are not
# used as model inputs further down - confirm that this is intended.
df = df.dropna(axis="index", how="any")

In [None]:
# Display the column labels of the cleaned table.
df.columns

## Encode categorical string variables as integers

In [None]:
# Lookup tables that assign an integer code to each categorical level.
# Codes start at 1 and follow the order in which the levels are listed.
MAP_ROOM_TYPE = {
    level: code
    for code, level in enumerate(
        ("Shared room", "Private room", "Entire home/apt", "Hotel room"), start=1
    )
}
MAP_NEIGHB = {
    level: code
    for code, level in enumerate(
        ("Bronx", "Queens", "Staten Island", "Brooklyn", "Manhattan"), start=1
    )
}

In [None]:
# Map categorical features to their integer codes.
def _encode_categories(series, mapping):
    """Return *series* with each value replaced by its code in *mapping*.

    ``Series.map`` silently converts any value that is missing from the
    dictionary into NaN. Rows with missing values are dropped before this
    step, so such NaNs would reach the model inputs unnoticed. To prevent
    that, unknown values are rejected up front.

    Raises:
        ValueError: if *series* contains a value that is not a key of
            *mapping* (this also happens when the cell is executed a second
            time on already-encoded data, instead of wiping the column).
    """
    unknown = set(series.unique()) - set(mapping)
    if unknown:
        raise ValueError(
            "Column '{}' contains values without a mapping: {}".format(
                series.name, sorted(unknown, key=str)
            )
        )
    return series.map(mapping)


df["neighbourhood"] = _encode_categories(df["neighbourhood"], MAP_NEIGHB)
df["room_type"] = _encode_categories(df["room_type"], MAP_ROOM_TYPE)

## Split data into training and test sets

In [None]:
# Columns used as model inputs.
FEATURE_NAMES = [
    'neighbourhood',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
]

# Feature matrix and target vector.
X = df.loc[:, FEATURE_NAMES]
y = df.loc[:, 'category']

In [None]:
# Hold out 15% of the rows for evaluation (fixed seed for reproducibility).
# The split is stratified on the target so that the training and test sets keep
# the same class proportions; a purely random split can leave a rare class
# under-represented, or absent, in the small test set.
# NOTE: stratification requires at least two samples of every class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=1, stratify=y
)

## A simple classifier model

### Train the model

In [None]:
# Random forest of 500 trees with balanced class weights, trained on 4 parallel
# jobs; the seed is fixed so that repeated runs build the same forest.
clf = RandomForestClassifier(
    n_estimators=500,
    class_weight='balanced',
    random_state=0,
    n_jobs=4,
)

# Fit on the training split.
clf.fit(X=X_train, y=y_train)

### Evaluate the model

In [None]:
# Predicted class label for every row of the held-out test features.
y_pred = clf.predict(X_test)

In [None]:
# Overall accuracy: fraction of test rows whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Overall one-versus-rest area under the ROC curve, computed from the
# predicted class probabilities of the test rows.

y_proba = clf.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=y_proba, multi_class='ovr')

In [None]:
# Rank the features from most to least important.
importances = clf.feature_importances_
indices = np.flip(np.argsort(importances))
features, importances = X_train.columns[indices], importances[indices]

# Horizontal bar chart, most important feature on top.
fig, ax = plt.subplots(figsize=(12, 7))
ax.barh(range(len(importances)), importances)
ax.set_yticks(range(len(importances)))
ax.set_yticklabels(features, fontsize=12)
ax.set_xlabel("Feature importance", fontsize=12)
ax.invert_yaxis()

plt.show()

In [None]:
classes = [0, 1, 2, 3]
labels = ['low', 'mid', 'high', 'lux']

# Row-normalised confusion matrix: every row (true class) sums to 1, so each
# cell is the fraction of that class predicted as the column class.
# normalize='true' lets scikit-learn do the division and yields zeros instead of
# NaNs for a class that has no samples in y_test.
c = confusion_matrix(y_test, y_pred, normalize='true')

# The tick labels below are only meaningful when there is one row per class.
if c.shape[0] != len(classes):
    raise ValueError(
        "Confusion matrix has {} classes, expected {}".format(c.shape[0], len(classes))
    )

# Plot
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    c,
    annot=True,
    cmap='BuGn',
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    cbar=False,
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)
ax.set_xlabel('Predicted', fontsize=16)
ax.set_ylabel('Real', fontsize=16)
# Heatmap cells are one unit wide, so cell centres sit at 0.5, 1.5, ...
ax.set_xticks(np.arange(.5, len(classes)))
ax.set_xticklabels(labels, rotation=0, fontsize=12)
ax.set_yticks(np.arange(.5, len(classes)))
ax.set_yticklabels(labels, rotation=0, fontsize=12)
ax.set_title("Simple model", fontsize=18)

plt.show()

In [None]:
# Display name for each class, keyed by the label as it appears in the report
# when the target values are floats.
maps = {'0.0': 'low', '1.0': 'mid', '2.0': 'high', '3.0': 'lux'}

# Per-class precision / recall / f1-score / support as a nested dictionary.
report = classification_report(y_test, y_pred, output_dict=True)

# One row per report entry; the last three rows are dropped so that only the
# per-class rows remain.
df_report = pd.DataFrame.from_dict(report).T[:-3]

# The report keys are the string form of the labels, so integer labels give
# '0' rather than '0.0'. Normalise through float so that both spellings
# resolve to the same display name instead of raising KeyError.
df_report.index = [maps[str(float(key))] for key in df_report.index]
df_report

In [None]:
# One horizontal bar chart per report column, side by side.
metrics = ['precision', 'recall', 'support']

fig, axes = plt.subplots(1, len(metrics), figsize=(16, 7))

for ax, metric in zip(axes, metrics):
    ax.barh(df_report.index, df_report[metric], alpha=0.9)
    ax.set_xlabel(metric, fontsize=12)
    ax.tick_params(axis='both', which='major', labelsize=12)
    # First class on top, matching the row order of the report table
    ax.invert_yaxis()
    # Hide the top and right frame lines
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

fig.suptitle("Simple model", fontsize=14)
plt.show()

In [None]:
# Persist the fitted classifier to the models directory.
# The directory is created if it does not exist yet, and the file is opened in
# a `with` block so that the handle is flushed and closed even if pickling fails.
DIR_MODELS.mkdir(parents=True, exist_ok=True)
with open(DIR_MODELS / "simple_classifier.pkl", 'wb') as model_file:
    pickle.dump(clf, model_file)