# Week 6 - Machine Learning practical

## Exploring the cities dataset

In [None]:
# import the modules we will need to use 
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

# suppressing some advisory warnings that don't affect what we're doing here
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# machine learning models and utilities 
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Address of the raw CSV file hosted on GitHub
data = "https://raw.githubusercontent.com/jargonautical/r2d3-part-1-data/refs/heads/master/part_1_data.csv"

# Read the CSV straight from the URL into a DataFrame
city_df = pd.read_csv(filepath_or_buffer=data)

# Preview the first few rows to check the load worked
city_df.head()

### Define our X and y  
* X: the factors, attributes or features.  
* y: the target variable or label.

__NOTE__: upper case `X` and lower case `y` are a common naming convention in machine learning models.

In [None]:
# Features: every column except the label
X = city_df.drop(columns='in_sf')
# Target: the label column on its own
y = city_df.loc[:, 'in_sf']

### Train/Test split  

* Once we have defined our features and our labels, we take a subset or 'slice' of the data to use in training our model.  
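* We can specify the proportion of the data to hold back for testing (`test_size`, 25% by default), and set a `random_state` so that the random shuffle, and therefore the split, is the same every time the code is run.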
* Then we pass the `X` and `y` we have just defined to the function to create four new objects.
    * `X_train` and `y_train` are the slices of `X` and `y` that will be used to train the model.
    * `X_test` and `y_test` will be held back to test the model after it has been trained.


In [None]:
# Hold back 25% of the rows for testing (the function's default, spelled out
# here); random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

### Instantiating a model  

* Now we need to call (or create, or instantiate) an instance of an object called `DecisionTreeClassifier`.
* The steps are:
    * Call an instance e.g. `clf = DecisionTreeClassifier()`
    * Fit the instance `.fit()`
    * And tell the model what data it is being fitted to `X_train, y_train`

In [None]:
# Create the classifier and fit it to the training data in one step.
# random_state is fixed because the tree breaks ties between equally good
# splits at random; without a seed the fitted tree (and its scores) can
# differ from one run of the notebook to the next.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

### Test and evaluate  

* Now we can evaluate our model using `clf.score()`, which returns the accuracy (the proportion of correct predictions) on the data we pass in.  
* To see how well it managed to fit the training set, we call it on the training data `X_train, y_train`.
* To see its performance on new and unseen data, we do the same but with `X_test, y_test`

In [None]:
# Accuracy on the data the model was fitted to, and on the held-back data
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(test_accuracy))

### Visualising the model  
* Some machine learning models in sklearn come with dedicated functions for visualising them.
* For a Decision Tree Classifier, we can call `tree.plot_tree()`.

In [None]:
# Putting the feature names and class names into variables.
# feature_names is passed as a plain list: some scikit-learn releases reject
# a pandas Index for this parameter.
fn = list(X.columns)
# plot_tree expects class names in ascending order of the label values.
# NOTE(review): assumes in_sf is coded 0 = New York, 1 = San Francisco - confirm
# against the dataset.
cn = ['ny', 'sf']
tree.plot_tree(clf,
               feature_names=fn,
               class_names=cn,
               filled=True);

### Co-efficients  
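* Strictly speaking, a decision tree has no coefficients; what it gives us instead are *feature importances*: each feature's share of the total reduction in impurity achieved by the splits that use it, scaled so that the values add up to 1.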
* We can call and print them using `clf.feature_importances_`, and we can also plot them on a bar chart for easy comparison.

In [None]:
def plot_feature_importances(clf, fn):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    The bars are drawn with the pyplot interface onto whatever figure is
    current; this function neither creates nor shows a figure.

    Parameters
    ----------
    clf : object
        A fitted model exposing a ``feature_importances_`` attribute.
    fn : sequence of str
        Feature names used as y-axis tick labels. Bar ``i`` is labelled
        ``fn[i]``, so the names are expected to be in the same order as
        ``clf.feature_importances_``.
    """
    n_features = len(fn)
    bar_positions = np.arange(n_features)

    # One bar per feature, labelled with the feature's name
    plt.barh(bar_positions, clf.feature_importances_)
    plt.yticks(bar_positions, fn)

    plt.xlabel("Feature importance")
    plt.ylabel("Feature name")

# Wide, short canvas so the horizontal bars have room
plt.figure(dpi=80, figsize=(10, 4))
plot_feature_importances(clf, fn)
plt.show()

# The same numbers as the chart, printed as an array
importances = clf.feature_importances_
print('Feature importances: {}'.format(importances))

### Model tuning  
* Given our classifier's perfect score in training, and the drop in test score, it's possible our model is overfitted!
* We can simplify and tune it in several ways.
* The simplest option is to constrain it to a small number of levels using `max_depth`.

In [None]:
# Same model, but limited to four levels of splits to reduce overfitting.
# random_state is fixed because the tree breaks ties between equally good
# splits at random; without a seed the scores below can vary between runs.
clf2 = DecisionTreeClassifier(max_depth=4, random_state=0).fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf2.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf2.score(X_test, y_test)))

In [None]:
# Putting the feature names and class names into variables.
# feature_names is passed as a plain list: some scikit-learn releases reject
# a pandas Index for this parameter.
fn = list(X.columns)
# plot_tree expects class names in ascending order of the label values.
# NOTE(review): assumes in_sf is coded 0 = New York, 1 = San Francisco - confirm
# against the dataset.
cn = ['ny', 'sf']
tree.plot_tree(clf2,
               feature_names=fn,
               class_names=cn,
               filled=True);

In [None]:
# Importance chart for the depth-limited model
plt.figure(dpi=80, figsize=(10, 4))
plot_feature_importances(clf2, fn)
plt.show()

# The same numbers as the chart, printed as an array
importances = clf2.feature_importances_
print('Feature importances: {}'.format(importances))

### Feature selection  
* The goal is to find out which features have the most weight in the model (the largest feature importances).
* If we can identify those, we can remove anything that has very little impact on the accuracy of the model overall.  
* In this case, we can use `elevation` and `price_per_sqft` for a simpler model that is reasonably accurate.

In [None]:
# Keep only the label and the two features selected for the simpler model
trim_columns = ['in_sf', 'elevation', 'price_per_sqft']
trim_df = city_df[trim_columns]
trim_df.head()

In [None]:
# Rebuild the features and target from the trimmed frame.
# Note: this replaces the X, y and train/test objects created earlier.
X = trim_df.drop(columns=['in_sf'])
y = trim_df['in_sf']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# A shallow tree on two features. random_state is fixed because the tree
# breaks ties between equally good splits at random; without a seed the
# scores below can vary between runs.
clf3 = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf3.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf3.score(X_test, y_test)))

In [None]:
# Putting the feature names and class names into variables.
# feature_names is passed as a plain list: some scikit-learn releases reject
# a pandas Index for this parameter.
fn = list(X.columns)
# plot_tree expects class names in ascending order of the label values.
# NOTE(review): assumes in_sf is coded 0 = New York, 1 = San Francisco - confirm
# against the dataset.
cn = ['ny', 'sf']
tree.plot_tree(clf3,
               feature_names=fn,
               class_names=cn,
               filled=True);

In [None]:
# Feature names for the two-feature model
fn = X.columns

# plot_feature_importances is defined once, earlier in the notebook, so it is
# reused here rather than defined a second time.
plt.figure(figsize=(10,4), dpi=80)
plot_feature_importances(clf3, fn)
plt.show()

# Print the importances of the model that was just plotted (clf3), not the
# first model (clf), which was trained on a different set of features.
print('Feature importances: {}'.format(clf3.feature_importances_))