# Week 6 - Machine Learning practical

## Exploring the Hidden Gems dataset

In [None]:
# import the modules we will need to use 
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

# suppressing some advisory warnings that don't affect what we're doing here
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# machine learning models and utilities 
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### Import the data

In [None]:
# location of the Hidden Gems CSV file on GitHub
hidden_gems = (
    "https://raw.githubusercontent.com/jargonautical/bsuBootcampCohort5/"
    "refs/heads/main/netflix-rotten-tomatoes-metacritic-imdb.csv"
)
# download the file and parse it into a DataFrame
df = pd.read_csv(filepath_or_buffer=hidden_gems)


### Inspect the data

In [None]:
# show the column names; to see a different view, uncomment ONE of the lines
# below (the notebook displays only the last expression in a cell)
df.columns
#df.head()      # the first five rows
#df.sample()    # one row picked at random
#df.describe()  # summary statistics

### Shaping our data 
  
Different machine learning models need the input data shaped in specific ways.  
In general, they need:
* Numeric values for the variables,  
* A target variable (the 'answer' we're looking for).  

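The target variable is the class or category we want to detect, in this case whether or not a movie has won any awards. The input variables must not include the target itself, or any column the target is calculated from; otherwise the model is simply handed the answer.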

### Create a class target variable

In [None]:
# binary target column: 1 when 'Awards Received' is greater than 0, otherwise 0
# (a missing value compares as False, so it also becomes 0)
df['winner'] = df['Awards Received'].gt(0).astype('int64')

### Select the numeric variables we will use 

In [None]:
# we want to use only numeric values, so we supply a list of the numeric dtypes to keep.
# .copy() makes trim_df an independent frame, so assigning to its columns later
# neither raises SettingWithCopyWarning nor affects df
trim_df = df.select_dtypes(include=['float64', 'int64']).copy()
trim_df.columns

In [None]:
# preview the first five rows of trim_df
trim_df.head()

In [None]:
# replace missing 'Awards Received' values with 0 so this column has no gaps,
# then preview the result
trim_df['Awards Received'] = trim_df['Awards Received'].fillna(value=0)
trim_df.head(n=5)


In [None]:
# 'winner' was calculated directly from 'Awards Received', so leaving that column
# among the inputs would hand the model the answer (target leakage).
# Drop it here - without modifying trim_df, so this cell can be re-run safely -
# and then discard every row that still has a missing value.
in_df = trim_df.drop(columns=['Awards Received']).dropna()

### Defining our model inputs X and y

In [None]:
# y is the target column; X is every other column of in_df
y = in_df['winner']
X = in_df.drop(columns='winner')

### The train-test split

In [None]:
# hold back a quarter of the rows for testing (0.25 is scikit-learn's default share);
# the fixed random_state makes the shuffle - and so the split - repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

### Decision Trees

In [None]:
# Fit an unrestricted decision tree (DecisionTreeClassifier is imported in the first cell).
# random_state is fixed so the fitted tree, and therefore the scores below,
# come out the same on every run
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# compare accuracy on the data the tree was trained on with accuracy on unseen data
print(f'Accuracy of Decision Tree classifier on training set: {clf.score(X_train, y_train):.2f}')
print(f'Accuracy of Decision Tree classifier on test set: {clf.score(X_test, y_test):.2f}')

#### Setting max decision tree depth to help avoid overfitting

In [None]:
# Same model, but limited to three levels of splits.
# random_state is fixed so the fitted tree, and therefore the scores below,
# come out the same on every run
clf2 = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)

# compare accuracy on the data the tree was trained on with accuracy on unseen data
print(f'Accuracy of Decision Tree classifier on training set: {clf2.score(X_train, y_train):.2f}')
print(f'Accuracy of Decision Tree classifier on test set: {clf2.score(X_test, y_test):.2f}')

#### Visualizing decision trees

In [None]:
# Putting the feature names and class names into variables.
# feature names as a plain list (plot_tree in some scikit-learn releases
# does not accept a pandas Index)
fn = list(X.columns)
# class names must be given in ascending order of the class labels:
# label 0 -> 'loser', label 1 -> 'winner'
cn = ['loser', 'winner']

# create the figure explicitly so it can be resized and saved;
# plot_tree also accepts max_depth=... to draw only the top levels of a large tree
fig, ax = plt.subplots(figsize=(12, 8))
tree.plot_tree(clf,
               feature_names=fn,
               class_names=cn,
               filled=True,
               ax=ax);
# uncomment to save the figure (the target folder must already exist)
#fig.savefig('../images/plottreedefault.png')

#### Feature importance

In [None]:
def plot_feature_importances(clf, feature_names):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    The bars are drawn through ``plt`` onto the current matplotlib figure;
    this function neither creates a figure nor shows it.

    Parameters
    ----------
    clf : object with a ``feature_importances_`` attribute
        The fitted model whose importances are plotted.
    feature_names : sequence
        Labels for the bars; entry ``i`` labels the bar for importance ``i``.

    Returns
    -------
    None
    """
    importances = clf.feature_importances_
    # one bar position per feature, shared by the bars and their tick labels
    positions = np.arange(len(feature_names))
    plt.barh(positions, importances)
    plt.yticks(positions, feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature name")

# open a wide, short figure, draw the importance bars for clf on it, and display it
plt.figure(figsize=(10, 4), dpi=80)
plot_feature_importances(clf, fn)
plt.show()

# the same importances as raw numbers
print(f'Feature importances: {clf.feature_importances_}')