In [None]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Seed NumPy's global random number generator so that code drawing from it
# produces the same values on every fresh run of the notebook.
np.random.seed(0)

In [None]:
# Raise pandas' display limits so long and wide frames are shown without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

### Load in Recipe Data from Epicurious 

Load in 13,000 recipes from Epicurious. We want to see whether, given the nutritional values of a recipe — particularly calories, protein, fat and sodium — we can predict what rating that recipe would receive.

In [None]:
# Read the recipe CSV (UTF-8 encoded) into a DataFrame and preview the first rows.
recipes_csv = 'epicurious-recipes.csv'
df = pd.read_csv(recipes_csv, encoding='utf-8')
df.head()

### Let's look at the data to see what relationships might exist

We'll do a pairplot to see if there are any correlations

In [None]:
# Plot every pairing of the rating and the four nutritional columns against each other.
pairplot_columns = ['rating', 'calories', 'protein', 'fat', 'sodium']
sns.pairplot(df, vars=pairplot_columns)

In [None]:
# Number of rows (recipes) in the loaded frame.
df.shape[0]

### Setup X and y variables

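X is going to be the values we input into the model. It's conventionally capitalized because it is a matrix (one row per recipe, one column per feature), while the lowercase y is a single column of target values, as in f(X) = y. For the random forest, we set up the following variables: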

X - All of the numerical variables that we will use to predict a new value.
y - The value we're trying to predict: the recipe's rating.

In [None]:
# Features: the four nutritional columns used to predict a recipe's rating.
X = df[['calories', 'protein', 'fat', 'sodium']]
# Target: the rating scaled by 1000 and stored as an integer (later cells divide
# by 1000 to get back to rating units). Round before casting: truncating with
# int() would turn a product that floating-point error leaves just below a whole
# number (e.g. 1000.9999999999999) into the next integer down.
y = (df['rating'] * 1000).round().astype('int64')

### Split our data into four parts

This will create four variables:

* `X_train` - This will have 80 percent rows in our X columns. They're randomly chosen.
* `X_test`  - This will have the remaining 20 percent of the X columns to test against.
* `y_train` - This will have 80 percent of our y or (rating) column, randomly selected.
* `y_test`  - This will have the remaining 20 percent of the ratings to test against.

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# re-running this cell on its own yields the same split, instead of depending on
# whatever state the global NumPy random generator is in at that moment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('Number of observations in the training data:', len(X_train))
print('Number of observations in the test data:',len(X_test))

### Set up our regressor

* `n_jobs` specifies how much processor we're devoting to this task (concurrent processes).
* `n_estimators` is how many trees in the forest to use (100 is default in recent versions).
* `random_state` is the random seed. It's optional, but specifying it will give you the same results every time this is run. 

In [None]:
# Random forest regressor: 100 trees, built with 2 parallel jobs, fixed seed.
clf = RandomForestRegressor(
    n_estimators=100,
    n_jobs=2,
    random_state=0,
)

### Setup the Actual Model

This is the "machine learning" part, where we fit our model based on the training data.

In [None]:
# Fit the forest on the training features and their scaled ratings.
clf.fit(X=X_train, y=y_train)

### Predict using the test data

Now that the model is fitted, we'll predict ratings for the 20 percent of the data we held out for testing.

In [None]:
# Predicted (scaled) ratings for the held-out test rows.
y_pred = clf.predict(X=X_test)

## Create table of predictions

Now that we have our predictions, let's see how close they are to the actual ratings by putting both in a table and graphing the differences.

In [None]:
# Convert back to rating units by dividing by 1000. Plain array division is
# vectorized, whereas np.vectorize runs a Python-level loop and raises on empty
# input. y_test is converted to an array first so the table gets a fresh 0..n-1
# index instead of inheriting y_test's shuffled row labels.
act_rating = np.asarray(y_test) / 1000
pre_rating = np.asarray(y_pred) / 1000

# Actual vs. predicted rating per test row, plus the signed prediction error.
df2 = pd.DataFrame({'Actual': act_rating, 'Predicted': pre_rating})
df2['Difference'] = df2['Predicted'] - df2['Actual']
df2.head(15)

In [None]:
# Distribution of the signed prediction errors.
prediction_errors = df2['Difference']
sns.displot(prediction_errors)

In [None]:
# Spread of the prediction errors, in rating units.
difference_std = df2['Difference'].std()
print("Standard deviation:", difference_std)

### Let's see how accurate we were

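For a regressor, `score` returns the coefficient of determination ($R^2$), not an accuracy percentage. A value of 1.0 is a perfect fit, 0.0 means the model does no better than always predicting the mean rating, and negative values mean it does worse than that. It does _not_ tell us how often a prediction was exactly right.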

In [None]:
# score() on a regressor returns the coefficient of determination (R^2) on the
# given data, not a classification accuracy, so label the printed value as such.
print("R^2 score:", clf.score(X_test, y_test))

### Let's see which of the variables our model preferred

This will tell us how much weight our model gave each variable

In [None]:
# Pair each training column with the importance the fitted forest assigned to it,
# ordered from most to least important.
importance_by_variable = {
    'variable': list(X_train.columns),
    'importance': clf.feature_importances_,
}
feature_imp = pd.DataFrame(importance_by_variable).sort_values(by='importance', ascending=False)
feature_imp

In [None]:
# Horizontal bar chart of the importance per variable.
sns.catplot(
    data=feature_imp,
    x='importance',
    y='variable',
    kind='bar',
    orient='h',
)

### Predict a one-off

Let's provide values for calories, protein, fat and sodium and see what rating it would predict

In [None]:
# One hypothetical recipe, in the order: calories, protein, fat, sodium.
# Wrap it in a DataFrame that carries the training column names; passing a bare
# list to a model fitted on a DataFrame triggers scikit-learn's feature-name warning.
oneoff_features = pd.DataFrame([[100, 10, 7, 550]], columns=X_train.columns)
oneoff = clf.predict(oneoff_features)

# predict() returns a one-element array. Take the element explicitly: calling
# float() on the array itself is deprecated since NumPy 1.25. Divide by 1000 to
# undo the scaling applied to the target.
print("Predicted rating: ", float(oneoff[0] / 1000))