# Capstone Pre-processing and Training Data Development - Jessica Williams

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

from sb_utils import save_file


In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVR


In [3]:
#NMF imports
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

First let’s import the data we worked with in the EDA portion.

In [4]:
# Load the first EDA output file (recipe_attributes1.csv).
# The data folder can be overridden with the CAPSTONE_DATA_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# original location, so behavior is unchanged when the variable is unset.
data_dir = os.environ.get('CAPSTONE_DATA_DIR', 'C:/Users/jwatki8/Downloads/My Capstone project')
recipe_attributes1 = pd.read_csv(f'{data_dir}/recipe_attributes1.csv')

In [5]:
# Load the second EDA output file (recipe_attributes2.csv).
# The data folder can be overridden with the CAPSTONE_DATA_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# original location, so behavior is unchanged when the variable is unset.
data_dir = os.environ.get('CAPSTONE_DATA_DIR', 'C:/Users/jwatki8/Downloads/My Capstone project')
recipe_attributes2 = pd.read_csv(f'{data_dir}/recipe_attributes2.csv')

In [6]:
# Load the third EDA output file (recipe_attributes3.csv).
# The data folder can be overridden with the CAPSTONE_DATA_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# original location, so behavior is unchanged when the variable is unset.
data_dir = os.environ.get('CAPSTONE_DATA_DIR', 'C:/Users/jwatki8/Downloads/My Capstone project')
recipe_attributes3 = pd.read_csv(f'{data_dir}/recipe_attributes3.csv')

In [7]:
# Summarize columns, dtypes and non-null counts of the first attribute table.
recipe_attributes1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231634 entries, 0 to 231633
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   recipe_id      231634 non-null  int64  
 1   minutes        231634 non-null  int64  
 2   n_steps        231634 non-null  float64
 3   n_ingredients  231634 non-null  float64
 4   rating_mean    231634 non-null  float64
 5   review_count   231634 non-null  int64  
 6   polarity_avg   231634 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 12.4 MB


In [8]:
# Summarize the second attribute table (wide frame, so pandas prints only a column range).
recipe_attributes2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231634 entries, 0 to 231633
Columns: 212 entries, recipe_id to ingr_wine
dtypes: float64(211), int64(1)
memory usage: 374.7 MB


In [9]:
# Summarize columns, dtypes and non-null counts of the third attribute table.
recipe_attributes3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 55 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   recipe_id                   231637 non-null  int64  
 1   rating_mean                 231637 non-null  float64
 2   polarity_avg                231637 non-null  float64
 3   ingr_baking powder          231637 non-null  int64  
 4   ingr_baking soda            231637 non-null  int64  
 5   ingr_black pepper           231637 non-null  int64  
 6   ingr_brown sugar            231637 non-null  int64  
 7   ingr_cheddar cheese         231637 non-null  int64  
 8   ingr_cream cheese           231637 non-null  int64  
 9   ingr_garlic cloves          231637 non-null  int64  
 10  ingr_ground black           231637 non-null  int64  
 11  ingr_lemon juice            231637 non-null  int64  
 12  ingr_olive oil              231637 non-null  int64  
 13  ingr_parmesan 

Next let’s create a dataframe that includes the numeric features that we narrowed down in the EDA portion.

In [10]:
# recipe_attributes1 already carries rating_mean and polarity_avg, so remove
# them from recipe_attributes3 before merging to avoid duplicated (_x/_y) columns.
# errors='ignore' keeps this cell re-runnable: because the name is rebound to
# the reduced frame, a second run would otherwise raise KeyError.
recipe_attributes3 = recipe_attributes3.drop(
    columns=['rating_mean', 'polarity_avg'],
    errors='ignore',
)

In [11]:
# Combine the two tables by key rather than by position: the frames have
# different row counts (231634 vs 231637), so a positional concat would misalign rows.
recipe_attributes_p = pd.merge(
    recipe_attributes1,
    recipe_attributes3,
    on='recipe_id',
)

In [12]:
# Inspect the merged frame: columns, dtypes and non-null counts.
recipe_attributes_p.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 231634 entries, 0 to 231633
Data columns (total 59 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   recipe_id                   231634 non-null  int64  
 1   minutes                     231634 non-null  int64  
 2   n_steps                     231634 non-null  float64
 3   n_ingredients               231634 non-null  float64
 4   rating_mean                 231634 non-null  float64
 5   review_count                231634 non-null  int64  
 6   polarity_avg                231634 non-null  float64
 7   ingr_baking powder          231634 non-null  int64  
 8   ingr_baking soda            231634 non-null  int64  
 9   ingr_black pepper           231634 non-null  int64  
 10  ingr_brown sugar            231634 non-null  int64  
 11  ingr_cheddar cheese         231634 non-null  int64  
 12  ingr_cream cheese           231634 non-null  int64  
 13  ingr_garlic cl

In [13]:
# (rows, columns) of the merged frame.
recipe_attributes_p.shape

(231634, 59)

Before considering imputation, I will check the dataset for missing values. I expect few or none, since the data was cleaned in the previous steps.

In [14]:
# Missing values per column; the printed grand total is 0 when nothing is missing.
counts = recipe_attributes_p.isnull().sum()
total_missing = counts.sum()
print(total_missing)

0


From this we can see that our data contains no missing values.

Next we can start working on creating a train and test set for our new recipe attributes dataframe. Because I want to examine the possibility of more than one target variable, I want to create two different train and test sets. One set will have a y variable of ‘rating_mean’ and the other will have a y variable of ‘polarity_avg’.

In [15]:
# Expected number of rows in a 70/30 train/test split.
n_rows = len(recipe_attributes_p)
n_rows * .7, n_rows * .3

(162143.8, 69490.2)

In [16]:
# Split for the first target (rating_mean): 30% of rows held out for testing,
# with a fixed random_state so the split is reproducible.
# NOTE(review): polarity_avg stays in X as a feature here; confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    recipe_attributes_p.drop('rating_mean', axis=1),
    recipe_attributes_p['rating_mean'],
    test_size=0.3,
    random_state=22,
)

In [17]:
# Confirm the feature partitions match the expected 70/30 sizes.
X_train.shape, X_test.shape

((162143, 58), (69491, 58))

In [18]:
# Target vectors must have the same row counts as their feature partitions.
y_train.shape, y_test.shape

((162143,), (69491,))

In [19]:
# Split for the second target (polarity_avg), using the same test_size and
# random_state as the rating_mean split.
# NOTE(review): rating_mean stays in X as a feature here; confirm that is intended.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    recipe_attributes_p.drop('polarity_avg', axis=1),
    recipe_attributes_p['polarity_avg'],
    test_size=0.3,
    random_state=22,
)

In [20]:
# Confirm the second split's feature partitions match the expected 70/30 sizes.
X_train2.shape, X_test2.shape

((162143, 58), (69491, 58))

In [21]:
# Target vectors of the second split must match their feature partitions in length.
y_train2.shape, y_test2.shape

((162143,), (69491,))

I already have a dataframe that contains only numeric variables. The only column that should be dropped here is the recipe id column, as it is the identifier variable for the recipes rather than a feature.

In [22]:
# recipe_id only identifies a recipe, so set it aside and remove it from the features.
id_vars = ['recipe_id']

# Look the ids up in the merged frame via each partition's row labels instead of
# reading them from X_train/X_test: this keeps the cell re-runnable after
# recipe_id has already been removed from the partitions.
id_train = recipe_attributes_p.loc[X_train.index, id_vars]
id_test = recipe_attributes_p.loc[X_test.index, id_vars]

# Rebind rather than drop with inplace=True: in-place edits on the frames returned
# by train_test_split can raise SettingWithCopyWarning, and errors='ignore' makes a
# second run a no-op instead of a KeyError.
X_train = X_train.drop(columns=id_vars, errors='ignore')
X_test = X_test.drop(columns=id_vars, errors='ignore')
X_train.shape, X_test.shape

((162143, 57), (69491, 57))

In [23]:
# Verify every remaining training feature is numeric.
X_train.dtypes

minutes                         int64
n_steps                       float64
n_ingredients                 float64
review_count                    int64
polarity_avg                  float64
ingr_baking powder              int64
ingr_baking soda                int64
ingr_black pepper               int64
ingr_brown sugar                int64
ingr_cheddar cheese             int64
ingr_cream cheese               int64
ingr_garlic cloves              int64
ingr_ground black               int64
ingr_lemon juice                int64
ingr_olive oil                  int64
ingr_parmesan cheese            int64
ingr_purpose flour              int64
ingr_salt pepper                int64
ingr_sour cream                 int64
ingr_vanilla extract            int64
ingr_vegetable oil              int64
step_10 minutes                 int64
step_15 minutes                 int64
step_30 minutes                 int64
step_bring boil                 int64
step_large bowl                 int64
step_medium 

In [24]:
# Verify every remaining test feature is numeric.
X_test.dtypes

minutes                         int64
n_steps                       float64
n_ingredients                 float64
review_count                    int64
polarity_avg                  float64
ingr_baking powder              int64
ingr_baking soda                int64
ingr_black pepper               int64
ingr_brown sugar                int64
ingr_cheddar cheese             int64
ingr_cream cheese               int64
ingr_garlic cloves              int64
ingr_ground black               int64
ingr_lemon juice                int64
ingr_olive oil                  int64
ingr_parmesan cheese            int64
ingr_purpose flour              int64
ingr_salt pepper                int64
ingr_sour cream                 int64
ingr_vanilla extract            int64
ingr_vegetable oil              int64
step_10 minutes                 int64
step_15 minutes                 int64
step_30 minutes                 int64
step_bring boil                 int64
step_large bowl                 int64
step_medium 

In [25]:
# Same id handling for the polarity_avg split: set recipe_id aside and remove it
# from the features.
id_vars = ['recipe_id']

# Look the ids up in the merged frame via each partition's row labels so the cell
# stays re-runnable after recipe_id has been removed.
# NOTE(review): this rebinds id_train/id_test from the rating_mean split. Both
# splits use the same frame, test_size and random_state, so the ids should be
# identical; confirm before relying on either set separately.
id_train = recipe_attributes_p.loc[X_train2.index, id_vars]
id_test = recipe_attributes_p.loc[X_test2.index, id_vars]

# Rebind rather than drop with inplace=True: in-place edits on the frames returned
# by train_test_split can raise SettingWithCopyWarning, and errors='ignore' makes a
# second run a no-op instead of a KeyError.
X_train2 = X_train2.drop(columns=id_vars, errors='ignore')
X_test2 = X_test2.drop(columns=id_vars, errors='ignore')
X_train2.shape, X_test2.shape

((162143, 57), (69491, 57))

In [26]:
# Verify every remaining training feature of the second split is numeric.
X_train2.dtypes

minutes                         int64
n_steps                       float64
n_ingredients                 float64
rating_mean                   float64
review_count                    int64
ingr_baking powder              int64
ingr_baking soda                int64
ingr_black pepper               int64
ingr_brown sugar                int64
ingr_cheddar cheese             int64
ingr_cream cheese               int64
ingr_garlic cloves              int64
ingr_ground black               int64
ingr_lemon juice                int64
ingr_olive oil                  int64
ingr_parmesan cheese            int64
ingr_purpose flour              int64
ingr_salt pepper                int64
ingr_sour cream                 int64
ingr_vanilla extract            int64
ingr_vegetable oil              int64
step_10 minutes                 int64
step_15 minutes                 int64
step_30 minutes                 int64
step_bring boil                 int64
step_large bowl                 int64
step_medium 

In [27]:
# Verify every remaining test feature of the second split is numeric.
X_test2.dtypes

minutes                         int64
n_steps                       float64
n_ingredients                 float64
rating_mean                   float64
review_count                    int64
ingr_baking powder              int64
ingr_baking soda                int64
ingr_black pepper               int64
ingr_brown sugar                int64
ingr_cheddar cheese             int64
ingr_cream cheese               int64
ingr_garlic cloves              int64
ingr_ground black               int64
ingr_lemon juice                int64
ingr_olive oil                  int64
ingr_parmesan cheese            int64
ingr_purpose flour              int64
ingr_salt pepper                int64
ingr_sour cream                 int64
ingr_vanilla extract            int64
ingr_vegetable oil              int64
step_10 minutes                 int64
step_15 minutes                 int64
step_30 minutes                 int64
step_bring boil                 int64
step_large bowl                 int64
step_medium 

Now I have two different train test sets for modeling the data. This will allow me to see the performance based on two different target variables and compare the results.

 Let’s move on to scaling.

In [28]:
# Standardize the rating_mean split. The scaler is fit on the training features
# only, and the same fitted transform is then applied to both partitions so no
# test-set statistics leak into the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [29]:
# Standardize the polarity_avg split, again fitting on the training features only.
# NOTE(review): `scaler` is rebound here, so from this cell on it refers to the
# scaler fit on X_train2, not the one fit on X_train.
scaler = StandardScaler().fit(X_train2)
X_train2_scaled, X_test2_scaled = scaler.transform(X_train2), scaler.transform(X_test2)

Now we have:

1. Created X and y variables

2. Checked for missing values (none were found, so no imputation was needed).

3. Performed a train test split on the data.

4. Scaled the data.

We are ready to begin modeling.
