In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data.csv')
df.dropna(subset=['price'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [3]:
features = list(df.columns)
target = 'price'
features.remove(target)

X = df[features]
y = df[target].str.strip("$").str.replace(",","").astype(float)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
class Data_Transformer(object):
    
    def fit(self, X, y=None):
        new_df = pd.DataFrame()
        new_df["Weight"] = X["weight"].map(self.weight2num) # convert weight to numerical value
        self.mean_weight = new_df["Weight"].mean() # obtain mean weight
        new_df["Weight"].fillna(self.mean_weight,inplace=True) # fill in missing weight with mean weight
        new_df["Month"] = pd.to_datetime(X["purchase_date"]).dt.month # convert purchase date to purchase weekday
        self.majority_month = new_df["Month"].mode()[0] # obtain majority purchase month
        new_df["Month"].fillna(self.majority_month,inplace=True) # fill in missing purchase month with majority purchase month
        new_df["Weekday"] = pd.to_datetime(X["purchase_date"]).dt.weekday # convert purchase date to purchase weekday
        self.majority_weekday = new_df["Weekday"].mode()[0] # obtain majority purchase weekday
        new_df["Weekday"].fillna(self.majority_weekday,inplace=True) # fill in missing purchase weekday with majority purchase weekday
        new_df["Ingredient Number"] = X["ingredient"].map(self.get_numbers) # obtain number of ingredients in recipe
        self.mean_ingredient_number = new_df["Ingredient Number"].mean() # obtain mean ingredient number
        new_df['Ingredient Number'].fillna(self.mean_ingredient_number,inplace=True) # fill in missing ingredient number with median ingredient number
        self.pl_le = LabelEncoder() # create label-encoder
        new_df["Product Level"] = pd.Series(self.pl_le.fit_transform(X["product_level"])) # fit and transform product level with label-encoder
        self.majority_product_level = new_df["Product Level"].mode()[0] # obtain majority product level code
        new_df["Product Level"].fillna(self.majority_product_level,inplace=True) # fill in missing product level with majority product level code
        self.pt_le = LabelEncoder() # create label-encoder
        new_df["Cost"] = X["cost"].str.strip("$").str.strip("k").astype(float)*1000 # convert cost to numerical value
        self.cost_imputer = XGBRegressor() # create a XGBoost imputer for cost
        df_for_imputing_cost = new_df.dropna() # create training data for cost imputer by dropping missing data
        self.cost_imputer.fit(df_for_imputing_cost[["Weight","Month","Weekday","Ingredient Number","Product Level"]], df_for_imputing_cost["Cost"]) # fit cost imputer
        
    def transform(self, X, y=None):
        new_df = pd.DataFrame()
        new_df["Weight"] = X["weight"].map(self.weight2num) # convert weight to numerical value
        new_df["Weight"].fillna(self.mean_weight,inplace=True) # fill in missing weight with mean weight
        new_df["Month"] = pd.to_datetime(X["purchase_date"]).dt.month # convert purchase date to purchase month
        new_df["Month"].fillna(self.majority_month,inplace=True) # fill in missing purchase month with majority purchase month
        new_df["Weekday"] = pd.to_datetime(X["purchase_date"]).dt.weekday # convert purchase date to purchase weekday
        new_df["Weekday"].fillna(self.majority_weekday,inplace=True) # fill in missing purchase weekday with majority purchase weekday
        new_df['Ingredient Number'] = X["ingredient"].map(self.get_numbers) # obtain number of ingredients in recipe
        new_df['Ingredient Number'].fillna(self.mean_ingredient_number,inplace=True) # fill in missing ingredient number with mean ingredient number
        new_df["Product Level"] = self.pl_le.transform(X["product_level"]) # transform product level with label-encoder
        new_df["Product Level"].fillna(self.majority_product_level,inplace=True) # fill in missing product level with majority product level code
        new_df["Cost"] = X["cost"].str.strip("$").str.strip("k").astype(float)*1000 # convert cost to numerical value
        imputed_cost = pd.Series(self.cost_imputer.predict(new_df[new_df["Cost"].isnull()][["Weight","Month","Weekday","Ingredient Number","Product Level"]])) # obtain imputed cost
        imputed_cost.index = new_df[new_df["Cost"].isnull()]["Cost"].index # set index of imputed cost
        new_df["Cost"].fillna(imputed_cost,inplace=True) # fill in missing cost with imputed cost
        return new_df # return new_df
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)
    
    def weight2num(self, x): # function to convert weight to number
        if type(x) == str:
            x = x.strip('Kg').split(' Ton ')
            return float(x[0])*1000+float(x[1])
        else:
            return x
        
    def get_numbers(self, x): # function to get number of ingredients in recipe
        if type(x) == str:
            return len(x.split(','))
        else:
            return x

In [6]:
from tensorflow.keras import layers, models
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from scikeras.wrappers import KerasRegressor
from sklearn.metrics import make_scorer, r2_score

In [7]:
def create_nn_model():
    nn_model = models.Sequential()
    nn_model.add(layers.Dense(64, activation='relu', input_shape=(6,)))
    nn_model.add(layers.Dense(64, activation='relu'))
    nn_model.add(layers.Dense(32, activation='relu'))
    nn_model.add(layers.Dense(1, activation=None))

    nn_model.compile(loss='mean_squared_error', metrics=['mean_absolute_percentage_error'])

    return nn_model

In [8]:
steps =[('dtf', Data_Transformer()),
        ('scaler', MinMaxScaler()),
        ('keras_model', KerasRegressor(model=create_nn_model, verbose=0))]
nn_model = Pipeline(steps)

In [9]:
parameters = {'keras_model__batch_size': [30, 60, 100],
                'keras_model__epochs': [100, 200, 500],
                'keras_model__optimizer': ['rmsprop', 'adam']}
scorer = make_scorer(r2_score, greater_is_better=True)

In [10]:
model_gsv = GridSearchCV(estimator=nn_model, param_grid=parameters, cv=5, scoring=scorer)
model_gsv = model_gsv.fit(X_train, y_train)

In [11]:
model_gsv.best_params_

{'keras_model__batch_size': 30,
 'keras_model__epochs': 500,
 'keras_model__optimizer': 'adam'}

In [12]:
result = model_gsv.cv_results_
result = pd.DataFrame(result)[['param_keras_model__batch_size', 'param_keras_model__epochs','param_keras_model__optimizer', 'mean_test_score', 'rank_test_score']]
result

Unnamed: 0,param_keras_model__batch_size,param_keras_model__epochs,param_keras_model__optimizer,mean_test_score,rank_test_score
0,30,100,rmsprop,0.376278,11
1,30,100,adam,0.375248,13
2,30,200,rmsprop,0.406192,7
3,30,200,adam,0.405459,8
4,30,500,rmsprop,0.40939,2
5,30,500,adam,0.410178,1
6,60,100,rmsprop,0.320822,15
7,60,100,adam,0.302843,16
8,60,200,rmsprop,0.398755,10
9,60,200,adam,0.399093,9


In [13]:
y_train_pred = model_gsv.predict(X_train)
y_test_pred = model_gsv.predict(X_test)

In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [15]:
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

train MAE: 1.07e+05
train MSE: 2.86e+10
train R2: 0.435


In [16]:
print('test MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('test MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('test R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

test MAE: 1.08e+05
test MSE: 2.46e+10
test R2: 0.393
