In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data.csv')
df.dropna(subset=['price'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [3]:
features = list(df.columns)
target = 'price'
features.remove(target)

X = df[features]
y = df[target].str.strip("$").str.replace(",","").astype(float)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
class Data_Transformer(object):
    
    def fit(self, X, y=None):
        new_df = pd.DataFrame()
        new_df["Weight"] = X["weight"].map(self.weight2num) # convert weight to numerical value
        self.mean_weight = new_df["Weight"].mean() # obtain mean weight
        new_df["Weight"].fillna(self.mean_weight,inplace=True) # fill in missing weight with mean weight
        new_df["Month"] = pd.to_datetime(X["purchase_date"]).dt.month # convert purchase date to purchase weekday
        self.majority_month = new_df["Month"].mode()[0] # obtain majority purchase month
        new_df["Month"].fillna(self.majority_month,inplace=True) # fill in missing purchase month with majority purchase month
        new_df["Weekday"] = pd.to_datetime(X["purchase_date"]).dt.weekday # convert purchase date to purchase weekday
        self.majority_weekday = new_df["Weekday"].mode()[0] # obtain majority purchase weekday
        new_df["Weekday"].fillna(self.majority_weekday,inplace=True) # fill in missing purchase weekday with majority purchase weekday
        new_df["Ingredient Number"] = X["ingredient"].map(self.get_numbers) # obtain number of ingredients in recipe
        self.mean_ingredient_number = new_df["Ingredient Number"].mean() # obtain mean ingredient number
        new_df['Ingredient Number'].fillna(self.mean_ingredient_number,inplace=True) # fill in missing ingredient number with median ingredient number
        self.pl_le = LabelEncoder() # create label-encoder
        new_df["Product Level"] = pd.Series(self.pl_le.fit_transform(X["product_level"])) # fit and transform product level with label-encoder
        self.majority_product_level = new_df["Product Level"].mode()[0] # obtain majority product level code
        new_df["Product Level"].fillna(self.majority_product_level,inplace=True) # fill in missing product level with majority product level code
        self.pt_le = LabelEncoder() # create label-encoder
        new_df["Cost"] = X["cost"].str.strip("$").str.strip("k").astype(float)*1000 # convert cost to numerical value
        self.cost_imputer = XGBRegressor() # create a XGBoost imputer for cost
        df_for_imputing_cost = new_df.dropna() # create training data for cost imputer by dropping missing data
        self.cost_imputer.fit(df_for_imputing_cost[["Weight","Month","Weekday","Ingredient Number","Product Level"]], df_for_imputing_cost["Cost"]) # fit cost imputer
        
    def transform(self, X, y=None):
        new_df = pd.DataFrame()
        new_df["Weight"] = X["weight"].map(self.weight2num) # convert weight to numerical value
        new_df["Weight"].fillna(self.mean_weight,inplace=True) # fill in missing weight with mean weight
        new_df["Month"] = pd.to_datetime(X["purchase_date"]).dt.month # convert purchase date to purchase month
        new_df["Month"].fillna(self.majority_month,inplace=True) # fill in missing purchase month with majority purchase month
        new_df["Weekday"] = pd.to_datetime(X["purchase_date"]).dt.weekday # convert purchase date to purchase weekday
        new_df["Weekday"].fillna(self.majority_weekday,inplace=True) # fill in missing purchase weekday with majority purchase weekday
        new_df["Ingredient Number"] = X["ingredient"].map(self.get_numbers) # obtain number of ingredients in recipe
        new_df["Ingredient Number"].fillna(self.mean_ingredient_number,inplace=True) # fill in missing ingredient number with mean ingredient number
        new_df["Product Level"] = self.pl_le.transform(X["product_level"]) # transform product level with label-encoder
        new_df["Product Level"].fillna(self.majority_product_level,inplace=True) # fill in missing product level with majority product level code
        new_df["Cost"] = X["cost"].str.strip("$").str.strip("k").astype(float)*1000 # convert cost to numerical value
        imputed_cost = pd.Series(self.cost_imputer.predict(new_df[new_df["Cost"].isnull()][["Weight","Month","Weekday","Ingredient Number","Product Level"]])) # obtain imputed cost
        imputed_cost.index = new_df[new_df["Cost"].isnull()]["Cost"].index # set index of imputed cost
        new_df["Cost"].fillna(imputed_cost,inplace=True) # fill in missing cost with imputed cost
        return new_df[["Cost","Weight","Ingredient Number"]] # return new_df
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)
    
    def weight2num(self, x): # function to convert weight to number
        if type(x) == str:
            x = x.strip('Kg').split(' Ton ')
            return float(x[0])*1000+float(x[1])
        else:
            return x
        
    def get_numbers(self, x): # function to get number of ingredients in recipe
        if type(x) == str:
            return len(x.split(','))
        else:
            return x

## 1. Linear Regression

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

In [7]:
steps = [('dtf',Data_Transformer()),
         ('poly', PolynomialFeatures(degree=10)),
         ('scaler', MinMaxScaler()),
         ('lr', LinearRegression())]
lr_model = Pipeline(steps)

In [8]:
lr_model.fit(X_train, y_train)

In [9]:
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [11]:
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

train MAE: 9.52e+04
train MSE: 2.04e+10
train R2: 0.597


In [12]:
print('test MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('test MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('test R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

test MAE: 2.12e+08
test MSE: 2.69e+19
test R2: -663483051.652


## 2. L1 Regularization (Lasso Regression)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.metrics import make_scorer, r2_score

In [14]:
steps = [('dtf', Data_Transformer()),
         ('poly', PolynomialFeatures()),
         ('scaler', MinMaxScaler()),
         ('lasso', Lasso(max_iter=1000000))]
lasso_model = Pipeline(steps)

In [15]:
parameters = {'poly__degree': [2, 3, 5, 10], 
              'lasso__alpha': [1, 2, 3, 5, 10, 20, 50, 100]}
scorer = make_scorer(r2_score, greater_is_better=True)

In [16]:
model_gsv = GridSearchCV(lasso_model, parameters, cv=5, scoring=scorer)
model_gsv = model_gsv.fit(X_train, y_train)

In [17]:
model_gsv.best_params_

{'lasso__alpha': 100, 'poly__degree': 2}

In [18]:
result = model_gsv.cv_results_
result = pd.DataFrame(result)[['param_poly__degree', 'param_lasso__alpha', 'mean_test_score', 'rank_test_score']]
result

Unnamed: 0,param_poly__degree,param_lasso__alpha,mean_test_score,rank_test_score
0,2,1,0.398361,9
1,3,1,0.386809,16
2,5,1,0.102432,24
3,10,1,-15.32354,32
4,2,2,0.398595,8
5,3,2,0.387182,15
6,5,2,0.202411,23
7,10,2,-6.437025,31
8,2,3,0.398832,7
9,3,3,0.387328,14


In [19]:
y_train_pred = model_gsv.predict(X_train)
y_test_pred = model_gsv.predict(X_test)

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [21]:
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

train MAE: 1.08e+05
train MSE: 2.90e+10
train R2: 0.428


In [22]:
print('test MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('test MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('test R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

test MAE: 1.09e+05
test MSE: 2.50e+10
test R2: 0.382


## 3. L2 Regularization (Ridge Regression)

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.metrics import make_scorer, r2_score

In [24]:
steps = [('dtf', Data_Transformer()),
         ('poly', PolynomialFeatures(degree=10)),
         ('scaler', MinMaxScaler()),
         ('ridge', Ridge(alpha=5))]
ridge_model = Pipeline(steps)

In [25]:
parameters = {'poly__degree': [2, 3, 5, 10], 
              'ridge__alpha': [1, 2, 3, 5, 10, 20, 50, 100]}
scorer = make_scorer(r2_score, greater_is_better=True)

In [26]:
model_gsv = GridSearchCV(ridge_model, parameters, cv=5, scoring=scorer)
model_gsv = model_gsv.fit(X_train, y_train)

In [27]:
model_gsv.best_params_

{'poly__degree': 3, 'ridge__alpha': 5}

In [28]:
result = model_gsv.cv_results_
result = pd.DataFrame(result)[['param_poly__degree', 'param_ridge__alpha', 'mean_test_score', 'rank_test_score']]
result

Unnamed: 0,param_poly__degree,param_ridge__alpha,mean_test_score,rank_test_score
0,2,1,0.403607,3
1,2,2,0.401548,7
2,2,3,0.399708,10
3,2,5,0.39699,13
4,2,10,0.393028,16
5,2,20,0.387426,20
6,2,50,0.366531,23
7,2,100,0.326183,27
8,3,1,0.402661,6
9,3,2,0.403446,4


In [29]:
y_train_pred = model_gsv.predict(X_train)
y_test_pred = model_gsv.predict(X_test)

In [30]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [31]:
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

train MAE: 1.08e+05
train MSE: 2.90e+10
train R2: 0.427


In [32]:
print('test MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('test MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('test R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

test MAE: 1.09e+05
test MSE: 2.51e+10
test R2: 0.380
