# VeggiePrice wholesale price prediction: imports the cleaned data set, featurizes it, runs TF-IDF on the product names, then cross-validates a Random Forest regressor

In [27]:
import pandas as pd
import glob
import numpy as np
import re
import csv
import random
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import statsmodels.api as sm
import statsmodels.formula.api as fsm
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestRegressor
from sklearn import (cross_validation, cluster, datasets, decomposition, ensemble, preprocessing, metrics)

#Import Cleaned Data

In [10]:
# Read the cleaned restaurant data and discard the 'Unnamed: 0' column
# (presumably the row index written out by an earlier to_csv -- confirm).
input_data = pd.read_csv('../FarmFreshToChef/Restaurants/cleaned_res_data2.csv')
input_data = input_data.drop(['Unnamed: 0'], axis=1)
# Lower-case every column name; the cells below refer to columns in lower case.
input_data.columns = input_data.columns.str.lower()

In [11]:
# Cast the date parts to float before they are combined into 'datefac'.
date_parts = ['year', 'month']
input_data[date_parts] = input_data[date_parts].astype(float)

In [12]:
# Continuous date variable: whole years past year 12 (presumably a two-digit
# year, i.e. 2012 -- confirm) plus the fraction of the year elapsed at the
# start of the month (January -> 0, December -> 11/12).
years_elapsed = input_data['year'] - 12
year_fraction = (input_data['month'] - 1) / 12
input_data['datefac'] = years_elapsed + year_fraction

In [13]:
# Drop the columns that are not used downstream and keep only the rows that
# have a price_per_unit (the regression target); renumber the rows from 0.
priced_rows = input_data['price_per_unit'].notnull()
unused_columns = ['date', 'ct', 'file_name', 'product_list', 'month', 'year']
data = input_data.drop(unused_columns, axis=1)[priced_rows].reset_index(drop=True)
# Constant column that acts as the intercept term in the OLS fits below.
data['intercept'] = 1.0

In [20]:
# One-hot encode the 'unit' and 'cat' columns.
data_dummy = pd.get_dummies(data, columns=[u'unit', 'cat'])
# get_dummies replaces the encoded columns, so put the raw 'cat' labels back
# as the last column (data and data_dummy share the same row index).
data_dummy['cat'] = data['cat']

In [21]:
# Quantity * unit interaction columns: for each unit dummy, add a
# 'quantity_unit_<unit>' column holding quantity times that dummy.
for unit_suffix in ('bu', 'bulbs', 'ct', 'flat', 'lb', 'leaves'):
    factor_columns = [u'quantity', 'unit_' + unit_suffix]
    data_dummy[u'quantity_unit_' + unit_suffix] = data_dummy[factor_columns].product(axis=1)

In [22]:
# Baseline OLS of price_per_unit on the dummy-encoded features.
ols_data = data_dummy.copy()
# pop() removes the response from ols_data so it cannot end up among the regressors.
y = ols_data.pop('price_per_unit')
# Left out of the design matrix: the raw 'product', 'cat' and 'farm' columns,
# 'price' (NOTE(review): presumably the order total, which would leak the
# target -- confirm), and cat_bean / unit_bu / quantity_unit_bu (presumably the
# reference levels for the dummies -- confirm).
excluded_columns = ['product', 'cat_bean', 'cat', 'farm', 'unit_bu', 'quantity_unit_bu', 'price']
mdl_data = ols_data.drop(excluded_columns, axis=1)
mdl = sm.OLS(y, mdl_data)
results = mdl.fit()
# Last expression of the cell: the notebook renders the summary table.
results.summary()

0,1,2,3
Dep. Variable:,price_per_unit,R-squared:,0.608
Model:,OLS,Adj. R-squared:,0.608
Method:,Least Squares,F-statistic:,1229.0
Date:,"Sun, 23 Aug 2015",Prob (F-statistic):,0.0
Time:,14:56:02,Log-Likelihood:,-58436.0
No. Observations:,19821,AIC:,116900.0
Df Residuals:,19795,BIC:,117100.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
quantity,1.8696,0.098,19.111,0.000,1.678 2.061
datefac,-0.0120,0.060,-0.200,0.842,-0.130 0.106
intercept,-26.5317,1.666,-15.926,0.000,-29.797 -23.266
unit_bulbs,0.0042,0.016,0.265,0.791,-0.027 0.035
unit_ct,21.8376,1.178,18.533,0.000,19.528 24.147
unit_flat,26.3923,0.584,45.199,0.000,25.248 27.537
unit_lb,32.4527,1.166,27.825,0.000,30.167 34.739
unit_leaves,-0.0814,0.004,-18.211,0.000,-0.090 -0.073
cat_bouquet,7.8413,2.027,3.869,0.000,3.869 11.814

0,1,2,3
Omnibus:,15201.723,Durbin-Watson:,0.279
Prob(Omnibus):,0.0,Jarque-Bera (JB):,836668.259
Skew:,3.205,Prob(JB):,0.0
Kurtosis:,34.177,Cond. No.,1.13e+16


#Randomized PCA

In [23]:
# Bag-of-words counts over the product names, capped at the 30 most frequent terms.
# Author's tuning note: best adjusted R^2 (~.722) was seen with 30 RandomizedPCA
# components and max_features set to 50.
vec_machine = CountVectorizer(max_features=30)
word_matrix = vec_machine.fit_transform(data_dummy['product'])
# Densify the sparse matrix and label each column with its vocabulary term.
words = pd.DataFrame(word_matrix.todense().astype(float),
                     columns=vec_machine.get_feature_names())
print(data_dummy.shape)
print(words.shape)

(19821, 37)
(19821, 30)


In [25]:
# TF-IDF weights over the product names, capped at 15 terms; this replaces the
# CountVectorizer version of `words` built in the previous cell.
# Author's tuning notes:
#   - OLS: best adjusted R^2 ~.73 with 30 RandomizedPCA components and
#     max_features=50; ~.735 with max_features=60 and 40 components.
#   - Random Forest: R^2 ~.96 with max_features=15 and n_components=5.
vec_machine = TfidfVectorizer(max_features=15)
word_matrix = vec_machine.fit_transform(data_dummy['product'])
# Densify the sparse matrix and label each column with its vocabulary term.
words = pd.DataFrame(word_matrix.todense().astype(float),
                     columns=vec_machine.get_feature_names())
print(data_dummy.shape)
print(words.shape)

(19821, 37)
(19821, 15)


In [28]:
# Randomized PCA that keeps 5 components of the vectorized product words.
rpca = decomposition.RandomizedPCA(n_components=5)
# NOTE(review): this fit on the unscaled `words` is superseded by the
# fit_transform on the scaled matrix in the next cell, which refits rpca.
# As the cell's last expression it is what echoes the estimator repr below.
rpca.fit(words)

RandomizedPCA(copy=True, iterated_power=3, n_components=5, random_state=None,
       whiten=False)

In [29]:
multi_regress = words.copy()

# Standardize each word column, then refit rpca on the scaled matrix and
# project it onto the principal components.
scaled_words = scale(multi_regress.values)
X_pca = rpca.fit_transform(scaled_words)

In [35]:
# OLS on the PCA-augmented frame.
# NOTE: rf_data is assembled in the "K-Fold Cross Validation" section further
# down this file; that cell has to be run before this one.
ols_data = rf_data.copy()
y = ols_data['price_per_unit']
# The response stays in ols_data (it is needed for the error columns computed
# in the next cell), so it is dropped from the design matrix explicitly.
excluded_columns = ['price_per_unit', 'product', 'cat_bean', 'cat', 'farm',
                    'unit_bu', 'quantity_unit_bu', 'price']
mdl_data = ols_data.drop(excluded_columns, axis=1)
mdl = sm.OLS(y, mdl_data)
results = mdl.fit()
results.summary()
# In-sample fitted values for every row.
ols_data['predictions'] = results.predict()

In [37]:
# Error columns for the OLS fit, each rounded to 2 decimals.
actual_unit_price = ols_data['price_per_unit']
predicted_unit_price = ols_data['predictions']
# Absolute per-unit error.
ols_data['abs_error'] = np.round(np.abs(actual_unit_price - predicted_unit_price), decimals=2)
# Predicted line total (quantity * predicted unit price) and its gap to 'price'.
ols_data['total_price_pred'] = np.round(ols_data['quantity'] * predicted_unit_price, decimals=2)
ols_data['total_price_diff'] = np.round(np.abs(ols_data['price'] - ols_data['total_price_pred']), decimals=2)
# Absolute error relative to the actual unit price (a fraction, not a percentage).
ols_data['perc_error'] = np.round(ols_data['abs_error'] / actual_unit_price, decimals=2)
np.mean(ols_data['perc_error'])

1.1220543867615158

# K-Fold Cross Validation

In [38]:
# PCA components as a frame; its columns are the integer labels 0..n_components-1.
features = pd.DataFrame(X_pca)

# Modelling frame: dummy-encoded data with the PCA components appended
# column-wise, minus the date and 'bulbs' columns, with rows renumbered from 0.
rf_data = pd.concat([data_dummy, features], axis=1)
rf_data = rf_data.drop(['datefac', 'unit_bulbs', 'quantity_unit_bulbs'], axis=1)
rf_data = rf_data.reset_index(drop=True)

In [39]:
# Distinct product names, in order of first appearance; the cross-validation
# folds below are drawn over these names rather than over individual rows.
products = pd.unique(rf_data['product'])
len(products)

640

In [45]:
# 5-fold cross-validation of a random forest on price_per_unit.
# Folds are drawn over the unique product names, not over rows, so every row of
# a product lands in the same fold and each test fold only contains products
# the model was not trained on.
# NOTE: neither KFold nor the forest is seeded, so scores vary from run to run.
kf = cross_validation.KFold(len(products), n_folds=5, shuffle=True)
results = []  # score() (R^2) on the held-out products of each fold
# Not filled in by this cell; kept because other cells may still refer to them.
dollar_errors = []
perc_errors = []
med_error = []
data_pred = rf_data.copy()
# NaN marks "no out-of-fold prediction yet". The previous integer sentinel -1
# forced a dtype change when float predictions were written and would have been
# averaged into the error metrics as a real price if a row were ever skipped.
data_pred['predictions'] = np.nan
# Target, raw text/label columns and 'price' are excluded from the features.
vars_to_drop = ['price_per_unit', 'product', 'price', 'cat_bean', 'farm', 'cat']
for train_index, test_index in kf:
    # kf yields positions into `products`; map them to the rows of rf_data.
    train_products = products[train_index]
    train_data = rf_data[rf_data['product'].isin(train_products)]
    test_products = products[test_index]
    test_data = rf_data[rf_data['product'].isin(test_products)]
    x_train = train_data.drop(vars_to_drop, axis=1)
    y_train = train_data['price_per_unit']
    x_test = test_data.drop(vars_to_drop, axis=1)
    y_test = test_data['price_per_unit']
    regr = RandomForestRegressor(n_estimators=5)
    regr.fit(x_train, y_train)
    predictions = regr.predict(x_test)
    # test_data.index holds row labels, so write by label and column name.
    # The positional .iloc[..., -1] used before was only right while labels
    # equalled positions and 'predictions' stayed the last column.
    data_pred.loc[test_data.index, 'predictions'] = np.round(predictions, decimals=2)
    results.append(regr.score(x_test, y_test))
# Single-argument print: same text under the Python 2 print statement and as a
# Python 3 function call.
print("average score: %s" % np.mean(results))

average score: 0.960061359081


In [46]:
results

[0.95990771369723971,
 0.92228577410341805,
 0.96570893652808287,
 0.97764755499773492,
 0.97475681608036768]

In [50]:
# Error columns for the cross-validated predictions, each rounded to 2 decimals.
true_unit_price = data_pred['price_per_unit']
cv_unit_price = data_pred['predictions']
# Absolute per-unit error.
data_pred['abs_error'] = np.round(np.abs(true_unit_price - cv_unit_price), decimals=2)
# Predicted line total (quantity * predicted unit price) and its gap to 'price'.
data_pred['total_price_pred'] = np.round(data_pred['quantity'] * cv_unit_price, decimals=2)
data_pred['total_price_diff'] = np.round(np.abs(data_pred['price'] - data_pred['total_price_pred']), decimals=2)
# Absolute error relative to the actual unit price (a fraction, not a percentage).
data_pred['perc_error'] = np.round(data_pred['abs_error'] / true_unit_price, decimals=2)
np.mean(data_pred['total_price_diff'])

2.354882195651077

In [58]:
# Mean of perc_error (|error| / actual unit price) over all rows, as a fraction.
# Under the Python 2 print statement the parenthesised pair is printed as a
# tuple, which is why the recorded output below shows a tuple repr.
print('Mean predicted price error',format(np.mean(data_pred.perc_error)))

('Mean predicted price error', '0.133684476061')
