In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

Read data and check if there is any NaN value exists.

In [2]:
train_df = pd.read_csv("train.csv")
train_df = train_df.drop(train_df.columns[[0]], axis=1)
print("NaN value exists?", train_df.isnull().values.any())
train_df.head()

NaN value exists? False


Unnamed: 0,Weight,Age,Days,Vegan,Cats,Cash,Size,Toppings
0,106.238809,36.596211,38,0,1,5.699125,No order,No order
1,184.378192,28.739952,28,0,0,1.171537,No order,No order
2,232.475732,106.605562,38,1,1,259.440103,Large,Hawaiian
3,112.811584,103.684648,112,0,0,13.886261,No order,No order
4,139.31781,15.045878,78,0,0,1934.054928,Medium,Pepperoni


In [3]:
test_df = pd.read_csv("test.csv")
test_df = test_df.drop(test_df.columns[[0]], axis=1)
print("NaN value exists?", test_df.isnull().values.any())
test_df.head()

NaN value exists? False


Unnamed: 0,Weight,Age,Days,Vegan,Cats,Cash
0,215.241281,45.123194,19,0,0,1955.03428
1,251.301889,17.856168,38,0,0,2532.312093
2,189.421541,105.951771,3,0,0,241.320502
3,75.0,37.001579,7,0,0,292.279276
4,156.416838,92.159389,63,0,2,325.376085


Next, some variables are categorical, they should be encoded into labels, but later on when we output results with models, we should use the same encoder for decoding label to original value, so I create a dictionary here to store the two encoder that were being used.

In [4]:
from sklearn import preprocessing
def encode_labels(labels):
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    return le, le.transform(labels)

In [5]:
predited_label_dict = {}

for col in train_df.columns:
    if train_df[col].dtype != np.float64 and train_df[col].dtype != np.int64:
        print(col)
        predited_label_dict[col], train_df[col] = encode_labels(train_df[col])
train_df.head()

Size
Toppings


Unnamed: 0,Weight,Age,Days,Vegan,Cats,Cash,Size,Toppings
0,106.238809,36.596211,38,0,1,5.699125,3,4
1,184.378192,28.739952,28,0,0,1.171537,3,4
2,232.475732,106.605562,38,1,1,259.440103,1,2
3,112.811584,103.684648,112,0,0,13.886261,3,4
4,139.31781,15.045878,78,0,0,1934.054928,2,5


In [6]:
print(predited_label_dict)

{'Size': LabelEncoder(), 'Toppings': LabelEncoder()}


Spliting x and y data from dataset.

In [7]:
x = train_df.drop(["Size", "Toppings"], axis=1)
x.head()

Unnamed: 0,Weight,Age,Days,Vegan,Cats,Cash
0,106.238809,36.596211,38,0,1,5.699125
1,184.378192,28.739952,28,0,0,1.171537
2,232.475732,106.605562,38,1,1,259.440103
3,112.811584,103.684648,112,0,0,13.886261
4,139.31781,15.045878,78,0,0,1934.054928


In [8]:
y_size = train_df["Size"]
y_size.head()

0    3
1    3
2    1
3    3
4    2
Name: Size, dtype: int64

In [9]:
y_toppings = train_df["Toppings"]
y_toppings.head()

0    4
1    4
2    2
3    4
4    5
Name: Toppings, dtype: int64

Based on the life experience and variables we have in dataset, I choosed KNN for the first attempt on model selecetion. Vegan people have higher probability on choosing same falvor of pizza then other group, also, people in similiar weight and age are more likely to get same size of pizza. Therefore, the distance calculation part in KNN algorithm might be helpful.

In [10]:
from sklearn.model_selection import cross_validate, cross_val_score
scoring_list = ["accuracy", "f1_micro"]
y_list = ["Size", "Toppings"]

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()
x_scaled = scalar.fit(x).transform(x)

In [12]:
model = KNeighborsClassifier(n_neighbors=3)
for y in y_list:
    clf = cross_validate(model, x_scaled, train_df[y], cv=10, verbose=1, return_train_score=False, scoring=scoring_list)
    print("KNN")
    print(y, "accuracy:", clf["test_accuracy"].mean())
    print(pd.DataFrame(clf))

KNN
Size accuracy: 0.21366354234
   fit_time  score_time  test_accuracy  test_f1_micro
0  0.002117    0.013327       0.192308       0.192308
1  0.000825    0.002276       0.294118       0.294118
2  0.000784    0.011650       0.215686       0.215686
3  0.001271    0.002656       0.235294       0.235294
4  0.004677    0.008432       0.294118       0.294118
5  0.000806    0.002606       0.137255       0.137255
6  0.003988    0.007814       0.142857       0.142857
7  0.000731    0.003050       0.125000       0.125000
8  0.003860    0.007133       0.312500       0.312500
9  0.000691    0.002638       0.187500       0.187500
KNN
Toppings accuracy: 0.247516813012
   fit_time  score_time  test_accuracy  test_f1_micro
0  0.001991    0.016852       0.283019       0.283019
1  0.002980    0.010729       0.264151       0.264151
2  0.001253    0.003889       0.211538       0.211538
3  0.001162    0.012273       0.307692       0.307692
4  0.000774    0.003072       0.200000       0.200000
5  0.006434

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


But turns out the accuracy is pretty bad, only roughly around 20%. Might because the distance calculation in KNN is not relative.

So I then run Random Forest Classifier since it generally has good performance and it handles all sort of data well. Also other boosters.

In [13]:
model = RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123)
for y in y_list:
    clf = cross_validate(model, x, train_df[y], cv=10, verbose=1, return_train_score=False, scoring=scoring_list)
    print("Random Forest")
    print(y, "accuracy:", clf["test_accuracy"].mean())
    print(pd.DataFrame(clf))

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


Random Forest
Size accuracy: 0.381634192138
   fit_time  score_time  test_accuracy  test_f1_micro
0  0.049322    0.014155       0.346154       0.346154
1  0.068980    0.011780       0.372549       0.372549
2  0.033516    0.003097       0.470588       0.470588
3  0.033715    0.013470       0.411765       0.411765
4  0.050110    0.003215       0.392157       0.392157
5  0.031503    0.009561       0.333333       0.333333
6  0.037161    0.011985       0.489796       0.489796
7  0.033646    0.003793       0.333333       0.333333
8  0.045984    0.003374       0.250000       0.250000
9  0.034003    0.012103       0.416667       0.416667
Random Forest
Toppings accuracy: 0.431568731263
   fit_time  score_time  test_accuracy  test_f1_micro
0  0.033473    0.003326       0.396226       0.396226
1  0.035335    0.003390       0.471698       0.471698
2  0.032436    0.004997       0.442308       0.442308
3  0.030010    0.009227       0.461538       0.461538
4  0.047758    0.003356       0.380000      

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [14]:
model = AdaBoostClassifier(n_estimators=10, random_state=123)
for y in y_list:
    clf = cross_validate(model, x, train_df[y], cv=10, verbose=1, return_train_score=False, scoring=scoring_list)
    print("ADA")
    print(y, "accuracy:", clf["test_accuracy"].mean())
    print(pd.DataFrame(clf))

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


ADA
Size accuracy: 0.323542109151
   fit_time  score_time  test_accuracy  test_f1_micro
0  0.037252    0.012965       0.230769       0.230769
1  0.034612    0.010891       0.431373       0.431373
2  0.049607    0.002861       0.294118       0.294118
3  0.035898    0.012891       0.313725       0.313725
4  0.039284    0.009830       0.392157       0.392157
5  0.046705    0.004313       0.392157       0.392157
6  0.046973    0.003245       0.306122       0.306122
7  0.037535    0.011599       0.333333       0.333333
8  0.045620    0.003781       0.333333       0.333333
9  0.052533    0.015989       0.208333       0.208333
ADA
Toppings accuracy: 0.288856320168
   fit_time  score_time  test_accuracy  test_f1_micro
0  0.043216    0.012161       0.264151       0.264151
1  0.036232    0.005514       0.415094       0.415094
2  0.036185    0.003237       0.403846       0.403846
3  0.046720    0.015659       0.346154       0.346154
4  0.035758    0.010332       0.360000       0.360000
5  0.03605

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


In [15]:
model = XGBClassifier(n_estimators=10, max_depth=3, random_state=123)
for y in y_list:
    clf = cross_validate(model, x, train_df[y], cv=10, verbose=1, return_train_score=False, scoring=scoring_list)
    print("XGB")
    print(y, "accuracy:", clf["test_accuracy"].mean())
    print(pd.DataFrame(clf))

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


XGB
Size accuracy: 0.393691322683
   fit_time  score_time  test_accuracy  test_f1_micro
0  0.028290    0.004570       0.365385       0.365385
1  0.028412    0.004481       0.431373       0.431373
2  0.035500    0.014158       0.431373       0.431373
3  0.031892    0.004012       0.411765       0.411765
4  0.026374    0.008534       0.411765       0.411765
5  0.022508    0.011187       0.352941       0.352941
6  0.025474    0.007040       0.448980       0.448980
7  0.030965    0.002404       0.354167       0.354167
8  0.032021    0.002921       0.375000       0.375000
9  0.038520    0.005889       0.354167       0.354167
XGB
Toppings accuracy: 0.471077478924
   fit_time  score_time  test_accuracy  test_f1_micro
0  0.031993    0.002935       0.471698       0.471698
1  0.035881    0.011450       0.490566       0.490566
2  0.032510    0.013917       0.557692       0.557692
3  0.052642    0.028706       0.442308       0.442308
4  0.049877    0.006565       0.440000       0.440000
5  0.04004

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


The results are not obviously pleasant, but we can see that Random Forest and XGB worked a bit better compare to ADA.

Do grid search for entering different input, trees of 10, 15, and 20, max depth on 4, 6, 8 layers.

In [16]:
param_grid = { 
    'n_estimators': [10, 15, 20],
    'max_depth': [4, 6, 8],
    'random_state': [123]
#     'max_features': ['auto', 'sqrt', 'log2']
}

grid_size = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=10, verbose=1 ,scoring=scoring_list, refit="accuracy")
grid_size.fit(x, y_size)
grid_toppings = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=10, verbose=1, scoring=scoring_list, refit="accuracy")
grid_toppings.fit(x, y_toppings)
print("=============================================")
print("Best params (size):", grid_size.best_params_)
print("Best score (size):", grid_size.best_score_)

print("Best params (toppings):", grid_toppings.best_params_)
print("Best score (toppings):", grid_toppings.best_score_)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    7.6s finished


Fitting 10 folds for each of 9 candidates, totalling 90 fits
Best params (size): {'max_depth': 6, 'n_estimators': 15, 'random_state': 123}
Best score (size): 0.412
Best params (toppings): {'max_depth': 8, 'n_estimators': 15, 'random_state': 123}
Best score (toppings): 0.456


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    6.9s finished


From the result above we can see that the best model among these inputs is max depth with 6, number of trees with 15 for predicting size and max depth with 8, number of trees with 15 for predicting toppings. Predictions are around 41% and 45%.

In [17]:
clf_size = RandomForestClassifier(n_estimators=15, max_depth=6)
clf_size.fit(x, y_size)
hyp_size = clf_size.predict(test_df)
hyp_size = predited_label_dict["Size"].inverse_transform(hyp_size)

clf_toppings = RandomForestClassifier(n_estimators=15, max_depth=8)
clf_toppings.fit(x, y_toppings)
hyp_toppings = clf_toppings.predict(test_df)
hyp_toppings = predited_label_dict["Toppings"].inverse_transform(hyp_toppings)

In [18]:
result_df = pd.DataFrame({'Size':hyp_size, 'Toppings':hyp_toppings})
result_df.head()

Unnamed: 0,Size,Toppings
0,Large,Supreme
1,X-Large,Meat-lovers
2,X-Large,Cheese
3,Small,Supreme
4,Large,Cheese


In [20]:
result_df.to_csv(r'pred2.txt', header=None, sep=',', mode='a')

In [22]:
train_df.shape

(500, 8)

In [23]:
train_df.describe()

Unnamed: 0,Weight,Age,Days,Vegan,Cats,Cash,Size,Toppings
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,149.848538,60.290461,43.54,0.042,0.396,3797.89463,2.434,2.9
std,45.907961,28.123431,32.881527,0.20079,0.695811,22126.256472,1.551276,1.983398
min,75.0,13.008069,0.0,0.0,0.0,0.273849,0.0,0.0
25%,114.947429,35.549435,19.0,0.0,0.0,56.529103,1.0,1.0
50%,148.74212,60.083296,38.0,0.0,0.0,253.057224,2.0,2.0
75%,181.703051,84.853689,63.0,0.0,1.0,1639.666602,4.0,4.0
max,299.34874,109.993134,199.0,1.0,5.0,425434.282276,5.0,6.0


The reason that prediction accuracy is not significantly impressive might be the dataset is too small that it does not have enough data for the model to learn and predict, another possible cause can be that when it comes to predicting size and topping, there might be more factors (predictor variables) in real world, which our dataset does not contain. In conclusion, we can either gather more data or consider more features for predicting.