# IMPORT AND PREPARE DATA

In [1]:
# Import initial dependencies
import pandas as pd
import numpy as np

In [2]:
# Convert CSVs to DFs
features = "../output/UPDATED_reduced_wine_features.csv"
targets = "../output/reduced_wine_targets.csv"

features_df_pre = pd.read_csv(features)
targets_df =pd.read_csv(targets) 

In [3]:
features_df = features_df_pre[['white', 'flavor_categories_taste_notes_light, fruity', 'price', 'country_Austria']]

In [4]:
# Check feature DF head
features_df.head()

Unnamed: 0,white,"flavor_categories_taste_notes_light, fruity",price,country_Austria
0,0.0,1,15.0,0
1,0.0,0,65.0,0
2,0.0,1,19.0,0
3,0.0,0,34.0,0
4,0.0,0,30.0,0


In [5]:
# Check feature DF shape
features_df.shape

(109662, 4)

In [6]:
# Check targets DF
targets_df.head()

Unnamed: 0,points
0,87
1,87
2,87
3,87
4,87


In [7]:
# Check Dtypes
targets_df.dtypes

points    int64
dtype: object

In [8]:
# Check feature and target DF shape
print(features_df.shape, targets_df.shape)

(109662, 4) (109662, 1)


In [9]:
# Assign the features and target to X and y abd check assignments
raw_feature_data = features_df.values
raw_target_data = targets_df.values
X = raw_feature_data[:, 0:5]
y = raw_target_data.reshape(-1,1)

print(X, y)

[[ 0.  1. 15.  0.]
 [ 0.  0. 65.  0.]
 [ 0.  1. 19.  0.]
 ...
 [ 0.  0. 70.  0.]
 [ 0.  0. 25.  0.]
 [ 0.  0. 38.  0.]] [[87]
 [87]
 [87]
 ...
 [90]
 [90]
 [91]]


# SPLIT, NORMALIZE, AND ENCODE THE DATA

In [10]:
# Create the train and test sets for the features and target
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
# Scale the features using the MinMax scaler since we know their values
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_scaler = MinMaxScaler().fit(y_train)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

# HYPERPERAMETER TUNING

In [12]:
# Import the sequential and dense modules fo build my NN
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.constraints import maxnorm

Using TensorFlow backend.


# Run XGB Regressor with scalled data and recommended parameters from the previous test

In [13]:
print('Select Model...')
from xgboost import XGBRegressor
from datetime import datetime
import pprint
import os

# Create start time and model for XGB Regressor
start_time  = datetime.now()
xgb_reg = XGBRegressor() 


parameters_reg = {'n_estimators': [850], 'max_depth':[5, 7], \
              'learning_rate':[.01], 'batch_size':[10, 16]}

grid_search = GridSearchCV(estimator=xgb_reg, param_grid=parameters_reg, cv=10, n_jobs=-1)


print("parameters:")
pprint.pprint(parameters_reg)


grid = grid_search.fit(X_train_scaled, y_train_scaled.ravel())


print("Best score: %0.3f" % grid.best_score_)
print("Best parameters set:")
best_parameters=grid.best_estimator_.get_params()


for param_name in sorted(best_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.now()
print(f'Select Done..., Time Cost: {((end_time - start_time).seconds)}.') 

Select Model...
parameters:
{'batch_size': [10, 16],
 'learning_rate': [0.01],
 'max_depth': [5, 7],
 'n_estimators': [850]}
Best score: 0.401
Best parameters set:
	base_score: 0.5
	batch_size: 10
	booster: 'gbtree'
	colsample_bylevel: 1
	colsample_bynode: 1
	colsample_bytree: 1
	gamma: 0
	importance_type: 'gain'
	learning_rate: 0.01
	max_delta_step: 0
	max_depth: 5
	min_child_weight: 1
	missing: None
	n_estimators: 850
	n_jobs: 1
	nthread: None
	objective: 'reg:linear'
	random_state: 0
	reg_alpha: 0
	reg_lambda: 1
	scale_pos_weight: 1
	seed: None
	silent: None
	subsample: 1
	verbosity: 1
Select Done..., Time Cost: 253.


In [14]:
from sklearn.metrics import mean_squared_error, r2_score

# Use our model to predict a value
predicted = grid.predict(X_test_scaled)

# Score the prediction with mse and r2
mse = mean_squared_error(y_test_scaled, predicted)
r2 = r2_score(y_test_scaled, predicted)

# Print scores
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 0.01410573966214706
R-squared (R2): 0.3979035733416677


In [15]:
# Create list of predictions and check
predicted_list = grid.predict(X_test_scaled)

predicted_list

array([0.43168384, 0.29862463, 0.365847  , ..., 0.40303826, 0.5965271 ,
       0.3683988 ], dtype=float32)

In [16]:
# Create list of actual values and check
actual_list = y_test_scaled

actual_list

array([[0.55],
       [0.45],
       [0.45],
       ...,
       [0.55],
       [0.8 ],
       [0.5 ]])

# PRINT PREDICTIONS FOR SCALED DATA TO COMPARE TO ACTUAL RATINGS

In [17]:
# Create DF for unscaled test to further compare predictions with actual points
prediction_comparrisons_df = pd.DataFrame({'XGB_Predictions (unscaled)' : predicted_list,
                                           'Actual_Points (unscaled)' : actual_list.ravel()
                                          })

prediction_comparrisons_df.head(40)

Unnamed: 0,XGB_Predictions (unscaled),Actual_Points (unscaled)
0,0.431684,0.55
1,0.298625,0.45
2,0.365847,0.45
3,0.410079,0.4
4,0.563481,0.4
5,0.383288,0.65
6,0.481746,0.4
7,0.543273,0.6
8,0.391415,0.25
9,0.449901,0.35


In [18]:
# Export unscaled results
prediction_comparrisons_df.to_csv(r'C:\Users\howar\Desktop\Data Science Boot Camp\Group_Project_3\prediction_modeling\output\UPDATED_predictions_vs_actual.csv', index=False)

In [19]:
# Save the tuned model for future use
import joblib
filename = 'saved_models\XGB_Regressor_unscaled_(BEST_SCALED_REDUCED_FEATURES).sav'
joblib.dump(grid, filename)

['saved_models\\XGB_Regressor_unscaled_(BEST_SCALED_REDUCED_FEATURES).sav']

# Run XGB Regressor with unscalled data and recommended parameters from the previous test

In [20]:
print('Select Model...')
from xgboost import XGBRegressor
from datetime import datetime
import pprint
import os


xgb_reg_2 = XGBRegressor() 


parameters_reg_2 = {'n_estimators': [850], 'max_depth':[5], \
              'learning_rate':[.01], 'batch_size':[10]}

grid_search_2 = GridSearchCV(estimator=xgb_reg_2, param_grid=parameters_reg_2, cv=10, n_jobs=-1)


print("parameters:")
pprint.pprint(parameters_reg_2)


grid_2 = grid_search_2.fit(X_train, y_train.ravel())


print("Best score: %0.3f" % grid_2.best_score_)
print("Best parameters set:")
best_parameters_2=grid_2.best_estimator_.get_params()


for param_name in sorted(parameters_reg_2.keys()):
    print("\t%s: %r" % (param_name, best_parameters_2[param_name]))



Select Model...
parameters:
{'batch_size': [10],
 'learning_rate': [0.01],
 'max_depth': [5],
 'n_estimators': [850]}
Best score: 0.401
Best parameters set:
	batch_size: 10
	learning_rate: 0.01
	max_depth: 5
	n_estimators: 850


In [21]:
from sklearn.metrics import mean_squared_error, r2_score

# Use our model to predict a value
predicted = grid_2.predict(X_test)

# Score the prediction with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

# Print scores
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 5.643817592693378
R-squared (R2): 0.39774118786712387


In [22]:
# Create list of predictions and check
predicted_list_unscaled = grid_2.predict(X_test)

predicted_list_unscaled

array([88.639015, 85.94526 , 87.3009  , ..., 88.034966, 91.92236 ,
       87.33915 ], dtype=float32)

In [23]:
# Create list of actual values and check
actual_list_unscaled = y_test

actual_list_unscaled

array([[91],
       [89],
       [89],
       ...,
       [91],
       [96],
       [90]], dtype=int64)

# PRINT PREDICTIONS AND ACTUAL RATINGS FOR UNSCALED DATA

In [24]:
# Create DF for unscaled test to further compare predictions with actual points
prediction_comparrisons_unscaled_df = pd.DataFrame({'XGB_Predictions (unscaled)' : predicted_list_unscaled,
                                           'Actual_Points (unscaled)' : actual_list_unscaled.ravel()
                                          })

prediction_comparrisons_unscaled_df.head(40)

Unnamed: 0,XGB_Predictions (unscaled),Actual_Points (unscaled)
0,88.639015,91
1,85.945259,89
2,87.300903,89
3,88.245422,88
4,91.248062,88
5,87.65612,93
6,89.624031,88
7,90.846878,92
8,87.823799,85
9,88.97892,87


In [25]:
# Export unscaled results
prediction_comparrisons_unscaled_df.to_csv(r'C:\Users\howar\Desktop\Data Science Boot Camp\Group_Project_3\prediction_modeling\output\UPDATED_predictions_vs_actual_unscaled.csv', index=False)

In [26]:
# Save the tuned model for future use
import joblib
filename = 'saved_models\XGB_Regressor_unscaled_(BEST_UNSCALED_REDUCED_FEATURES).sav'
joblib.dump(grid_2, filename)

['saved_models\\XGB_Regressor_unscaled_(BEST_UNSCALED_REDUCED_FEATURES).sav']

In [37]:
# Export model to a serialized Python object structure to use in the App.py file
import pickle
pickle.dump(grid_2, open("../output/XGB_unscaled_model.pkl","wb"))

In [38]:
# Export model to a serialized Python object structure to use in the App.py file (this time, in the app.py directory...)
import pickle
pickle.dump(grid_2, open("../../wine_application/parallax-template/XGB_unscaled_model.pkl","wb"))

In [39]:
# Create list of predictions and check
predicted_list_full = grid_2.predict(X)

predicted_list_full

array([86.1245  , 90.67437 , 87.0198  , ..., 91.09032 , 88.0918  ,
       89.159584], dtype=float32)

In [40]:
predicted_list_full.shape

(109662,)

In [41]:
# Create list of actual values and check
actual_list_full = y

actual_list_full

array([[87],
       [87],
       [87],
       ...,
       [90],
       [90],
       [91]], dtype=int64)

In [42]:
actual_list_full.shape

(109662, 1)

# EXPORT FINAL PREDICTED FEATURES AND TARGETS IN ONE COMBINED FILE TO USE IN THE APPLICATION

In [43]:
features_df['predicted_scores'] = predicted_list_full

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [44]:
features_df.columns

Index(['white', 'flavor_categories_taste_notes_light, fruity', 'price',
       'country_Austria', 'predicted_scores'],
      dtype='object')

In [45]:
features_df.head()

Unnamed: 0,white,"flavor_categories_taste_notes_light, fruity",price,country_Austria,predicted_scores
0,0.0,1,15.0,0,86.124496
1,0.0,0,65.0,0,90.67437
2,0.0,1,19.0,0,87.019798
3,0.0,0,34.0,0,88.770905
4,0.0,0,30.0,0,88.502487


In [46]:
features_df.shape

(109662, 5)

In [47]:
# Export unscaled results
features_df.to_csv(r'C:\Users\howar\Desktop\Data Science Boot Camp\Group_Project_3\prediction_modeling\output\full_predictions.csv', index=False)