In [1]:
# Import initial dependencies
import pandas as pd
import numpy as np

In [2]:
# Convert CSVs to DFs
features = "output/reduced_wine_features.csv"
targets = "output/reduced_wine_targets.csv"

features_df = pd.read_csv(features)
targets_df =pd.read_csv(targets) 

In [3]:
# Check feature DF head
features_df.head()

Unnamed: 0,price,country_Portugal,country_US,red_flavor_categories_light and fuity
0,15.0,1,0,1
1,65.0,0,1,0
2,19.0,0,1,1
3,34.0,0,1,0
4,30.0,0,0,0


In [4]:
# Check feature DF shape
features_df.shape

(68776, 4)

In [6]:
targets_df.head()

Unnamed: 0,points
0,87
1,87
2,87
3,87
4,87


In [7]:
targets_df.dtypes

points    int64
dtype: object

In [9]:
# Check target DF shape
targets_df.shape

(68776, 1)

In [11]:
# Assign the features and target to X and y abd check assignments
raw_feature_data = features_df.values
raw_target_data = targets_df.values
X = raw_feature_data[:, 0:5]
y = raw_target_data.reshape(-1,1)

print(X, y)

[[15.  1.  0.  1.]
 [65.  0.  1.  0.]
 [19.  0.  1.  1.]
 ...
 [40.  0.  0.  0.]
 [20.  0.  0.  0.]
 [75.  0.  1.  0.]] [[87]
 [87]
 [87]
 ...
 [90]
 [90]
 [90]]


# SPLIT, NORMALIZE, AND ENCODE THE DATA

In [12]:
# Create the train and test sets for the features and target
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [13]:
# Scale the features using the MinMax scaler since we know their values
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_scaler = MinMaxScaler().fit(y_train)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

# HYPERPERAMETER TUNING

In [14]:
# Import the sequential and dense modules fo build my NN
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm

Using TensorFlow backend.


In [24]:
print('Select Model...')
from xgboost import XGBRegressor
from datetime import datetime
import pprint

start_time  = datetime.now()
xgb_clf = XGBRegressor() 
parameters = {'n_estimators': [200, 300, 500], 'max_depth':[3,5,7,9], \
              'learning_rate':[.01, .001, .0001], 'batch_size':[10, 20, 40, 60, 80, 100]}
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
print("parameters:")
pprint.pprint(parameters)
grid_search.fit(X_train_scaled, y_train_scaled.ravel())
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters=grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
end_time = datetime.now()
print(f'Select Done..., Time Cost: {((end_time - start_time).seconds)}.') 

Select Model...
parameters:
{'batch_size': [10, 20, 40, 60, 80, 100],
 'learning_rate': [0.01, 0.001, 0.0001],
 'max_depth': [3, 5, 7, 9],
 'n_estimators': [200, 300, 500]}
Best score: 0.394
Best parameters set:
	batch_size: 10
	learning_rate: 0.01
	max_depth: 5
	n_estimators: 500
Select Done..., Time Cost: 3418.


In [25]:
from sklearn.metrics import mean_squared_error, r2_score

# Use our model to predict a value
predicted_5 = grid_search.predict(X_test_scaled)

# Score the prediction with mse and r2
mse_5 = mean_squared_error(y_test_scaled, predicted_5)
r2_5 = r2_score(y_test_scaled, predicted_5)

print(f"Mean Squared Error (MSE): {mse_5}")
print(f"R-squared (R2): {r2_5}")

Mean Squared Error (MSE): 0.014704609038281724
R-squared (R2): 0.3956518367935681


In [34]:
predicted_list = grid_search.predict(X_test_scaled)

predicted_list

array([0.5287669 , 0.5398442 , 0.29520985, ..., 0.48204508, 0.42631766,
       0.48962787], dtype=float32)

In [35]:
actual_list = y_test_scaled

predicted_list

array([0.5287669 , 0.5398442 , 0.29520985, ..., 0.48204508, 0.42631766,
       0.48962787], dtype=float32)

In [48]:
prediction_comparrisons_df = pd.DataFrame({'XGB_Predictions':predicted_list,
                                           'Actual_Points':actual_list.ravel()
                                          })

prediction_comparrisons_df.head(20)

Unnamed: 0,XGB_Predictions,Actual_Points
0,0.528767,0.6
1,0.539844,0.5
2,0.29521,0.3
3,0.49498,0.55
4,0.529106,0.45
5,0.28852,0.25
6,0.402724,0.5
7,0.365748,0.25
8,0.560576,0.75
9,0.493237,0.6


In [49]:
prediction_comparrisons_df.to_csv(r'C:\Users\howar\Desktop\Data Science Boot Camp\Group_Project_3\Group_Project_3\prediction_modeling\output\predictions_vs_acttual.csv', index=False)

In [40]:
print('Select Model...')
from xgboost import XGBRegressor
from datetime import datetime
import pprint

start_time  = datetime.now()
xgb_clf = XGBRegressor() 
parameters = {'n_estimators': [200, 300, 500], 'max_depth':[5], \
              'learning_rate':[.01], 'batch_size':[10]}
grid_search_2 = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
print("parameters:")
pprint.pprint(parameters)
grid_2 = grid_search_2.fit(X_train, y_train.ravel())
print("Best score: %0.3f" % grid_2.best_score_)
print("Best parameters set:")
best_parameters=grid_2.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
end_time = datetime.now()
print(f'Select Done..., Time Cost: {((end_time - start_time).seconds)}.') 

Select Model...
parameters:
{'batch_size': [10],
 'learning_rate': [0.01],
 'max_depth': [5],
 'n_estimators': [200, 300, 500]}
Best score: 0.359
Best parameters set:
	batch_size: 10
	learning_rate: 0.01
	max_depth: 5
	n_estimators: 500
Select Done..., Time Cost: 37.


In [42]:
from sklearn.metrics import mean_squared_error, r2_score

# Use our model to predict a value
predicted_6 = grid_2.predict(X_test)

# Score the prediction with mse and r2
mse_6 = mean_squared_error(y_test, predicted_6)
r2_6 = r2_score(y_test, predicted_6)

print(f"Mean Squared Error (MSE): {mse_6}")
print(f"R-squared (R2): {r2_6}")

Mean Squared Error (MSE): 6.193746039224186
R-squared (R2): 0.3636044602703611


In [50]:
predicted_list_unscaled = grid_2.predict(X_test)

predicted_list_unscaled

array([89.94345 , 90.18756 , 85.36217 , ..., 89.063385, 87.92288 ,
       89.26439 ], dtype=float32)

In [51]:
actual_list_unscaled = y_test

actual_list_unscaled

array([[92],
       [90],
       [86],
       ...,
       [93],
       [87],
       [91]], dtype=int64)

In [52]:
prediction_comparrisons_unscaled_df = pd.DataFrame({'XGB_Predictions (unscaled)' : predicted_list_unscaled,
                                           'Actual_Points (unscaled)' : actual_list_unscaled.ravel()
                                          })

prediction_comparrisons_unscaled_df.head(20)

Unnamed: 0,XGB_Predictions (unscaled),Actual_Points (unscaled)
0,89.943451,92
1,90.187561,90
2,85.362167,86
3,89.137169,91
4,90.05426,89
5,85.278992,85
6,87.47184,90
7,86.731483,85
8,90.595421,95
9,89.256577,92


In [53]:
prediction_comparrisons_unscaled_df.to_csv(r'C:\Users\howar\Desktop\Data Science Boot Camp\Group_Project_3\Group_Project_3\prediction_modeling\output\predictions_vs_actual_unscaled.csv', index=False)

In [18]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

params={'alpha': [25,10,4,2,1.0,0.8,0.5,0.3,0.2,0.1,0.05,0.02,0.01],
       'normalize': [True, False]}
rdg_reg = Ridge()
clf = GridSearchCV(rdg_reg,params,cv=2,verbose = 1, scoring = 'neg_mean_squared_error')
clf.fit(X_train_scaled, y_train_scaled)


Fitting 2 folds for each of 26 candidates, totalling 52 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [25, 10, 4, 2, 1.0, 0.8, 0.5, 0.3, 0.2, 0.1,
                                   0.05, 0.02, 0.01],
                         'normalize': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=1)

In [19]:
# Print the score and best params for the activation model
print("Best: %f using %s" % (clf.best_score_, clf.best_params_))

Best: -0.019730 using {'alpha': 0.01, 'normalize': False}


In [21]:
from sklearn.metrics import mean_squared_error, r2_score

# Use our model to predict a value
predicted_2 = clf.predict(X_test_scaled)

# Score the prediction with mse and r2
mse_2 = mean_squared_error(y_test_scaled, predicted_2)
r2_2 = r2_score(y_test_scaled, predicted_2)

print(f"Mean Squared Error (MSE): {mse_2}")
print(f"R-squared (R2): {r2_2}")

Mean Squared Error (MSE): 0.019410384900731423
R-squared (R2): 0.2022480549229445


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
def cross_validate_best_known():
    '''
        import and clean the tractor data, then do a corss validation on each of the three models we are
        training here. A RandomForest, a GradientBoost, and an AdaBoost backed by a DecisionTree. Print
        the scores.

        The parameters we're using here are the "best" that we've found so far using a grid search.
    '''

rf = RandomForestRegressor(max_features=2, min_samples_split=4, n_estimators=50, min_samples_leaf=2)
gb = GradientBoostingRegressor(loss='quantile', learning_rate=0.0001, n_estimators=50, max_features='log2', min_samples_split=2, max_depth=1)
ada_tree_backing = DecisionTreeRegressor(max_features='sqrt', splitter='random', min_samples_split=4, max_depth=3)
ab = AdaBoostRegressor(ada_tree_backing, learning_rate=0.0001, loss='square', n_estimators=100000)


rf.fit(X_train_scaled, y_train_scaled.ravel())
# validate.cross_v_scores([rf, gb, ab], X_train_scaled, y_train_scaled)
# # RandomForestRegressor -- RMLSE: -0.596797712098, R2: 0.0272065373946
# GradientBoostingRegressor -- RMLSE: -0.996134592541, R2: -2.37202164829
# AdaBoostRegressor -- RMLSE: -0.706385708459, R2: -0.103966980393 

In [None]:
gb.fit(X_train_scaled, y_train_scaled.ravel())

In [None]:
ada_tree_backing.fit(X_train_scaled, y_train_scaled.ravel())

In [None]:
ab.fit(X_train_scaled, y_train_scaled.ravel())

In [None]:
# Use our model to predict a value
predicted_3 = ab.predict(X_test_scaled)

# Score the prediction with mse and r2
mse_3 = mean_squared_error(y_test_scaled, predicted_3)
r2_3 = r2_score(y_test_scaled, predicted_3)

print(f"Mean Squared Error (MSE): {mse_3}")
print(f"R-squared (R2): {r2_3}")

# THE BELOW CONTAINS DIFFERENT GRID SEARCHS I WAS TESTING OUT
# KEEPING THIS IN THE NOTEBOOK FOR FUTURE REFERENCE

In [None]:
# # Create the base Keras classifier wrapper for activation method
# def create_base_model(activation='relu'):
    
#     model = Sequential()
#     model.add(Dense(units=100, activation='relu', input_dim=5))
#     model.add(Dense(units=100, activation='relu'))
#     model.add(Dense(units=100, activation='tanh'))
#     model.add(Dense(units=100, activation='relu'))
#     model.add(Dense(units=3, activation='softmax'))
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
#     return model


# # Create activation model
# model = KerasClassifier(build_fn=create_base_model, epochs=250, batch_size=88)


# # Define the activation grid search parameters
# activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']


# # Assign the activation grid search parameters, fit, and run the model
# param_grid = dict(activation=activation)
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
# grid_result = activation_grid.fit(X_train_scaled, y_train_categorical)

In [None]:
# # Print the score and best params for the activation model
# print("Best: %f using %s" % (activation_grid_result.best_score_, activation_grid_result.best_params_))

In [None]:
# print(f"Training Data Score: {activation_grid_result.score(X_train_scaled, y_train_categorical)}")
# print(f"Testing Data Score: {activation_grid_result.score(X_test_scaled, y_test_categorical)}")

In [None]:
# # Create the base Keras classifier wrapper for the model
# def create_model(activation='adam', dropout_rate=0.0):
    
#     model = Sequential()
#     model.add(Dense(units=100, activation='relu', input_dim=5))
#     model.add(Dense(units=100, activation='relu'))
#     model.add(Dense(units=100, activation='tanh'))
#     model.add(Dense(units=100, activation='relu'))
#     model.add(Dense(units=3, activation='softmax'))
#     model.add(Dropout(dropout_rate))
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
#     return model


# # Create model
# model = KerasClassifier(build_fn=create_model, epochs=250, batch_size=88)


# # Define the activation and dropout rate grid search parameters
# activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
# dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


# # Assign the grid search parameters, fit, and run the model
# param_grid = dict(activation=activation, dropout_rate=dropout_rate)
# fit_grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
# grid_result = fit_grid.fit(X_train_scaled, y_train_categorical)

In [None]:
# # Print the score and best params for the activation model
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# # Create the base Keras classifier wrapper for optimizer model
# def create_optimizer_model(optimizer='adam'):
    
#     model = Sequential()
#     model.add(Dense(units=100, activation='hard_sigmoid', input_dim=5))
#     model.add(Dense(units=100, activation='hard_sigmoid'))
#     model.add(Dense(units=100, activation='hard_sigmoid'))
#     model.add(Dense(units=100, activation='hard_sigmoid'))
#     model.add(Dense(units=3, activation='hard_sigmoid'))
#     model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
#     return model


# # Create optimizer model
# optimizer_model = KerasClassifier(build_fn=create_optimizer_model, epochs=250, batch_size=88)


# # Define the optimizer grid search parameters
# optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']


# # Assign the optimizer grid search parameters, fit, and run the model
# optimizer_param_grid = dict(optimizer=optimizer)
# optimizer_grid = GridSearchCV(estimator=optimizer_model, param_grid=optimizer_param_grid, n_jobs=-1)
# optimizer_grid_result = optimizer_grid.fit(X_train_scaled, y_train_categorical)

In [None]:
# # Print the score and best params for the optimizer model
# print("Best: %f using %s" % (optimizer_grid_result.best_score_, optimizer_grid_result.best_params_))

In [None]:
# # Create the base Keras classifier wrapper for dropout model
# def create_dropout_model(dropout_rate=0.0):
    
#     model = Sequential()
#     model.add(Dense(units=100, activation='relu', input_dim=5))
#     model.add(Dense(units=100, activation='relu'))
#     model.add(Dense(units=100, activation='tanh'))
#     model.add(Dense(units=100, activation='relu'))
#     model.add(Dense(units=3, activation='softmax'))
#     model.add(Dropout(dropout_rate))
#     model.compile(loss='categorical_crossentropy', optimizer='Adadelta', metrics=['accuracy']) 
    
#     return model


# # Create the dropout model
# dropout_model = KerasClassifier(build_fn=create_dropout_model, epochs=250, batch_size=88)


# # Define the dropout grid search parameters
# dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


# # Assign the dropout grid search parameters, fit, and run the model
# dropout_param_grid = dict(dropout_rate=dropout_rate)
# dropout_grid = GridSearchCV(estimator=dropout_model, param_grid=dropout_param_grid, n_jobs=-1)
# dropout_grid_result = dropout_grid.fit(X_train_scaled, y_train_categorical)

In [None]:
# # Print the score and best params for the dropout model
# print("Best: %f using %s" % (dropout_grid_result.best_score_, dropout_grid_result.best_params_))

In [None]:
# print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
# print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")