In [None]:
# Data 802: Analytical Tools and Foundations
# Homework assignment 1 - Part 2: Predictive Modeling
# Last updated: July 30, 2018 by Anna M. Kot

# #############################################################################
# Import and initialize dependencies.
import pandas as pd
import numpy as np
import datetime
import time
import warnings
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn import datasets, linear_model, preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.naive_bayes import GaussianNB

scaler=MinMaxScaler()
warnings.filterwarnings("ignore")
plt.style.use('ggplot')

# #############################################################################
# Load and prepare the data for analysis

# Set the path to the folder location containing the raw data file 'HW Data.csv', 
# and load the dataset as 'CBS' for Capital Bikeshare.
CBS = pd.read_csv("/Users/annakot/Desktop/M.S. Analytics/Data 802/02. HW2/HW Data.csv")

# Convert the dataset into a pandas DataFrame.
CBS = pd.DataFrame(CBS)

# Drop columns with previously identified NaN >= 1/2 dataset total,
# unnecessary 'icon' column,
CBS = CBS.drop(["rain_1h", "rain_3h", "snow_1h", "weather_icon"], axis=1)

CBS.BinCount.replace(("Low", "High"), (0, 1), inplace=True)

# Rename the columns in the CBS DataFrame.
CBS = CBS.rename(index=str, columns={"Date_Key": "Date Key", 
                                     "TotalCount": "Total Count", 
                                     "BinCount": "Bin Count", 
                                     "humidity": "Humidity", 
                                     "wind_deg": "Wind (Degrees)",  
                                     "clouds_all": "Cloudiness", 
                                     "weather_main": "Weather (Main)",
                                     "Temp_F": "Temperature (F)",
                                     "Temp_Min_F": "Minimum Temperature (F)",
                                     "Temp_Max_F": "Maximum Temperature (F)",
                                     "Wind_MPH": "Wind (MPH)"})

# Rearrage the columns in the CBS DataFrame.
CBS = CBS[["Date Key", "Total Count", "Bin Count", "Humidity",
           "Cloudiness", 
           "Weather (Main)", 
           "Temperature (F)", 
           "Minimum Temperature (F)",
           "Maximum Temperature (F)", "Wind (Degrees)",
           "Wind (MPH)","Year","Month","Day","Hour","Holiday"]]

# Create an array, CBSarray, to segment year, month, date, 
# and hour from the 'Date Key' column in CBS and append to CBSarray.
CBSarray = np.array(CBS["Date Key"])
year=[]
month=[]
date=[]
hour=[]

for i in CBSarray:
    j = datetime.datetime.strptime(i, "%Y-%m-%d-%H")
    year.append(j.year)
    month.append(str(j.month).zfill(2))
    date.append(str(j.day).zfill(2))
    hour.append(str(j.hour).zfill(2))
    
# Overwrite the existing year, month, day, and hour column, respectively, in 
# CBS and append segmented year, month, date, and hour from CBSarray to eliminate
# null or missing values in the respective columns.    
CBS["Year"] = year
CBS["Month"] = month
CBS["Day"] = date
CBS["Hour"] = hour

# Convert columns 'Year', 'Month', 'Day', and 'Hour' to int64
cols = ['Year','Month','Day','Hour']
CBS[cols] = CBS[cols].astype(np.int64)

# Print shape of original DataFrame (21715, 18)
#print("Shape of Original DataFrame: {}".format(CBS.shape))

# Drop missing values (942 missing values)
CBS = CBS.dropna()

# Print shape of new DataFrame (20773, 18)
#print("Shape of DataFrame After Dropping All Rows with Missing Values: {}".format(CBS.shape))

# Generate a heatmap showing the correlation between the different features
#import seaborn as sns; sns.set()
#sns.heatmap(CBS.corr(), square=True, cmap='RdYlGn')

# Drop columns with redundant correlation
CBS = CBS.drop(["Minimum Temperature (F)", "Maximum Temperature (F)"], axis=1)

# Generate a heatmap showing the correlation between the different features#
#import seaborn as sns; sns.set()
#sns.heatmap(CBS.corr(), square=True, cmap='RdYlGn')

# Create dummy variables
#CBS = pd.get_dummies(CBS, prefix=['Weather (Main)'], columns=['Weather (Main)'])

#CBS.corr()

# #############################################################################
# Create cyclical curves.
# This will ensure that the 0 and 23 hour, for example, are close to each other, 
# thus allowing the cyclical nature of the variable to shine through.

# Create and plot cyclical curve for 'Hour': time.
CBS = CBS.sort_values('Hour')

CBS['sin_time'] = np.sin(2*np.pi*CBS.Hour/24)
CBS['cos_time'] = np.cos(2*np.pi*CBS.Hour/24)

CBS['time'] = CBS['sin_time'] + CBS['cos_time']

#CBS.sin_time.plot();
#CBS.cos_time.plot();

#CBS.time.plot();
#CBS.plot.scatter('sin_time','cos_time').set_aspect('equal');

# Create and plot cyclical curve for 'Hour': time.
CBS = CBS.sort_values('Month')

CBS['sin_month'] = np.sin(2*np.pi*CBS.Month/12)
CBS['cos_month'] = np.cos(2*np.pi*CBS.Month/12)

CBS['month'] = CBS['sin_month'] + CBS['cos_month']

#CBS.sin_month.plot();
#CBS.cos_month.plot();

#CBS.month.plot();
# CBS.plot.scatter('sin_month','cos_month').set_aspect('equal');

In [None]:
# Predictive Modeling using MULTIPLE LINEAR REGRESSION
# Last updated: July 30, 2018 by Anna M. Kot

# #############################################################################
# Create feature and the response variable arrays.

# Define predictor/independent variables.
#X = CBS[["Humidity","Wind (Degrees)","Year","month","Day","time","Temperature (F)","Wind (MPH)"]]
X = CBS[["month","time","Temperature (F)"]]

# Define target/dependent variable.
y = CBS['Total Count']

# #############################################################################
# Determine the most important feature(s) for predictive power.

# # Instantiate a lasso regressor.
# lasso = Lasso(alpha=0.4, normalize=True)

# # Fit the regressor to the data.
# lasso.fit(X, y)

# # Compute and print the coefficients.
# lasso_coef = lasso.coef_
# print(lasso_coef)

# # Plot the coefficients.
# plt.plot(range(len(X.columns)), lasso_coef)
# plt.xticks(range(len(X.columns)), X.columns.values, rotation=45)
# plt.margins(0.02)
# plt.show()

# #############################################################################
# Split into a training set and a test set using a stratified k fold, and
# prepapre variables for MLR REGRESSION.

# Split into a training and testing set using a stratified k fold.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scale training and testing predictor/independent variables.
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.fit_transform(X_test)

# #############################################################################
# Train a MLR REGRESSION model

# Create the regressor and fit the regressor
# to the training set.
lm = linear_model.LinearRegression()
model = lm.fit(X_train_scaled, y_train)

# Predict on the test set.
y_pred = lm.predict(X_test_scaled)

# #############################################################################
# Quantitative evaluation of the model quality on the test set.

# Compute and print R^2 and RMSE
print("R^2: {}".format(lm.score(X_test_scaled, y_test)))
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

In [None]:
# Predictive Modeling using KNN REGRESSION
# Last updated: July 30, 2018 by Anna M. Kot

# #############################################################################
# Initialize required functions.

# class SBS():
#     def __init__(self, estimator, k_features,
#         scoring=metrics.accuracy_score,
#         test_size=0.25, random_state=1):
#         self.scoring = scoring
#         self.estimator = clone(estimator)
#         self.k_features = k_features
#         self.test_size = test_size
#         self.random_state = random_state
        
#     def fit(self, X, y):
#         X_train, X_test, y_train, y_test =
#             train_test_split(X, y, test_size=self.test_size,
#         random_state=self.random_state)
#         dim = X_train.shape[1]
#         self.indices_ = tuple(range(dim))
#         self.subsets_ = [self.indices_]
#         score = self._calc_score(X_train, y_train,
#         X_test, y_test, self.indices_)
#         self.scores_ = [score]
#         while dim > self.k_features:
#             scores = []
#             subsets = []
#             for p in combinations(self.indices_, r=dim-1):
#                 score = self._calc_score(X_train, y_train,
#                 X_test, y_test, p)
#                 scores.append(score)
#                 subsets.append(p)
#             best = np.argmax(scores)
#             self.indices_ = subsets[best]
#             self.subsets_.append(self.indices_)
#             dim -= 1
#             self.scores_.append(scores[best])
#         self.k_score_ = self.scores_[-1]
#         return self
    
#     def transform(self, X):
#         return X[:, self.indices_]
        
#     def _calc_score(self, X_train, y_train,
#                         X_test, y_test, indices):
#         self.estimator.fit(X_train[:, indices], y_train)
#         y_pred = self.estimator.predict(X_test[:, indices])
#         score = self.scoring(y_test, y_pred)
#         return score

# #############################################################################
# Create feature and the response variable arrays.

# Define predictor/independent variables.
X = CBS[["Year","month","Day","time","Temperature (F)","Wind (MPH)"]]

# Define target/dependent variable.
y = CBS['Total Count']

# #############################################################################
# Split into a training set and a test set, and
# scale variables for KNN REGRESSION.

# Split into a training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scale training and testing predictor/independent variables.
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.fit_transform(X_test)

# #############################################################################
# Determine k using a model complexity curve.

# Setup arrays to store, train, and test accuracies.
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over different values of k.
for i, k in enumerate(neighbors):
    # Setup a k-NN Classifier with k neighbors: knn.
    knn = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the training set.
    knn.fit(X_train_scaled, y_train)
    
    #Compute accuracy on the training set.
    train_accuracy[i] = knn.score(X_train_scaled, y_train)

    #Compute accuracy on the testing set.
    test_accuracy[i] = knn.score(X_test_scaled, y_test)

# Generate model complexity curve.
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

# #############################################################################
# Determine k using a GridSearch.
# Returns: {'n_neighbors': 1}
#          0.27049232941780604

# # Specifiy the hyperparameter as a dictionary in which the keys 
# # are the hyperparameter names.
# param_grid = {'n_neighbors': np.arange(1,50)}

# # Initiate the KNN classifier.
# knn = KNeighborsClassifier()

# # Using GridSearch, pass in the model, the grid to tune, and the number of folds to use.
# knn_cv = GridSearchCV(knn, param_grid, cv=5)

# # Perform the GridSearch inplace.
# knn_cv.fit(X_train_scaled,y_train)

# # Apply the attributes best params and best score, respectively, to retrieve the 
# # hyperparameters that perform the best along with the mean cross-validation score over that fold.
# print(knn_cv.best_params_)
# print(knn_cv.best_score_)

# #############################################################################
# Train a KNN regression model.

# Create a KNN classifier with 1 neighbor as determined by the 
# model complexity curve and the GridSearch.
knn = KNeighborsClassifier(n_neighbors=1)

# Fit (train) the classifier to the training set.
KNNmodel = knn.fit(X_train_scaled, y_train)

# #############################################################################
# Quantitative evaluation of the model quality on the test set.

# Test the response.
y_pred = knn.predict(X_test_scaled)
print("Test set predictions: {}".format(y_pred))

# Print the accuracy of the model.
print ('Score:', KNNmodel.score(X_test_scaled, y_test))

# Print the parameters of the KNN model.
print(KNNmodel)

# #############################################################################
# Sequential backward selection (SBS) to select the top features 
# with most predictive power.

# from sklearn.base import clone
# from itertools import combinations
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# knn = KNeighborsClassifier(n_neighbors=1)
# sbs = SBS(knn, k_features=1)
# sbs.fit(X_train_scaled, y_train)

# k_feat = [len(k) for k in sbs.subsets_]
# plt.plot(k_feat, sbs.scores_, marker='o')
# #plt.ylim([0.7, 1.1])
# plt.ylabel('Accuracy')
# plt.xlabel('Number of features')
# plt.grid()
# plt.show()

# print(sbs.subsets_)
# k5 = list(sbs.subsets_[3])
# print(X.columns[1:][k5])

In [269]:
# Predictive Modeling using LOGISTIC REGRESSION
# Last updated: July 30, 2018 by Anna M. Kot

# #############################################################################
# Create feature and the response variable arrays.

# Define predictor/independent variables.
#X = CBS[["Humidity","Wind (Degrees)","Year","month","Day","time","Temperature (F)","Wind (MPH)"]]
X = CBS[["Wind (Degrees)","month","time","Temperature (F)","Wind (MPH)"]]

# Define target/dependent variable.
y = CBS["Bin Count"]

# #############################################################################
# Split into a training set and a test set, and
# scale variables for LOGISTIC REGRESSION.

# Split into a training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Scale training and testing predictor/independent variables.
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.fit_transform(X_test)

# #############################################################################
# Determine regression parameters using a GridSearch.
# C results in 0.05179474679231213 for my model.

# # Setup the hyperparameter grid
# c_space = np.logspace(-5, 8, 15)
# param_grid = {'C': c_space}

# # Instantiate a logistic regression classifier: logreg
# logreg = LogisticRegression()

# # Instantiate the GridSearchCV object: logreg_cv
# logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# # Fit it to the data
# logreg_cv.fit(X_train_scaled,y_train)

# # Print the tuned parameter and score
# print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
# print("Best score is {}".format(logreg_cv.best_score_))

# #############################################################################
# Recursive feature elimination to select the top three features.

# from sklearn.feature_selection import RFE
# rfe = RFE(model, 3)
# fit = rfe.fit(X_train_scaled, y_train)

# #print(fit.n_features_)
# print(fit.support_)
# print(fit.ranking_)

# #############################################################################
# Train a LOGISTIC REGRESSION model

# Create a logistic classifier and fit the classifier to the training set.
model = LogisticRegression(C=0.052)
model = model.fit (X_train_scaled, y_train)

# Predict the class labels for the test set.
y_pred = model.predict(X_test_scaled)

# #############################################################################
# Quantitative evaluation of the model quality on the test set.

# Print the accuracy of the model.
print ('Score:', model.score(X_test_scaled, y_test))

# Calculate the probabilities of the class for the test set.
probability = model.predict_proba(X_test_scaled)
print (probability)

# Calculate the confusion matrix to describe the performance of
# the classification model on a set of test data for which the
# true values are known.
print (pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

# Classification report is another method to examine the 
# performance of the classification model.
print (metrics.classification_report(y_test, y_pred))

# #############################################################################
# Qualitative evaluation of the predictions using matplotlib.

# Compute predicted probabilities.
y_pred_prob = model.predict_proba(X_test_scaled)[:,1]

# Generate ROC curve values: fpr, tpr, thresholds.
#fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)

# Plot ROC curve
# plt.plot([0, 1], [0, 1], 'k--')
# plt.plot(fpr, tpr)
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.show()

# Compute and print AUC score.
#print("AUC: {}".format(metrics.roc_auc_score(y_test, y_pred_prob)))

# Compute cross-validated AUC scores:.
#cv_auc = cross_val_score(model, X, y, cv=5, scoring='roc_auc')

# Print list of AUC scores
#print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))


Score: 0.8921832884097035
[[0.021 0.979]
 [0.098 0.902]
 [0.121 0.879]
 ...
 [0.139 0.861]
 [0.015 0.985]
 [0.14  0.86 ]]
Predicted     0     1   All
True                       
0          1303   246  1549
1           314  3331  3645
All        1617  3577  5194
             precision    recall  f1-score   support

          0       0.81      0.84      0.82      1549
          1       0.93      0.91      0.92      3645

avg / total       0.89      0.89      0.89      5194



In [None]:
# Predictive Modeling using NAIVE BAYES CLASSIFICATION
# Last updated: July 30, 2018 by Anna M. Kot

# #############################################################################
# Create feature and the response variable arrays.

# Define predictor/independent variables.
X = CBS[["Wind (Degrees)","month","time","Temperature (F)","Wind (MPH)"]]

# Define target/dependent variable.
y = CBS["Bin Count"]

# #############################################################################
# Split into a training set and a test set, and
# scale variables for LOGISTIC REGRESSION.

# Split into a training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Scale training and testing predictor/independent variables.
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.fit_transform(X_test)

# #############################################################################
# Train a NAIVE BAYES CLASSIFICATION model

# Create a naive bayes classifier and fit the 
# classifier to the training set.
gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)

y_pred = gnb.predict(X_test_scaled)

# #############################################################################
# Quantitative evaluation of the predictions using matplotlib.

# FUNCTIONS
def success_ratio(cm):
    total = cm[0][0] + cm[1][0] + cm[0][1] + cm[1][1]
    return 100*(cm[0][0] + cm[1][1]) / total

# Print the accuracy of the model.
print ('Score:', gnb.score(X_test_scaled, y_test))

# Confusion matrix.
cm_test = pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

# Classification report is another method to examine the 
# performance of the classification model.
print (metrics.classification_report(y_test, y_pred))

print("Test set confusion matrix : \n"+str(cm_test))
print("Success ratio on test set : "+str(success_ratio(cm=cm_test))+"%")


In [None]:
# # empty list that will hold cv scores
# cv_scores = []

# for k in neighbors:
#     knn = KNeighborsClassifier(n_neighbors=k)
#     scores = cross_val_score(knn, X_train_scaled, y_train, cv=10, scoring='r2')
#     cv_scores.append(scores.mean())
    
# # changing to misclassification error
# MSE = [1 - x for x in cv_scores]

# # determining best k
# optimal_k = neighbors[MSE.index(min(MSE))]
# print (optimal_k)

# # plot misclassification error vs k
# plt.plot(neighbors, MSE)
# plt.xlabel('Number of Neighbors K')
# plt.ylabel('Misclassification Error')
# plt.show()

# print(MSE)

In [None]:

X = pd.DataFrame(X
Y = pd.DataFrame(y)
                 
                 
#P = pd.DataFrame(y_pred)

#df = (X, Y, P)
#print(df)



In [281]:
from sklearn.preprocessing import scale

mean_of_array = X.mean(axis=0)
std_of_array = X.std(axis=0)

X_test_scaled = pd.DataFrame(X_test_scaled)

X_original = (X_test_scaled * std_of_array) + mean_of_array

#print (X[:4])
print (X_original[:4])

    0   1   2   3   4  Wind (Degrees)  month  time  Temperature (F)  \
0 NaN NaN NaN NaN NaN             NaN    NaN   NaN              NaN   
1 NaN NaN NaN NaN NaN             NaN    NaN   NaN              NaN   
2 NaN NaN NaN NaN NaN             NaN    NaN   NaN              NaN   
3 NaN NaN NaN NaN NaN             NaN    NaN   NaN              NaN   

   Wind (MPH)  
0         NaN  
1         NaN  
2         NaN  
3         NaN  


In [279]:

y_test = pd.DataFrame(y_test)
y_pred = pd.DataFrame(y_pred)

X_test_scaled.to_csv('/Users/annakot/Desktop/1.csv', index=False)
y_test.to_csv('/Users/annakot/Desktop/2.csv', index=False)
y_pred.to_csv('/Users/annakot/Desktop/3.csv', index=False)

In [283]:
std_of_array

Wind (Degrees)     105.258490
month                0.980286
time                 0.996787
Temperature (F)     16.723982
Wind (MPH)           4.406127
dtype: float64