### Import all needed libraries and set our seed

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
np.random.seed(0)
import xgboost as xgb

### read in our cleaned up dataset

In [2]:
df = pd.read_csv('../mod_three_project/new_2018_citibike_data.cvs')

In [3]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [4]:
start_nhbr = pd.read_csv('start_neighborhoods')

In [6]:
start_nhbr.drop(['Unnamed: 0','start station name'],inplace=True,axis=1)

In [18]:
df_w_snhbr = result = pd.merge(df, start_nhbr, on=['start station id'])

In [24]:
df_w_snhbr = df_w_snhbr[['start_nhbr', 'usertype', 'gender', 'start_month',
                         'start_day_of_week', 'start_hour', 'ave_temp', 'precip', 
                         'snow_depth','birth_yr_range', 'start_hr_range','stop_nhbr']]

In [25]:
df_w_snhbr.head()

Unnamed: 0,start_nhbr,usertype,gender,start_month,start_day_of_week,start_hour,ave_temp,precip,snow_depth,birth_yr_range,start_hr_range,stop_nhbr
0,Tudor City,1,1,1,1,7,19.5,0.0,0,1964 - 1971,0 - 11,Tribeca
1,Tudor City,1,1,1,1,7,19.5,0.0,0,1964 - 1971,0 - 11,Tribeca
2,Tudor City,1,1,1,1,7,19.5,0.0,0,1964 - 1971,0 - 11,Tribeca
3,Tudor City,1,1,1,5,18,45.0,0.0,0,1988 - 1995,12 - 23,Williamsburg
4,Tudor City,1,1,1,5,18,45.0,0.0,0,1988 - 1995,12 - 23,Williamsburg


Lets get a sample from our dataset

In [4]:
df_small = df.sample(5000,random_state=42)

In [5]:
df_small.columns

Index(['start station id', 'usertype', 'gender', 'start_month',
       'start_day_of_week', 'start_hour', 'ave_temp', 'precip', 'snow_depth',
       'stop_nhbr', 'birth_yr_range', 'start_hr_range'],
      dtype='object')

In [10]:
#let see how many neighborhoods we have
len(list(df_small.stop_nhbr.value_counts()))

51

get sample from df with start neighborhoods

In [26]:
df_w_snhbr_sm = df_w_snhbr.sample(5000,random_state=42)

### separate the target and the predictors

In [11]:
target = df_small.stop_nhbr

In [13]:
df_small.dtypes

start station id     float64
usertype               int64
gender                 int64
start_month            int64
start_day_of_week      int64
start_hour             int64
ave_temp             float64
precip               float64
snow_depth             int64
stop_nhbr             object
birth_yr_range        object
start_hr_range        object
dtype: object

__Separate target and predictors from df with start neighborhoods__

In [44]:
df_w_snhbr_sm.to_csv('citibike_data_with_strt_nhbr.csv')

In [30]:
target_snhbr = df_w_snhbr_sm.stop_nhbr 

In [40]:
data_snhbr = pd.get_dummies(df_w_snhbr_sm[df_w_snhbr_sm.columns[:-1]])

In [41]:
data_snhbr.head(3) 

Unnamed: 0,usertype,gender,start_month,start_day_of_week,start_hour,ave_temp,precip,snow_depth,start_nhbr_Astoria,start_nhbr_Battery Park,...,birth_yr_range_1940 - 1947,birth_yr_range_1948 - 1955,birth_yr_range_1956 - 1963,birth_yr_range_1964 - 1971,birth_yr_range_1972 - 1979,birth_yr_range_1980 - 1987,birth_yr_range_1988 - 1995,birth_yr_range_1996 - 2003,start_hr_range_0 - 11,start_hr_range_12 - 23
180088,1,1,6,4,15,68.5,0.1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2441387,1,1,7,1,8,79.0,2.24,0,0,0,...,0,1,0,0,0,0,0,0,1,0
433909,1,1,3,0,9,38.0,0.0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


### Create dummies from our catrgorical data in our predictors ( birth_yr_range, start_hr_range)

In [19]:
data = pd.get_dummies(df_small[['start station id','usertype', 'gender',
                            'start_month', 'start_day_of_week',
                            'start_hour', 'ave_temp','precip', 
                            'snow_depth', 'birth_yr_range',
                            'start_hr_range']])

In [20]:
data.head()

Unnamed: 0,start station id,usertype,gender,start_month,start_day_of_week,start_hour,ave_temp,precip,snow_depth,birth_yr_range_1940 - 1947,birth_yr_range_1948 - 1955,birth_yr_range_1956 - 1963,birth_yr_range_1964 - 1971,birth_yr_range_1972 - 1979,birth_yr_range_1980 - 1987,birth_yr_range_1988 - 1995,birth_yr_range_1996 - 2003,start_hr_range_0 - 11,start_hr_range_12 - 23
543507,526.0,1,0,5,1,14,76.0,0.0,0,0,0,0,0,0,1,0,0,0,1
361736,144.0,1,1,4,5,16,60.5,0.0,0,0,0,0,0,0,1,0,0,0,1
447442,3167.0,0,0,5,6,9,61.0,0.0,0,0,0,0,0,0,1,0,0,1,0
472328,499.0,1,1,5,3,18,63.5,0.22,0,0,0,0,1,0,0,0,0,0,1
351878,448.0,1,1,4,3,22,54.5,0.0,0,0,0,0,0,0,1,0,0,0,1


In [21]:
target.shape,data.shape

((5000,), (5000, 19))

### Do train test split for the data

we have chose the 80/20 split as our ratio

In [22]:
data_train, data_test, target_train, target_test = train_test_split(data,target,test_size=0.2)


__train test split for DataFramew with start neighborhood__

In [42]:
data_train_sn, data_test_sn, target_train_sn, target_test_sn = train_test_split(data_snhbr,target_snhbr,test_size=0.2)


### Baseline Models

In [23]:
BL_tree_clf = DecisionTreeClassifier(criterion = "gini", max_depth = 5) 
BL_tree_clf.fit(data_train, target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [24]:
print('Baseline Tree Classifier train accuracy:', str(round(BL_tree_clf.score(data_train, target_train)*100,2))+'%','\nBaseline Tree Classifier train test accuracy:',str(round(BL_tree_clf.score(data_test, target_test)*100,2))+'%',sep='\n')

Baseline Tree Classifier train accuracy:
9.93%

Baseline Tree Classifier train test accuracy:
6.1%


#### Confusion Matrix of Baseline Tree Classifier:

In [35]:
pred = BL_tree_clf.predict(data_test)
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(target_test, pred, rownames=['True'], colnames=['Predicted'], margins=True)


Confusion Matrix
----------------


Predicted,Astoria,Battery Park,Chelsea,Clinton,East Village,Flatiron District,Garment District,Gramercy,Greenwich Village,Lower East Side,Midtown,Park Slope,Upper West Side,Williamsburg,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Astoria,1,0,0,1,0,0,0,0,1,0,2,1,1,2,9
Battery Park,0,0,0,1,0,0,0,0,0,1,0,1,4,2,9
Bedford Stuyvesant,0,0,0,0,0,0,0,0,0,0,0,0,3,0,3
Boerum Hill,0,0,0,1,0,0,1,0,0,0,0,0,1,4,7
Brooklyn Heights,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
Carroll Gardens,0,0,0,2,1,0,2,0,0,1,3,0,7,0,16
Central Park,0,0,0,0,0,0,0,0,1,1,5,0,5,0,12
Chelsea,0,1,0,5,4,0,1,1,3,4,7,0,18,9,53
Chinatown,0,0,0,2,1,0,0,0,0,0,1,0,2,3,9
Clinton,0,0,2,3,4,0,2,0,3,2,6,1,22,7,52


In [25]:
forest_bl = RandomForestClassifier(n_estimators=100, max_depth= 5)
forest_bl.fit(data_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
print('Baseline random forest accuracy:', str(round(forest_bl.score(data_train, target_train)*100,2))+'%',
      '\nBaseline random forest test accuracy:',str(round(forest_bl.score(data_test, target_test)*100,2))+'%',sep='\n')

Baseline random forest accuracy:
16.5%

Baseline random forest test accuracy:
7.5%


__Baseline Random Forest for DataFrame with Start Neighborhood__

In [43]:
forest_bl_sn = RandomForestClassifier(n_estimators=100, max_depth= 5)
forest_bl_sn.fit(data_train_sn, target_train_sn)
# data_train_sn, data_test_sn, target_train_sn, target_test_sn
print('Baseline random forest accuracy:', str(round(forest_bl_sn.score(data_train_sn, target_train_sn)*100,2))+'%',
      '\nBaseline random forest test accuracy:',str(round(forest_bl_sn.score(data_test_sn, target_test_sn)*100,2))+'%',sep='\n')

Baseline random forest accuracy:
18.95%

Baseline random forest test accuracy:
6.7%


In [45]:
rf_param_grid_2 = {
    'n_estimators': [50,100,150,200],
    'criterion': ['entropy'],
    'max_depth': [None, 2, 5],
    'min_samples_split': [2, 5, 12],
    'min_samples_leaf': [1,2]
}

In [48]:
rf_grid_search_sn = GridSearchCV(forest_bl_sn, rf_param_grid_2, cv=3, return_train_score=True)
rf_grid_search_sn.fit(data_snhbr,target_snhbr)

rf_training_score_sn = np.mean(rf_grid_search_sn.cv_results_['mean_train_score'])
rf_testing_score_sn = rf_grid_search_sn.score(data_snhbr,target_snhbr)

print("Mean Training Score: {:.4}%".format(rf_training_score_sn * 100))
print("Mean Testing Score: {:.4}%".format(rf_testing_score_sn * 100))
print("Best Parameter Combination Found During Grid Search:")
rf_grid_search.best_params_



Mean Training Score: 40.68%
Mean Testing Score: 14.76%
Best Parameter Combination Found During Grid Search:


NameError: name 'rf_grid_search' is not defined

In [49]:
rf_grid_search_sn.best_params_

{'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 50}

In [1]:
# data_train_sn, data_test_sn, target_train_sn, target_test_sn
rf_grid_search_sn_new = RandomForestClassifier(criterion='entropy',max_depth=5,
                                               min_samples_leaf=2,min_samples_split=2,n_estimators=50)
rf_grid_search_sn_new.fit(data_train_sn,target_train_sn)
y_pred_test = rf_grid_search_sn_new.predict(data_test_sn)

NameError: name 'RandomForestClassifier' is not defined

In [None]:
show_cf(target_test_sn, y_pred_test, class_names=list(set(target_train_sn)), model_name='Random Forest with Start and Stop Neighborhoods')

In [74]:
def show_cf(y_true, y_pred, class_names=None, model_name=None):
    plt.figure(figsize=(20,12))
    cf = confusion_matrix(y_true, y_pred)
    plt.imshow(cf, cmap=plt.cm.Blues)
    
    if model_name:
        plt.title("Confusion Matrix: {}".format(model_name))
    else:
        plt.title("Confusion Matrix")
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    
    class_names = set(y_true)
    tick_marks = np.arange(len(class_names))
    if class_names:
        plt.xticks(tick_marks, class_names)
        plt.yticks(tick_marks, class_names)
    
    thresh = cf.max() / 2.
    
    for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):
        plt.text(j, i, cf[i, j], horizontalalignment='center', color='white' if cf[i, j] > thresh else 'black')
    plt.xticks(rotation=90)
    plt.colorbar()

### XGBoost

Baseline for XGBoost and our data with the start neighborhoods

In [51]:
# data_snhbr,target_snhbr
# data_train_sn, data_test_sn, target_train_sn, target_test_sn
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(data_train_sn, target_train_sn)
training_preds = xgb_clf.predict(data_train_sn)
val_preds = xgb_clf.predict(data_test_sn)
training_accuracy = accuracy_score(target_train_sn,training_preds)
val_accuracy = accuracy_score(target_test_sn,val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 26.12%
Validation accuracy: 6.4%


__Create a parameter grid dictionary__

In [62]:
param_grid = {
    "learning_rate": [0.5, 0.7, 0.1],
    'max_depth': [2, 3, 4],
    'min_child_weight': [4, 5],
    'n_estimators': [300, 400, 500],
}

__Use GridSearch to find the best pararmeters for our model__

In [73]:
# data_snhbr,target_snhbr
# data_train_sn, data_test_sn, target_train_sn, target_test_sn

grid_xgb_clf = GridSearchCV(xgb_clf, param_grid, scoring='accuracy',n_jobs=1,cv=None)
grid_xgb_clf.fit(data_train_sn, target_train_sn)

best_parameters = grid_xgb_clf.best_params_

print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

training_preds = grid_xgb_clf.predict(data_train_sn)
val_preds = grid_xgb_clf.predict(data_test_sn)
training_accuracy = accuracy_score(target_train_sn,training_preds)
val_accuracy = accuracy_score(target_test_sn,val_preds)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))



KeyboardInterrupt: 

In [57]:
# data_snhbr,target_snhbr
# data_train_sn, data_test_sn, target_train_sn, target_test_sn
xgb_clf = xgb.XGBClassifier(learning_rate=0.5,max_depth=3,min_child_weight=5,n_estimators=300)
xgb_clf.fit(data_train_sn, target_train_sn)
training_preds = xgb_clf.predict(data_train_sn)
val_preds = xgb_clf.predict(data_test_sn)
training_accuracy = accuracy_score(target_train_sn,training_preds)
val_accuracy = accuracy_score(target_test_sn,val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

# "learning_rate": [0.5, 0.7, 0.1],
#     'max_depth': [2, 3, 4],
#     'min_child_weight': [4, 5],
#     'n_estimators': [300, 400, 500]

Training Accuracy: 74.72%
Validation accuracy: 3.8%


In [58]:
xgb_clf = xgb.XGBClassifier(learning_rate=0.5,max_depth=2,min_child_weight=5,n_estimators=150)
xgb_clf.fit(data_train_sn, target_train_sn)
training_preds = xgb_clf.predict(data_train_sn)
val_preds = xgb_clf.predict(data_test_sn)
training_accuracy = accuracy_score(target_train_sn,training_preds)
val_accuracy = accuracy_score(target_test_sn,val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 31.5%
Validation accuracy: 5.0%


In [59]:
xgb_clf = xgb.XGBClassifier(learning_rate=0.5,max_depth=2,min_child_weight=2,n_estimators=100)
xgb_clf.fit(data_train_sn, target_train_sn)
training_preds = xgb_clf.predict(data_train_sn)
val_preds = xgb_clf.predict(data_test_sn)
training_accuracy = accuracy_score(target_train_sn,training_preds)
val_accuracy = accuracy_score(target_test_sn,val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 32.62%
Validation accuracy: 6.1%


In [60]:
xgb_clf = xgb.XGBClassifier(learning_rate=0.3,max_depth=2,min_child_weight=2,n_estimators=100)
xgb_clf.fit(data_train_sn, target_train_sn)
training_preds = xgb_clf.predict(data_train_sn)
val_preds = xgb_clf.predict(data_test_sn)
training_accuracy = accuracy_score(target_train_sn,training_preds)
val_accuracy = accuracy_score(target_test_sn,val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 25.25%
Validation accuracy: 7.0%


In [61]:
xgb_clf = xgb.XGBClassifier(learning_rate=0.2,max_depth=3,min_child_weight=2,n_estimators=100)
xgb_clf.fit(data_train_sn, target_train_sn)
training_preds = xgb_clf.predict(data_train_sn)
val_preds = xgb_clf.predict(data_test_sn)
training_accuracy = accuracy_score(target_train_sn,training_preds)
val_accuracy = accuracy_score(target_test_sn,val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 34.55%
Validation accuracy: 5.6%


In [None]:
xgb_clf

In [None]:
'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 50

In [72]:
# data_snhbr,target_snhbr
# data_train_sn, data_test_sn, target_train_sn, target_test_sn
xgb_clf_bt = xgb.XGBClassifier(learning_rate=0.4,max_depth=5,min_child_weight=3,n_estimators=50)
xgb_clf_bt.fit(data_train_sn, target_train_sn)
training_preds = xgb_clf_bt.predict(data_train_sn)
val_preds = xgb_clf_bt.predict(data_test_sn)
training_accuracy = accuracy_score(target_train_sn,training_preds)
val_accuracy = accuracy_score(target_test_sn,val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 58.98%
Validation accuracy: 5.4%
