# Helper Code:

## This is a brief synopsis of some go-to Data Science code

It's a work in progress

# Exploratory Data Analysis (EDA)

In [None]:
# Pretty simple, but I like it as the first thing I run
def eda(dataframe):
    """Print a quick first-look summary of ``dataframe``.

    Sections are printed in this order: missing-value counts per column,
    the index, the column dtypes, the shape, and
    ``describe(include='all')``. Nothing is returned.
    """

    def show(heading, value):
        # Every section shares one layout: heading, value, blank line.
        print(heading, value, '\n')

    show("missing values \n", dataframe.isnull().sum())
    show("dataframe index \n", dataframe.index)
    show("dataframe types \n", dataframe.dtypes)
    show("dataframe shape \n", dataframe.shape)
    show("dataframe describe \n", dataframe.describe(include='all'))

Examples of changing dtypes of columns:

In [None]:
# Fix date column: parse the raw values into pandas datetimes.
# `infer_datetime_format` is deprecated since pandas 2.0 (a strict version of
# format inference is now the default), so it is no longer passed.
df["Date"] = pd.to_datetime(df["Date"])

In [None]:
# Optionally break the date into calendar parts (add Day, etc. the same way)
date_parts = df['Date'].dt
df["Year"] = date_parts.year
df['Month'] = date_parts.month

In [None]:
# Change category columns to dtype category
for column in ("Category Name", "Vendor Number", "Item Number", "Item Description"):
    df[column] = df[column].astype('category')

In [None]:
# Change dollar amounts to floats for calculations.
# The vectorised `.str` accessor passes missing values through as NaN,
# whereas a per-element lambda raises AttributeError on them.
for column in ("State Bottle Retail", "Sale (Dollars)"):
    df[column] = df[column].str.strip('$').astype(float)

In [None]:
# Display the dtype of every column in df
df.dtypes

# Plotting

Basic imports and settings

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Apply seaborn's default (pretty) theme once, with label fonts scaled up by 1.5.
# The original called sns.set() twice; the second call superseded the first.
sns.set(font_scale=1.5)
# This makes your plots bigger. It is set after the theme call so that the
# theme cannot overwrite it (NOTE(review): older seaborn releases appear to
# reset figure.figsize inside sns.set() — confirm for your version).
plt.rcParams['figure.figsize'] = [16, 10]

Count Bar chart in seaborn:

The 'hue=' parameter names a column to split the counts on.

In [None]:
# Bar chart of row counts per Species, split by the WnvPresent column
species_ax = sns.countplot(data=train, x='Species', hue='WnvPresent')
# Tilt the category labels
plt.xticks(rotation=45)
plt.show()

Simple scatter plot example

In [None]:
# An explicit figure handle is only actually necessary if you want to save an image of the plot
fig, ax = plt.subplots()
# Set font size of labels
sns.set(font_scale=2)
# Plot first data points: x column, y column, DataFrame; fit_reg toggles the
# linear regression line and scatter_kws["s"] changes the size of the points.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them positionally.
sns.regplot(x='Age', y='Median', data=dataframe, fit_reg=False, scatter_kws={"s": 200})
# Second plot for fun
# NOTE(review): this call reads from `y` while the first reads from `dataframe` — confirm that is intended
sns.regplot(x='Age', y='90th Percentile', data=y, fit_reg=False, scatter_kws={"s": 200})
# Title
plt.title('Median comments over time')
# X axis label
plt.xlabel('Age of Post in Hours')
# Y axis label
plt.ylabel('Number of Comments')
# Legend if you do use both plots
plt.legend(['Median', '90th Percentile'])
# Saves the file here if you want, comment out otherwise
fig.savefig('/Users/Dale/Desktop/RedMed.svg', format='svg', dpi=2000)
# Show it (though unnecessary with matplotlib inline)
plt.show()

# Modeling General

Don't forget your train/test split

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Same hold-out fraction and seed for both styles of split
split_options = dict(test_size=0.3, random_state=42)

# Option 1: pass a single dataframe holding both your X and y
# (good if you want to use Patsy format)
df_train, df_test = train_test_split(dataframe, **split_options)

# Option 2: pass your X features separately from your y target.
# I prefer this split as you don't have to drop things later and can do
# transformations more easily
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, **split_options)

sklearn Evaluation:
It expects you to have already used .predict().  The last two parameters (model and X) are only needed if you want to get the Roc-Auc score, and they only work if you have binary classes.

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, roc_auc_score

def eval_sklearn_model(y_true, predictions, model=None, X=None):
    """Print a confusion-matrix based evaluation of binary predictions.

    Prints the four confusion-matrix cells, sensitivity, specificity,
    accuracy and precision. When ``model`` is supplied, the Roc-Auc score
    is printed as well, computed from ``model.predict_proba(X)``.

    Args:
        y_true: True target values.
        predictions: Predicted labels, e.g. the output of ``.predict()``.
        model: Optional fitted classifier exposing ``predict_proba``.
        X: Features the probabilities are computed from; required whenever
            ``model`` is given.

    Raises:
        ValueError: If ``model`` is given without ``X``.
    """
    if model is not None and X is None:
        raise ValueError('X must be provided together with model to compute Roc-Auc')

    cnf_matrix = confusion_matrix(y_true, predictions)
    # Rows are true classes and columns are predicted classes
    true_neg, false_pos = cnf_matrix[0, 0], cnf_matrix[0, 1]
    false_neg, true_pos = cnf_matrix[1, 0], cnf_matrix[1, 1]

    print('True Negative: ', true_neg, '| False Positive: ', false_pos)
    print('False Negative: ', false_neg, '| True Positive: ', true_pos, '\n')

    sensitivity = true_pos / (false_neg + true_pos)
    specificity = true_neg / (false_pos + true_neg)

    print('Sensitivity (TP/ TP + FN): ', sensitivity)
    print('Specificity (TN/ TN + FP): ', specificity, '\n')

    print('Accuracy: ', accuracy_score(y_true, predictions, normalize=True))
    print('Precision: ', precision_score(y_true, predictions))
    if model is not None:
        # Column 1 of predict_proba is the score used for Roc-Auc
        print('Roc-Auc: ', roc_auc_score(y_true, model.predict_proba(X)[:, 1]))
    print('\n')

# Modeling Classification

## Stats Models
Logistic Regression is good here because of the .summary() feature

In [None]:
import statsmodels.formula.api as sm

In [None]:
# You can use the Patsy format here. var_x3 and var_x4 are 'interaction'
# variables in this example
logit_formula = "target_y ~ var_x1 + var_x2 + var_x3 * var_x4"
model = sm.logit(logit_formula, data=df_train).fit()

In [None]:
# Show the fitted model's summary
model.summary()

In [None]:
# Evaluate logistic regression in this format:
import math

In [None]:
# Exponentiate the value to get the multiplier that is reported
# (3.2923 is presumably var_x1's coefficient from the summary — substitute your own)
odds_ratio = math.exp(3.2923)
print('If var_x1 increases by 1, it is ', odds_ratio, ' times as likely the target class will be the case.')

## SKLEARN

Basic process:
    
    1) initiate your model
    2) cross-validate to get base accuracy
    3) if looking for other metrics to compare:
        a) use .fit(X_train, y_train)
        b) use .predict(X_train) to make predictions of TRAINING data
        c) compare these with y_train (can use eval_sklearn_model(y_train, predictions))
    4) once you are satisfied with your model, use test data
        a) use .predict(X_test) with the model already fit on the training data (do not refit on the test data)
        b) compare these with y_test (can use eval_sklearn_model(y_test, test_predictions))

K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=10)
# Perform 3-fold cross validation to estimate model performance,
# then report the mean accuracy across folds
fold_accuracies = cross_val_score(knn, X_train, y_train, cv=3, scoring='accuracy')
print(fold_accuracies.mean())

In [None]:
# Optional: do this as well if concerned with more than accuracy
# (such as sensitivity or specificity); skip otherwise
predictions = knn.fit(X_train, y_train).predict(X_train)
eval_sklearn_model(y_train, predictions)

In [None]:
# Test on your reserved information.
# The .fit() can be skipped if it was already performed above
test_predictions = knn.fit(X_train, y_train).predict(X_test)
eval_sklearn_model(y_test, test_predictions)

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log = LogisticRegression()
# Fit on the training split, then predict labels for the test split
test_predictions = log.fit(X_train, y_train).predict(X_test)
print('TD-IDF, Subs, Sensitivity, and Features Logistic Regression TEST SCORE:\n')
eval_sklearn_model(y_test, test_predictions)

Multinomial Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
%%time
# Use class_prior if you know your classes are unbalanced.
# In this example I have 90% 0's and 10% 1's
known_class_priors = [.9, .1]
mnb = MultinomialNB(class_prior=known_class_priors)
test_predictions = mnb.fit(X_train, y_train).predict(X_test)
print('Multinomial Naive Bayes TEST SCORE:\n')
eval_sklearn_model(y_test, test_predictions)

Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
%%time
# Settings kept together so they are easy to tweak
tree_settings = {'class_weight': 'balanced', 'min_samples_leaf': 5}
dt = DecisionTreeClassifier(**tree_settings)
# Fit on the training split, then predict labels for the test split
test_predictions = dt.fit(X_train, y_train).predict(X_test)
print('Decision Tree TEST SCORE:\n')
eval_sklearn_model(y_test, test_predictions)

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
%%time
dt = RandomForestClassifier(
    n_estimators=1000,
    max_features=750,
    min_samples_leaf=5,
    class_weight='balanced',
    n_jobs=-1,
)
# Fit on the training split, then predict labels for the test split
test_predictions = dt.fit(X_train, y_train).predict(X_test)
print('Random Forest TEST SCORE:\n')
eval_sklearn_model(y_test, test_predictions)

Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
%%time
# Settings kept together so they are easy to tweak
extra_trees_settings = dict(
    class_weight='balanced',
    max_features=500,
    min_samples_leaf=5,
    n_estimators=100,
    n_jobs=-1,
)
dt = ExtraTreesClassifier(**extra_trees_settings)
dt.fit(X_train, y_train)
test_predictions = dt.predict(X_test)
print('Extra Random Forest TEST SCORE:\n')
eval_sklearn_model(y_test, test_predictions)

XG Boost Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# scale_pos_weight is supposed to help with unbalanced classes; the
# recommended value is the number of negative cases divided by the number
# of positive cases
negative_to_positive = 27543 / 3111
# Make sure to pick the correct objective for the problem
xgb = XGBClassifier(objective='binary:logistic', scale_pos_weight=negative_to_positive)
xgb.fit(X_train, y_train)

In [None]:
# Predict labels for X_test with the fitted xgb model, then print the evaluation
test_predictions = xgb.predict(X_test)
eval_sklearn_model(y_test, test_predictions)

# Grid Search

In [None]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed from scikit-learn
from sklearn.model_selection import GridSearchCV
import time

This is just an example.
It expects an X_train, y_train, X_test, y_test split to have already been performed.

In [None]:
start_time = time.time()

# Number of cross-validation folds, reused for the model count printed below
cv_folds = 3

# Pick which hyperparameters you want to test (example is for random forest)
param_grid = dict(n_estimators = [100],
                 max_features = [250, 500, 750, 1000, 2000, 5000],
                 min_samples_leaf = [5, 15, 50, 100],
                 )

# Switch out the model here that you would like to test
model = RandomForestClassifier(class_weight='balanced', n_jobs=-1)

grid = GridSearchCV(model, param_grid, cv=cv_folds, scoring='roc_auc')

grid.fit(X_train, y_train)

best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so best_estimator_ is ready to use without another .fit()
best_model = grid.best_estimator_

# NOTE: .score() reports the estimator's default metric (mean accuracy for
# classifiers), not the roc_auc metric the search optimised
score = best_model.score(X_test, y_test)

# One fit per parameter combination per fold, counted from the search results
# so this line never needs to be kept in sync with param_grid by hand
print('Number of Models Run: ', len(grid.cv_results_['params']) * cv_folds)
# Label the score with whichever model was actually searched
print("{} Score: {:0.3}".format(type(model).__name__, round(float(score), 3)), '\n')
# Fixed-point format so long runs are not printed in scientific notation
print('Elapsed Time: {:0.1f}'.format(time.time() - start_time), ' seconds', '\n')
print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)

In [None]:
# Your best model from the grid is already fit and saved as best_model,
# so it can predict on the test split directly
test_predictions = best_model.predict(X_test)
print('Grid Search TEST SCORE:\n')
# eval_sklearn_model is the function created above; its cell should be run before this one
eval_sklearn_model(y_test, test_predictions)

# Bokeh Google Map

In [1]:
import os

from bokeh.io import output_file, show
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, DataRange1d, PanTool, WheelZoomTool, BoxSelectTool
)

# Centre the map on Chicago
map_options = GMapOptions(lat=41.87, lng=-87.70, map_type="hybrid", zoom=10)

plot = GMapPlot(
    x_range=DataRange1d(), y_range=DataRange1d(), map_options=map_options
)
plot.title.text = "Chicago"

# For GMaps to function, Google requires you obtain and enable an API key:
#
#     https://developers.google.com/maps/documentation/javascript/get-api-key
#
# The key is read from the environment rather than hard-coded, so a personal
# credential never ends up committed with the notebook.
api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable to your Google Maps API key")
plot.api_key = api_key

# Two example points to draw on the map
source = ColumnDataSource(
    data=dict(
        lat=[41.786, 41.995],
        lon=[-87.752, -87.933],
    )
)

circle = Circle(x="lon", y="lat", size=15, fill_color="blue", fill_alpha=0.8, line_color=None)
plot.add_glyph(source, circle)

plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool())
output_file("gmap_plot.html")
show(plot)