## Error Analysis: what are our false predictions?

- how good is our model with staff pick?
- how good is our model without the staff pick? (drop staff pick)
- how good is our model against the staff pick? (predict only staff picked rows)
- how good is our model on data the staff didn't pick?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

import os

import src.visualization as vs
import src.process as process
import models.lib as modlib

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, cross_validate, KFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, fbeta_score, recall_score, precision_score
from sklearn.metrics import classification_report, roc_curve, confusion_matrix

# models
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# ignore all warnings
# Silences every warning category for the rest of the kernel session. This keeps
# cell output short, but it also hides deprecation and numerical warnings.
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): XGBClassifier is already imported in the "# models" group above;
# this second import is redundant.
from xgboost import XGBClassifier

# Single seed reused for train_test_split(random_state=...), XGBClassifier(seed=...)
# and SMOTE(random_state=...) so that splits and models are reproducible.
RSEED = 42

In [None]:
# Baseline ("og") split, built WITHOUT the staff_pick feature.
# The *_og variables are never overwritten by later cells, so they act as one
# fixed train/test split that differently trained models can be scored against.
data = (
    pd.read_csv('data/processed/kickstarter_clean.csv')
    # treat the day/hour columns as categorical labels rather than numbers
    .astype({'day_hour_launch': str, 'day_hour_deadline': str})
    .drop(columns=['staff_pick', 'usd_pledged', 'pledge_per_backer'])
)

X = data.drop(columns='state')
y = data['state']

X_train_og, X_test_og, y_train_og, y_test_og = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

## Include Staff Pick

In [None]:
# Variant: staff_pick stays in the feature set; only usd_pledged and
# pledge_per_backer are removed.
data = (
    pd.read_csv('data/processed/kickstarter_clean.csv')
    # treat the day/hour columns as categorical labels rather than numbers
    .astype({'day_hour_launch': str, 'day_hour_deadline': str})
    .drop(columns=['usd_pledged', 'pledge_per_backer'])
)

target = 'state'
X = data.drop(columns=target)
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

# Build the preprocessing + XGBoost pipeline from the project helpers.
num_features, cat_features = process.num_cat_features(data, target=target)
preprocessor = process.create_preprocessor(num_features, cat_features)
models = {'XGB': XGBClassifier(seed=RSEED)}
scaled_models = process.model_process_pipeline(models, preprocessor, prefix='scaled')

# Fit on the training split, predict the test split, and show the score table.
predictes_y_test_dict, _ = process.model_test_predict(X_train, X_test, y_train, scaled_models)
display(vs.nice_scores(y_test, predictes_y_test_dict))

# Confusion matrix of the test-set predictions.
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(
    confusion_matrix(y_test, predictes_y_test_dict['scaledXGB']),
    annot=True,
    fmt='g',
    cmap='PuBuGn',
    cbar=False,
    ax=ax,
);

## Drop Staff Pick

In [None]:
# Variant: staff_pick is REMOVED from the feature set, together with
# usd_pledged and pledge_per_backer.
data = (
    pd.read_csv('data/processed/kickstarter_clean.csv')
    # treat the day/hour columns as categorical labels rather than numbers
    .astype({'day_hour_launch': str, 'day_hour_deadline': str})
    .drop(columns=['staff_pick', 'usd_pledged', 'pledge_per_backer'])
)

target = 'state'
X = data.drop(columns=target)
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

# Build the preprocessing + XGBoost pipeline from the project helpers.
num_features, cat_features = process.num_cat_features(data, target=target)
preprocessor = process.create_preprocessor(num_features, cat_features)
models = {'XGB': XGBClassifier(seed=RSEED)}
scaled_models = process.model_process_pipeline(models, preprocessor, prefix='scaled')

# Fit on the training split, predict the test split, and show the score table.
predictes_y_test_dict, _ = process.model_test_predict(X_train, X_test, y_train, scaled_models)
display(vs.nice_scores(y_test, predictes_y_test_dict))

# Confusion matrix of the test-set predictions.
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(
    confusion_matrix(y_test, predictes_y_test_dict['scaledXGB']),
    annot=True,
    fmt='g',
    cmap='PuBuGn',
    cbar=False,
    ax=ax,
);

In [None]:
# Refit the current XGB pipeline on the current training split and score it on
# the fixed baseline test set (X_test_og / y_test_og).
scaled_models['scaledXGB'].fit(X_train, y_train)
og_predictions = {'scaledXGB': scaled_models['scaledXGB'].predict(X_test_og)}

display(vs.nice_scores(y_test_og, og_predictions))

# Confusion matrix against the baseline test labels.
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(
    confusion_matrix(y_test_og, og_predictions['scaledXGB']),
    annot=True,
    fmt='g',
    cmap='PuBuGn',
    cbar=False,
    ax=ax,
);

In [None]:
# Put features, true label and predicted label side by side. All three parts get
# a fresh 0..n-1 index so that concat aligns them by position.
predicted_labels = pd.Series(predictes_y_test_dict['scaledXGB'], name='predictions').reset_index(drop=True)
data_with_predictions = pd.concat(
    [
        X_test.reset_index(drop=True),
        y_test.reset_index(drop=True),
        predicted_labels,
    ],
    axis=1,
)

# Keep only the rows the model got wrong and persist them for later inspection.
is_wrong = data_with_predictions['state'] != data_with_predictions['predictions']
prediction_errors = data_with_predictions[is_wrong]
prediction_errors.to_csv(os.path.join('data/processed/', 'kickstarter_errors.csv'), index=False)
prediction_errors

In [None]:
# Compare, feature by feature, the class counts among misclassified rows (left
# column) with the class counts in the whole test set (right column).
#
# Fixes over the previous version:
#   * axes were addressed as axs.flat[k] / axs.flat[k+1], so every feature
#     overwrote the right-hand plot of the previous one and only the first
#     j+1 of the 2*j axes were ever used; row k now owns axs[k, 0] and axs[k, 1].
#   * the figure height was a fixed 4 for j rows; it now grows with j.
#   * squeeze=False keeps axs two-dimensional even when there is a single feature.
j = len(X_test.columns)
fig, axs = plt.subplots(ncols=2, nrows=j, figsize=(16, 4 * j), squeeze=False)
for k, column in enumerate(X_test.columns):
    sns.countplot(data=prediction_errors, x=column, ax=axs[k, 0], hue='state')
    axs[k, 0].set_title(f'{column}: misclassified rows')

    sns.countplot(data=X_test, x=column, ax=axs[k, 1], hue=y_test)
    axs[k, 1].set_title(f'{column}: full test set')

fig.tight_layout()

In [None]:
# Number of feature columns in the current test split (one plot row per column).
X_test.shape[1]

In [None]:
# Variant: train and evaluate ONLY on rows that were staff picked.
data = (
    pd.read_csv('data/processed/kickstarter_clean.csv')
    # treat the day/hour columns as categorical labels rather than numbers
    .astype({'day_hour_launch': str, 'day_hour_deadline': str})
    .drop(columns=['usd_pledged', 'pledge_per_backer'])
)

# Discard every row where staff_pick == False, then remove the staff_pick
# column itself so it is not used as a feature.
not_picked = data['staff_pick'] == False
data = data[~not_picked].drop(columns=['staff_pick'])

target = 'state'
X = data.drop(columns=target)
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

# Build the preprocessing + XGBoost pipeline from the project helpers.
num_features, cat_features = process.num_cat_features(data, target=target)
preprocessor = process.create_preprocessor(num_features, cat_features)
models = {'XGB': XGBClassifier(seed=RSEED)}
scaled_models = process.model_process_pipeline(models, preprocessor, prefix='scaled')

# Fit on the training split, predict the test split, and show the score table.
predictes_y_test_dict, _ = process.model_test_predict(X_train, X_test, y_train, scaled_models)
display(vs.nice_scores(y_test, predictes_y_test_dict))

# Confusion matrix of the test-set predictions.
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(
    confusion_matrix(y_test, predictes_y_test_dict['scaledXGB']),
    annot=True,
    fmt='g',
    cmap='PuBuGn',
    cbar=False,
    ax=ax,
);

In [None]:
# Oversample the minority class of the current training split with SMOTE.
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top.
from imblearn.over_sampling import SMOTE

# Choose an oversampling method
# Seeded with RSEED so the resampled set is reproducible.
oversampler = SMOTE(random_state=RSEED)

# Apply oversampling to the training set
# NOTE(review): X_train is the raw (un-preprocessed) frame and still contains
# day_hour_launch / day_hour_deadline cast to str. SMOTE presumably needs purely
# numeric input, so this call may raise -- confirm, and consider resampling
# after preprocessing instead.
# NOTE(review): X_train_oversampled / y_train_oversampled are not used by any
# later cell visible in this notebook.
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

In [None]:
# Score the model trained on the current (row-filtered) training split against
# the fixed baseline test set (X_test_og / y_test_og).
#
# Leakage guard: X_train comes from a different train_test_split call than
# X_test_og, so the same CSV rows can sit in both. Both frames keep the row
# labels assigned by pd.read_csv (column drops and row filters preserve them),
# so overlapping rows are identified by index label and excluded before fitting.
# Without this the model is evaluated on rows it was trained on.
og_predictions = {}
not_in_og_test = ~X_train.index.isin(X_test_og.index)
scaled_models['scaledXGB'].fit(X_train.loc[not_in_og_test], y_train.loc[not_in_og_test])
og_predictions['scaledXGB'] = scaled_models['scaledXGB'].predict(X_test_og)

display(vs.nice_scores(y_test_og, og_predictions))

# Confusion matrix against the baseline test labels.
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(
    confusion_matrix(y_test_og, og_predictions['scaledXGB']),
    annot=True,
    fmt='g',
    cmap='PuBuGn',
    cbar=False,
    ax=ax,
);

## Only non-Staff Picked Data

In [None]:
# Variant: train and evaluate ONLY on rows that were NOT staff picked.
data = (
    pd.read_csv('data/processed/kickstarter_clean.csv')
    # treat the day/hour columns as categorical labels rather than numbers
    .astype({'day_hour_launch': str, 'day_hour_deadline': str})
    .drop(columns=['usd_pledged', 'pledge_per_backer'])
)

# Discard every row where staff_pick == True, then remove the staff_pick
# column itself so it is not used as a feature.
picked = data['staff_pick'] == True
data = data[~picked].drop(columns=['staff_pick'])

target = 'state'
X = data.drop(columns=target)
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

# Build the preprocessing + XGBoost pipeline from the project helpers.
num_features, cat_features = process.num_cat_features(data, target=target)
preprocessor = process.create_preprocessor(num_features, cat_features)
models = {'XGB': XGBClassifier(seed=RSEED)}
scaled_models = process.model_process_pipeline(models, preprocessor, prefix='scaled')

# Fit on the training split, predict the test split, and show the score table.
predictes_y_test_dict, _ = process.model_test_predict(X_train, X_test, y_train, scaled_models)
display(vs.nice_scores(y_test, predictes_y_test_dict))

# Confusion matrix of the test-set predictions.
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(
    confusion_matrix(y_test, predictes_y_test_dict['scaledXGB']),
    annot=True,
    fmt='g',
    cmap='PuBuGn',
    cbar=False,
    ax=ax,
);

In [None]:
# Score the model trained on the current (row-filtered) training split against
# the fixed baseline test set (X_test_og / y_test_og).
#
# Leakage guard: X_train comes from a different train_test_split call than
# X_test_og, so the same CSV rows can sit in both. Both frames keep the row
# labels assigned by pd.read_csv (column drops and row filters preserve them),
# so overlapping rows are identified by index label and excluded before fitting.
# Without this the model is evaluated on rows it was trained on.
og_predictions = {}
not_in_og_test = ~X_train.index.isin(X_test_og.index)
scaled_models['scaledXGB'].fit(X_train.loc[not_in_og_test], y_train.loc[not_in_og_test])
og_predictions['scaledXGB'] = scaled_models['scaledXGB'].predict(X_test_og)

display(vs.nice_scores(y_test_og, og_predictions))

# Confusion matrix against the baseline test labels.
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(
    confusion_matrix(y_test_og, og_predictions['scaledXGB']),
    annot=True,
    fmt='g',
    cmap='PuBuGn',
    cbar=False,
    ax=ax,
);

- make a table that only contains falsely predicted rows
- compare data distributions of falsely predicted vs entire data