In [1]:
# Import everything

import sys
import pandas as pd
import numpy as np


import sklearn
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

import xgboost
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import pickle
import joblib


In [2]:
# Read the cleaned Kickstarter export and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='KickstarterCleanedv4.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,backers_count,category,goal,pledged,spotlight,state,blurb_length,goal_in_usd,campaign_duration,sub_category
0,0,63,comics,599.0,1942.0,1,1,5.0,599.0,23,graphic novels
1,1,132,comics,2000.0,3097.0,1,1,20.0,2000.0,30,graphic novels
2,2,6,crafts,500.0,211.0,0,0,5.0,500.0,30,diy
3,3,16,art,17000.0,1368.0,0,0,18.0,17000.0,45,painting
4,4,44,art,2500.0,2506.0,1,1,14.0,2500.0,60,performance art


In [3]:
# Remove the columns that are excluded from modelling: 'Unnamed: 0' (looks like a
# saved row index — confirm), plus 'goal' and 'spotlight'.
df = df.drop(columns=['Unnamed: 0', 'goal', 'spotlight'])

In [4]:
# Display the column labels currently present in the frame.
df.columns

Index(['backers_count', 'category', 'pledged', 'state', 'blurb_length',
       'goal_in_usd', 'campaign_duration', 'sub_category'],
      dtype='object')

In [5]:
# Keep only the first occurrence of each fully identical row.
df = df.loc[~df.duplicated(keep='first')]

In [6]:
# Display (rows, columns) of the de-duplicated frame.
df.shape

(8702, 8)

In [7]:
# Persist the cleaned frame. index=False keeps the row index out of the file, so
# reading it back with pd.read_csv does not produce a stray 'Unnamed: 0' column.
df.to_csv('assets/Kickstarter_FinalCleaned.csv', index=False)

In [8]:
# Separate the label column from the predictor columns.
# NOTE(review): 'pledged' and 'backers_count' remain in X; they look like values that
# are only known once a campaign has finished, which would leak the outcome into the
# features — confirm before relying on the reported accuracy.
target = 'state'
X, y = df.drop(columns=[target]), df[target]

print(X.shape, y.shape, sep='\n')

(8702, 7)
(8702,)


In [9]:
# Hold out 40% of the rows for testing. random_state pins the shuffle so the split,
# and every score computed from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(5221, 7) (3481, 7)
(5221,) (3481,)


In [10]:
# Baseline: relative frequency of the most common class in y, i.e. the accuracy
# obtained by always predicting that class.
class_shares = y.value_counts(normalize=True)
print('baseline accuracy', class_shares.max())


baseline accuracy 0.6234199034704666


In [11]:
# Random forest pipeline: ordinal-encode, mean-impute missing values, then fit a
# random forest using all available cores (n_jobs=-1) with a fixed seed.
model_rf = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy="mean"),
    RandomForestClassifier(random_state=42, n_jobs=-1),
)

In [12]:
# Decision tree pipeline: ordinal-encode, mean-impute missing values, then fit a
# single decision tree with a fixed seed.
model_dt = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy="mean"),
    DecisionTreeClassifier(random_state=42),
)

In [13]:
# XGBoost pipeline: ordinal-encode, mean-impute missing values, then fit an
# XGBClassifier with a fixed seed.
model_xgb = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy="mean"),
    XGBClassifier(random_state=42),
)

In [14]:
# Gradient boosting pipeline: ordinal-encode, mean-impute missing values, then fit
# scikit-learn's GradientBoostingClassifier with a fixed seed.
model_gb = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy="mean"),
    GradientBoostingClassifier(random_state=42),
)

In [15]:
# Train every pipeline on the training split, in the same order as before.
for pipeline in (model_rf, model_dt, model_xgb):
    pipeline.fit(X_train, y_train)

# Kept as the final expression so the fitted pipeline is shown as the cell output.
model_gb.fit(X_train, y_train)



Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['category', 'sub_category'],
                                mapping=[{'col': 'category',
                                          'data_type': dtype('O'),
                                          'mapping': publishing                                                                       1
technology                                                                       2
crafts                                                                           3
comics                                                                           4
art                                                                              5
music                                                                            6
food                                                                             7
film & video                                                                     8
fashion                                              

In [16]:
# Accuracy of each fitted pipeline on the same rows it was trained on.
for label, fitted in (('model_dt', model_dt), ('model_rf', model_rf),
                      ('model_xgb', model_xgb), ('model_gb', model_gb)):
    print(f'{label} accuracy score', accuracy_score(y_train, fitted.predict(X_train)))

model_dt accuracy score 1.0
model_rf accuracy score 1.0
model_xgb accuracy score 1.0
model_gb accuracy score 0.986784140969163


In [17]:
# Metrics with test data: accuracy on the held-out split, which was not passed to fit().
# Training accuracy alone cannot reveal overfitting, so this evaluation stays enabled.
print('model_dt accuracy score', accuracy_score(y_test, model_dt.predict(X_test)))
print('model_rf accuracy score', accuracy_score(y_test, model_rf.predict(X_test)))
print('model_xgb accuracy score', accuracy_score(y_test, model_xgb.predict(X_test)))
print('model_gb accuracy score', accuracy_score(y_test, model_gb.predict(X_test)))

In [18]:
# Serialize the fitted RF and XGB pipelines to in-memory byte strings with pickle
# (pickle.dumps returns bytes; nothing is written to disk in this cell).
saved_model_rf, saved_model_xgb = (
    pickle.dumps(fitted) for fitted in (model_rf, model_xgb)
)


In [19]:

# Persist the fitted random-forest pipeline with joblib.
# joblib_file holds the path that is actually written, so any later use of the
# variable points at a file that exists.
joblib_file = 'assets/model_rf'
joblib.dump(model_rf, joblib_file)


['assets/model_rf']

In [20]:
# Persist the fitted XGBoost pipeline with joblib.
# joblib_file holds the path that is actually written, so any later use of the
# variable points at a file that exists.
joblib_file = 'assets/model_xgb'
joblib.dump(model_xgb, joblib_file)

['assets/model_xgb']

In [21]:
# Testing if the model was saved and loads correctly.
# Load from file. The artifact is produced by this notebook; joblib.load unpickles
# its input, so never point it at an untrusted file.
load_xgb_model = joblib.load('assets/model_xgb')
load_xgb_model



Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['category', 'sub_category'],
                                mapping=[{'col': 'category',
                                          'data_type': dtype('O'),
                                          'mapping': publishing                                                                       1
technology                                                                       2
crafts                                                                           3
comics                                                                           4
art                                                                              5
music                                                                            6
food                                                                             7
film & video                                                                     8
fashion                                              

In [22]:
# Use the reloaded joblib model to calculate the accuracy score and predict
# target values on the held-out split.

# Reload from disk so the check exercises the persisted artifact itself.
load_xgb_model = joblib.load('assets/model_xgb')

# Calculate the score (Pipeline.score delegates to the final estimator's score)
score = load_xgb_model.score(X_test, y_test)
# Print the score
print("Test score: {0:.2f} %".format(100 * score))

# Predict the labels using the reloaded model
Ypredict = load_xgb_model.predict(X_test)

Ypredict

Test score: 98.33 %


array([1, 1, 1, ..., 0, 1, 1])