# Final Project: Kickstarter Dataset
## Corbin Higgs, Zach Meyer, Jacob Ulman
##                              What makes a Kickstarter campaign succeed?

In [94]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

%matplotlib inline

In [95]:
df = pd.read_csv("data/kickstarter.csv")
#df.head(2)

In [96]:
df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(df['launched'])

df['timediffdays'] = (df['deadline'] - df['launched']).dt.days

#df.head(2)

In [97]:
le = LabelEncoder()
df['maincat'] = le.fit_transform(df['main_category'])
df['state_encoded'] = le.fit_transform(df['state'])

df.head(2)

#Clearing out unkown, suspended, and live projects as they are not failed or successful
df = df.loc[(df['state_encoded'] != 5)]
df = df.loc[(df['state_encoded'] != 4)]
df = df.loc[(df['state_encoded'] != 2)]

#Re-encoding the states
df['state_encoded'] = le.fit_transform(df['state'])

#fixing encoding
state = []
for item, frame in df['state'].iteritems():
        if (frame == 'successful'):
            state += [1]
        else:
            state += [0]

df['state_encoded'] = state

In [98]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real,timediffdays,maincat,state_encoded
ID,1.0,0.0017,0.00057,0.00064,-0.00047,6.7e-05,0.0019,-7.9e-05,-0.00096,-0.00089
goal,0.0017,1.0,0.0075,0.0042,0.0058,0.0053,0.94,0.0047,0.0037,-0.025
pledged,0.00057,0.0075,1.0,0.72,0.86,0.95,0.0053,0.00097,0.005,0.11
backers,0.00064,0.0042,0.72,1.0,0.7,0.75,0.0047,-0.00082,0.00045,0.13
usd pledged,-0.00047,0.0058,0.86,0.7,1.0,0.91,0.0065,0.00096,0.0058,0.098
usd_pledged_real,6.7e-05,0.0053,0.95,0.75,0.91,1.0,0.0058,0.0011,0.0062,0.11
usd_goal_real,0.0019,0.94,0.0053,0.0047,0.0065,0.0058,1.0,0.0046,0.002,-0.024
timediffdays,-7.9e-05,0.0047,0.00097,-0.00082,0.00096,0.0011,0.0046,1.0,0.0069,-0.028
maincat,-0.00096,0.0037,0.005,0.00045,0.0058,0.0062,0.002,0.0069,1.0,-0.032
state_encoded,-0.00089,-0.025,0.11,0.13,0.098,0.11,-0.024,-0.028,-0.032,1.0


In [99]:
feature_names = ['usd_pledged_real','usd_goal_real','timediffdays','maincat','backers']
features = df[feature_names]
labels = df['state_encoded']

#data splitting
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)

In [100]:
model = RandomForestRegressor(
                                n_estimators=100
                             )
model.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [101]:
test_preds = model.predict(x_test)

r2 = r2_score(y_test, test_preds)
r2

0.9886599715044325