In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Render floats with four decimal places in pandas output.
pd.set_option('display.float_format', '{:.4f}'.format)

In [None]:
# Load the raw projects export from the working directory and report its size.
kickstarter = pd.read_csv(filepath_or_buffer='./ks-projects-201801.csv')
kickstarter.shape

In [None]:
# Show the dtype pandas assigned to each column of the loaded frame.
kickstarter.dtypes

In [None]:
# Eyeball the first 60 rows of the raw data.
kickstarter.head(n=60)

In [None]:
# Summary statistics of the raw frame, as produced by describe() with its defaults.
kickstarter.describe()

In [None]:
# How many projects ended in each state.
kickstarter['state'].value_counts()

In [None]:
# Same breakdown expressed as a share of all projects.
kickstarter['state'].value_counts(normalize=True)

In [None]:
# Keep only projects with a definitive outcome (failed or successful).
# The explicit .copy() makes this an independent frame, so columns added to it
# later neither trigger SettingWithCopyWarning nor depend on view-vs-copy
# behaviour of the boolean selection.
kickstarter_classify = kickstarter[kickstarter['state'].isin(['failed', 'successful'])].copy()
kickstarter_classify['state'].value_counts(normalize=True)

In [None]:
# Rows and columns of the frame used for the classification task.
kickstarter_classify.shape

In [None]:
# Number of projects per main category.
kickstarter_classify['main_category'].value_counts()

In [None]:
# Outcome counts per main category (rows: category, columns: state).
pd.crosstab(index=kickstarter_classify['main_category'],
            columns=kickstarter_classify['state'])


In [None]:
# Outcome counts per main category, plus the share of successful projects
# among those that either succeeded or failed.
category_crosstab = (
    pd.crosstab(index=kickstarter_classify['main_category'],
                columns=kickstarter_classify['state'])
    .assign(success_rate=lambda counts: counts['successful']
            / (counts['successful'] + counts['failed']))
)
category_crosstab

In [None]:
# Number of projects per currency.
kickstarter_classify['currency'].value_counts()

In [None]:
# Outcome counts per currency, plus the share of successful projects among
# those that either succeeded or failed.
currency_crosstab = (
    pd.crosstab(index=kickstarter_classify['currency'],
                columns=kickstarter_classify['state'])
    .assign(success_rate=lambda counts: counts['successful']
            / (counts['successful'] + counts['failed']))
)
currency_crosstab

In [None]:
# Parse the launch/deadline strings into datetimes and derive the campaign
# duration as the whole-day component of (deadline - launch).
# .assign() builds a new frame instead of writing columns into what may be a
# slice of `kickstarter`, which avoids SettingWithCopyWarning and keeps the
# cell idempotent when it is re-run.
kickstarter_classify = kickstarter_classify.assign(
    launched_date=lambda df: pd.to_datetime(df['launched']),
    deadline_date=lambda df: pd.to_datetime(df['deadline']),
    # Later keyword arguments can see the columns created by earlier ones.
    duration=lambda df: (df['deadline_date'] - df['launched_date']).dt.days,
)
kickstarter_classify.head()

In [None]:
# Distribution of campaign durations (in days).
kickstarter_classify['duration'].hist()

In [None]:
# Duration distribution drawn separately for each outcome.
kickstarter_classify['duration'].hist(by=kickstarter_classify['state'])

In [None]:
# Distribution of the funding goal with the default number of bins.
kickstarter_classify['usd_goal_real'].hist()

In [None]:
# Same distribution with a finer binning of 100 bins.
kickstarter_classify['usd_goal_real'].hist(bins=100)

In [None]:
# Largest funding goal. Series.max() is vectorised and skips missing values,
# whereas the builtin max() walks the column element by element and its result
# depends on where a NaN happens to sit.
kickstarter_classify['usd_goal_real'].max()

In [None]:
# Projects whose goal exceeds one million.
kickstarter_classify.loc[kickstarter_classify['usd_goal_real'] > 1_000_000]


In [None]:
# Outcome counts among the projects whose goal exceeds one million.
kickstarter_classify.loc[kickstarter_classify['usd_goal_real'] > 1_000_000, 'state'].value_counts()

In [None]:
# Pairwise correlations between the numeric columns only. Selecting them
# explicitly keeps this working on pandas >= 2.0, where corr() no longer
# silently drops text columns and raises on them instead.
kickstarter_classify.select_dtypes(include='number').corr()

In [None]:
# Missing-value count for every column.
kickstarter_classify.isna().sum()

In [None]:
# Keep the columns chosen as model inputs.
# NOTE(review): 'backers' is presumably only known once a campaign has run, so
# it may leak the outcome into the features — confirm it belongs here.
kickstarter_variables = kickstarter_classify.loc[
    :, ['usd_goal_real', 'backers', 'main_category', 'duration', 'currency']
]

# One-hot encode the outcome and drop the first level, which leaves the
# 'successful' indicator column used as the target.
kickstarter_y = pd.get_dummies(kickstarter_classify['state'], drop_first=True)
kickstarter_y['successful'].value_counts()


In [None]:
# One-hot encode the two categorical inputs, dropping the first level of each.
kickstarter_x = pd.get_dummies(
    kickstarter_variables,
    columns=['main_category', 'currency'],
    drop_first=True,
)
kickstarter_x.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and every metric computed from it, reproducible across kernel
# restarts.
X_train, X_test, y_train, y_test = train_test_split(
    kickstarter_x, kickstarter_y, test_size=0.2, random_state=42
)

In [None]:
# Size of the held-out target produced by the train/test split.
y_test.shape

In [None]:
# Hurray! Finally!
# The data is prepared (cleaned) and we can start building models.

In [None]:
# First model: logistic regression.
from sklearn.linear_model import LogisticRegression

# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector target.
ks_model = LogisticRegression().fit(X_train, np.ravel(y_train))


In [None]:
# Fit the logistic regression on a flattened (1-D) copy of the training target.
ks_model = LogisticRegression().fit(X_train, y_train.to_numpy().ravel())

In [None]:
from sklearn.metrics import confusion_matrix

# Predict labels for the held-out rows and tabulate them against the truth.
y_pred_test = ks_model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred_test)


In [None]:
# Inspect the labels the logistic regression predicted for the test rows.
y_pred_test

In [None]:
from sklearn.metrics import accuracy_score

# Compute accuracy from the actual predictions rather than from counts copied
# by hand out of an earlier confusion-matrix printout, which go stale whenever
# the split or the model changes.
accuracy_lr = accuracy_score(y_test, y_pred_test)
accuracy_lr

In [None]:
from sklearn import metrics

# Scores for the second class reported by predict_proba (column index 1).
y_pred_proba = ks_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc_lr = metrics.roc_auc_score(y_test, y_pred_proba)

# Label the figure so it can be read on its own; the trailing semicolon hides
# the matplotlib object repr in the cell output.
plt.plot(fpr, tpr, label=f'Logistic regression (AUC = {auc_lr:.3f})')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve: logistic regression')
plt.legend(loc='lower right');

In [None]:
# ROC AUC of the logistic regression on the test set.
auc_lr

In [None]:
# Now let's try a decision tree - in fact let's try a whole forest.
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, feature subsets); fixing
# random_state makes the fitted model and its metrics repeatable.
ks_rf = RandomForestClassifier(random_state=42).fit(X_train, np.ravel(y_train))

In [None]:
# Predict labels with the forest and tabulate them against the truth.
y_pred_test_rf = ks_rf.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred_test_rf)

In [None]:
from sklearn.metrics import accuracy_score

# Compute accuracy from the forest's actual predictions rather than from
# hand-copied confusion-matrix counts, which no longer match after any re-run
# with a different split or model.
accuracy_rf = accuracy_score(y_test, y_pred_test_rf)
accuracy_rf

In [None]:
# Scores for the second class reported by predict_proba (column index 1).
y_pred_proba_rf = ks_rf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_rf)
auc_rf = metrics.roc_auc_score(y_test, y_pred_proba_rf)

# Label the figure so it can be read on its own; the trailing semicolon hides
# the matplotlib object repr in the cell output.
plt.plot(fpr, tpr, label=f'Random forest (AUC = {auc_rf:.3f})')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve: random forest')
plt.legend(loc='lower right');

In [None]:
# ROC AUC of the random forest on the test set.
auc_rf