### Exploring Our Data

### We are going to use our 2018 data set instead 

In [1]:
import collections
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from IPython.display import Image  
from sklearn import tree
import pydotplus
import pandas as pd
import numpy as np

In [2]:
df_2018 = pd.read_csv('kickstarter-projects/ks-projects-201801.csv', encoding ='iso-8859-1')

In [3]:
df_2018.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [4]:
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null object
goal                378661 non-null float64
launched            378661 non-null object
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [5]:
#Detects the live in state and drops them 
df_2018 = df_2018.loc[df_2018['state']!='live']

#Detects the undefined in state and drops them 
df_2018 = df_2018.loc[df_2018['state']!='undefined']

#Detects the undefined in state and drops them 
df_2018 = df_2018.loc[df_2018['state']!='canceled']

#Detects the undefined in state and drops them 
df_2018 = df_2018.loc[df_2018['state']!='suspended']

#Detects the undefined in usd pledged and drops them 
df_2018 = df_2018.loc[~df_2018['usd pledged'].isna(), :]

#Detects the undefined in usd pledged and drops them 
df_2018 = df_2018.loc[~df_2018['name'].isna(), :]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_2018.drop("state", axis=1),
                                                    df_2018["state"],
                                                    test_size=0.25,
                                                    random_state=2019)


In [7]:
y_train.value_counts(), y_test.value_counts()


(failed        148193
 successful    100403
 Name: state, dtype: int64, failed        49418
 successful    33448
 Name: state, dtype: int64)

## One Hot Encoding

In [8]:



encoder = OneHotEncoder(drop='first', categories="auto")
encoder.fit(X_train[["category", "main_category", "country"]])



OneHotEncoder(categorical_features=None, categories='auto', drop='first',
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [9]:
encoder.categories_

[array(['3D Printing', 'Academic', 'Accessories', 'Action', 'Animals',
        'Animation', 'Anthologies', 'Apparel', 'Apps', 'Architecture',
        'Art', 'Art Books', 'Audio', 'Bacon', 'Blues', 'Calendars',
        'Camera Equipment', 'Candles', 'Ceramics', "Children's Books",
        'Childrenswear', 'Chiptune', 'Civic Design', 'Classical Music',
        'Comedy', 'Comic Books', 'Comics', 'Community Gardens',
        'Conceptual Art', 'Cookbooks', 'Country & Folk', 'Couture',
        'Crafts', 'Crochet', 'DIY', 'DIY Electronics', 'Dance', 'Design',
        'Digital Art', 'Documentary', 'Drama', 'Drinks',
        'Electronic Music', 'Embroidery', 'Events', 'Experimental',
        'Fabrication Tools', 'Faith', 'Family', 'Fantasy',
        "Farmer's Markets", 'Farms', 'Fashion', 'Festivals', 'Fiction',
        'Film & Video', 'Fine Art', 'Flight', 'Food', 'Food Trucks',
        'Footwear', 'Gadgets', 'Games', 'Gaming Hardware', 'Glass',
        'Graphic Design', 'Graphic Novels', 'Har

In [10]:
ohe = pd.DataFrame(encoder.transform(X_train[["category", "main_category", "country"]]).toarray(),
                   columns=encoder.get_feature_names(["category", "main_category", "country"]))
ohe.head()

Unnamed: 0,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,category_Art,...,country_IT,country_JP,country_LU,country_MX,country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
X_train = X_train.reset_index(drop=True)

In [12]:
X_train.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1426698687,RAPPY: The 3D printer with position feedback c...,3D Printing,Technology,USD,2014-02-16,100000.0,2014-01-16 04:48:30,21543.0,49,US,21543.0,21543.0,100000.0
1,1633937505,Unborn in America - A New Cabaret Opera,Musical,Theater,GBP,2014-12-03,2500.0,2014-11-06 23:41:21,2600.0,74,GB,4162.44,4078.3,3921.45
2,815178419,The Chronicles of Count Carlos: Son of Dracula,Comic Books,Comics,USD,2016-09-01,12000.0,2016-07-03 19:39:13,12813.01,193,US,629.0,12813.01,12000.0
3,344407855,Hidden Love Letters,Video Games,Games,EUR,2017-11-10,500.0,2017-10-10 10:03:54,723.0,106,FR,44.6,842.59,582.7
4,2037941839,Do You Have An Outdoor Grill? Use It To Roast ...,Food,Food,USD,2016-05-10,199000.0,2016-04-10 02:44:49,104.0,6,US,104.0,104.0,199000.0


In [13]:
X_train.shape


(248596, 14)

In [14]:
ohe.shape


(248596, 193)

In [15]:
y_train.shape

(248596,)

In [16]:
X_train['launched_datetime'] = pd.to_datetime(X_train['launched'])
X_train['deadline_datetime'] = pd.to_datetime(X_train['deadline'])
X_train['project_times'] = pd.to_datetime(X_train['deadline']) - pd.to_datetime(X_train['launched'])
'''This extracts the project days from the total project time'''
X_train['project_length'] = X_train.project_times.dt.days



In [17]:
X_train = X_train.drop(["category", "launched_datetime",'deadline_datetime',"main_category", "country", "name",
                       "currency", "launched", 'project_times',"backers", "pledged", "usd_pledged_real",
                       "usd pledged", "deadline"], axis=1)


In [18]:
X_train2 = pd.concat(objs=[X_train, ohe], axis=1)

In [19]:
X_train.shape, X_train2.shape, ohe.shape

((248596, 4), (248596, 197), (248596, 193))

#### Now make a Decision Tree Classifier

In [20]:
X_train2.head()

Unnamed: 0,ID,goal,usd_goal_real,project_length,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,...,country_IT,country_JP,country_LU,country_MX,country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,1426698687,100000.0,100000.0,30,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1633937505,2500.0,3921.45,26,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,815178419,12000.0,12000.0,59,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,344407855,500.0,582.7,30,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2037941839,199000.0,199000.0,29,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
X_train2.shape

(248596, 197)

In [22]:
clf = DecisionTreeClassifier(random_state=2019,
                             min_samples_leaf=30,
                             criterion="gini",
                             min_samples_split=2)

clf.fit(X_train2, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=2019, splitter='best')

In [23]:
# # Create DOT data
# dot_data = tree.export_graphviz(clf, 
#                                 out_file=None, 
#                                 feature_names=X_train2.columns,  
#                                 class_names=["failed", "successful"])

# # Draw graph
# graph = pydotplus.graph_from_dot_data(dot_data)  

# # Show graph
# Image(graph.create_png())

## How well did our model do? 

In [25]:
X_test.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real
5879,1030170827,I Dreamed a Dream Project!,Theater,Theater,USD,2014-05-07,25000.0,2014-04-07 20:41:08,25910.0,131,US,25910.0,25910.0,25000.0
227940,228972453,Against the Horde,Video Games,Games,USD,2017-12-23,20000.0,2017-11-21 14:43:27,617.0,16,US,567.0,617.0,20000.0
20849,110568298,movie mode,Apps,Technology,USD,2015-01-16,7000.0,2014-12-17 23:21:33,0.0,0,US,0.0,0.0,7000.0
214178,2091365786,Woken: A Fairy Tale,Fiction,Publishing,USD,2013-04-01,9000.0,2013-03-02 17:22:43,260.0,2,US,260.0,260.0,9000.0
201109,2024028512,Help Technologic TV Grow,Documentary,Film & Video,USD,2011-05-20,1000.0,2011-04-20 06:35:28,0.0,0,US,0.0,0.0,1000.0


In [26]:
encoder2 = OneHotEncoder(drop='first', categories="auto")
encoder2.fit(X_test[["category", "main_category", "country"]])

OneHotEncoder(categorical_features=None, categories='auto', drop='first',
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [27]:
ohe = pd.DataFrame(encoder.transform(X_test[["category", "main_category", "country"]]).toarray(),
                   columns=encoder.get_feature_names(["category", "main_category", "country"]))
ohe.head()

Unnamed: 0,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,category_Art,...,country_IT,country_JP,country_LU,country_MX,country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
X_test = X_test.reset_index(drop=True)

In [29]:
X_test['launched_datetime'] = pd.to_datetime(X_test['launched'])
X_test['deadline_datetime'] = pd.to_datetime(X_test['deadline'])
X_test['project_times'] = pd.to_datetime(X_test['deadline']) - pd.to_datetime(X_test['launched'])
'''This extracts the project days from the total project time'''
X_test['project_length'] = X_test.project_times.dt.days

In [30]:
X_test = X_test.drop(["category", "launched_datetime",'deadline_datetime',"main_category", "country", "name",
                       "currency", "launched", 'project_times',"backers", "pledged", "usd_pledged_real",
                       "usd pledged", "deadline"], axis=1)

In [31]:
X_test2 = pd.concat(objs=[X_test, ohe], axis=1)

In [32]:
clf = DecisionTreeClassifier(random_state=2019,
                             min_samples_leaf=30,
                             criterion="gini",
                             min_samples_split=2)

clf.fit(X_test2, y_test)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=2019, splitter='best')

In [33]:
# # Create DOT data
# dot_data = tree.export_graphviz(clf, 
#                                 out_file=None, 
#                                 feature_names=X_test2.columns,  
#                                 class_names=["failed", "successful"])

# # Draw graph
# graph = pydotplus.graph_from_dot_data(dot_data)  

# # Show graph
# Image(graph.create_png())

In [34]:
y_pred = clf.predict(X_test2)

In [58]:
y_pred = pd.Series(y_pred)

In [59]:
 y_traincf = y_train.replace('successful', 1)
y_traincf = y_train.replace('failed', 0)

y_testcf = y_test.replace('successful', 1)
y_testcf = y_test.replace('failed', 0)

y_predcf = y_pred.replace('successful', 1)
y_predcf = y_pred.replace('failed', 0)

In [61]:
from sklearn.metrics import accuracy_score, roc_curve, auc

# Calculate Accuracy 
acc = accuracy_score(y_test,y_pred) * 100
print("Accuracy is :{0}".format(acc))


# Check the AUC for predictions
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("\nAUC is :{0}".format(round(roc_auc,2)))

# Create and print a confusion matrix 
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_testcf, y_predcf, rownames=['True'], colnames=['Predicted'], margins=True)




Accuracy is :71.1546351941689


ValueError: Data is not binary and pos_label is not specified

## Model 2

lets run different models!

In [43]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [44]:
# Takes in a model, trains the model, and evaluates the model on the test set
def fit_and_evaluate(model):
    
    # Train the model
    model.fit(X_train2, y_train)
    
    # Make predictions and evalute
    predictions = model.predict(X_test2)
    
    # Return the performance metric
    return print(classification_report(y_test, predictions), '\n', confusion_matrix(y_test, predictions))

In [None]:
lr = LogisticRegression()

In [47]:
gradient_boosted = GradientBoostingClassifier(random_state=60)

fit_and_evaluate(gradient_boosted)

              precision    recall  f1-score   support

      failed       0.69      0.84      0.76     49418
  successful       0.65      0.44      0.52     33448

    accuracy                           0.68     82866
   macro avg       0.67      0.64      0.64     82866
weighted avg       0.67      0.68      0.66     82866
 
 [[41444  7974]
 [18814 14634]]


In [45]:
fit_and_evaluate(lr)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

      failed       0.60      1.00      0.75     49418
  successful       0.00      0.00      0.00     33448

    accuracy                           0.60     82866
   macro avg       0.30      0.50      0.37     82866
weighted avg       0.36      0.60      0.45     82866
 
 [[49418     0]
 [33448     0]]


In [46]:
knn = KNeighborsClassifier(n_neighbors=5)

fit_and_evaluate(knn)

              precision    recall  f1-score   support

      failed       0.61      0.65      0.63     49418
  successful       0.43      0.39      0.41     33448

    accuracy                           0.54     82866
   macro avg       0.52      0.52      0.52     82866
weighted avg       0.54      0.54      0.54     82866
 
 [[32194 17224]
 [20519 12929]]


In [50]:
gradient_boosted = GradientBoostingClassifier(random_state=2019)

fit_and_evaluate(gradient_boosted)



              precision    recall  f1-score   support

      failed       0.69      0.84      0.76     49418
  successful       0.65      0.44      0.52     33448

    accuracy                           0.68     82866
   macro avg       0.67      0.64      0.64     82866
weighted avg       0.67      0.68      0.66     82866
 
 [[41444  7974]
 [18814 14634]]


In [51]:
fit_and_evaluate(clf)


              precision    recall  f1-score   support

      failed       0.69      0.77      0.73     49418
  successful       0.59      0.49      0.53     33448

    accuracy                           0.66     82866
   macro avg       0.64      0.63      0.63     82866
weighted avg       0.65      0.66      0.65     82866
 
 [[37865 11553]
 [17022 16426]]
