In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#Load in the Kick Starter Dataset
ks = pd.read_csv('ks-projects-201801.csv')
ks

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.00
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.00
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.00
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17,50000.0,2014-09-17 02:35:30,25.0,canceled,1,US,25.0,25.0,50000.00
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,5,US,155.0,155.0,1500.00
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,1,US,20.0,20.0,15000.00
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,6,US,200.0,200.0,15000.00


In [3]:
#Look for Null Values
for columns in ks.columns:
    print(columns, ':', sum(ks[columns].isna()))
# 'usd pledged' has 3797 missing values. This column, usd_pledged is the 'conversion in US dollars of the pledged 
# column (conversion done by kickstarter)' and 'usd pledge real' is the 'conversion in US dollars of the pledged 
# column (conversion from Fixer.io API)''. Thus, for the missing values in the 'usd pledged' column, we will adopt 
# the value from 'usd pledged real'. We will drop 3562 of these values since their 'state' was 'undefined', and 
# will not be useful for our model anyhow. 

ks.drop(ks.loc[(ks['usd pledged'].isna()) & (ks['state'] == 'undefined'), ].index, inplace=True)
ks.loc[ks['usd pledged'].isna(), 'usd pledged'] = ks.loc[ks['usd pledged'].isna(), 'usd_pledged_real']

# Countries that were invalid, 'N,0"', we adopt the country corresponding to the currency.
countries = ['US','GB','CA','EUR','NO','DK','SE','AU']
currency =['USD', 'GBP', 'CAD', 'EUR', 'NOK', 'DKK', 'SEK', 'AUD']

for i in range(8):
    ks.loc[(ks['country']=='N,0"') & (ks['currency']== currency[i]), 'country'] = countries[i]
#

ID : 0
name : 4
category : 0
main_category : 0
currency : 0
deadline : 0
goal : 0
launched : 0
pledged : 0
state : 0
backers : 0
country : 0
usd pledged : 3797
usd_pledged_real : 0
usd_goal_real : 0


In [4]:
#Check for negative values (found no negative values)
negs = ks.select_dtypes(include=['int64', 'float64'])
negs = negs[(negs < 0).all(1)]
negs

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real


In [5]:
#Check for duplicate rows (no duplicates found)
length_ks = len(ks)
drop_dups = ks.drop_duplicates()
num_duplicate_rows = len(ks) - len(drop_dups)
num_duplicate_rows

0

In [6]:
# Drop the rows with state == live since we are only interested
# in the result of the project (ie whether it ultimately succeeded)
ks = ks[ks.state != 'live']
ks = ks.reset_index(drop=True)

In [7]:
#Feature engineering:

# Add boolean column where 1 indicates successful project else 0
ks['outcome'] = ks['state'].map(dict(successful=1, failed=0, canceled=0))

# Add log transformation of goal and pledged
ks['log of goal'] =  np.log(ks['goal'])
ks['log of pledged'] = np.log(ks['pledged'])
inf = ks['log of pledged'][0]
ks['log of pledged'] = ks['log of pledged'].replace([inf],0)

# Add a year column
years = []
for i in range(len(ks)):
  years.append(ks['launched'][i][0:4])
ks['year'] = pd.Series(years)
ks['year'] = pd.to_numeric(ks['year'])

# Add a month column
months = []
for i in range(len(ks)):
  months.append(ks['launched'][i][5:7])
ks['month'] = pd.Series(months)
ks['month'] = pd.to_numeric(ks['month'])

#One hot encode the categories feature
dums = pd.get_dummies(ks.category, prefix='onehot')

# Drop columns with less than 1000 instances
for c in dums.columns:
  if dums[c].sum() < 1000:
    dums = dums.drop(columns = c)
  else:
    ks[c] = dums[c]      

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
# Model 1:
# Using a random forest classifier , the features I chose
# to use in this model are the log transformation of goal,
# log transformation of pledged, all categories one hot encoded
# and the month and the year

from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

ks= ks.dropna()
ks = ks.reset_index(drop=True)

#Train test split
train, test = train_test_split(ks, test_size=0.25, random_state=42)
m1_train = train.iloc[:,17:].to_numpy()
m1_test = test.iloc[:,17:].to_numpy()
X = m1_train
y = train['outcome'].to_numpy()

# Train model1 (Random forest classifier)
random_forest_model = RandomForestClassifier(max_depth=20, random_state=42)
random_forest_model.fit(X,y)

# Check accuracy on test set
rf_train_accuracy = random_forest_model.score(m1_train, train['outcome'])
rf_test_accuracy = random_forest_model.score(m1_test, test['outcome'])
rf_train_accuracy, rf_test_accuracy


(0.8715546469525407, 0.8426570783799251)

In [9]:
# Model 2:
# Nueral Network using same features
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

clf = MLPClassifier(hidden_layer_sizes=(50), max_iter=100, 
                    activation= 'logistic', solver='lbfgs', verbose=1,  random_state=42)

#Train test split
train, test = train_test_split(ks, test_size=0.25, random_state=42)
m1_train = train.iloc[:,17:].to_numpy()
m1_test = test.iloc[:,17:].to_numpy()
X = m1_train
y = train['outcome'].to_numpy()

#Fitting the Nueral network
clf.fit(X, y)

#checking accuracy
print('Accuracy on training---')
y_pred_train = clf.predict(X)
print(accuracy_score(y, y_pred_train))

print('Accuracy on test---')
X_test = m1_test
Y_test = test['outcome']
y_pred_test = clf.predict(X_test)
print(accuracy_score(Y_test, y_pred_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Accuracy on training---
0.8396439651883472
Accuracy on test---
0.8384568041203718
