In [13]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 16, 8

from scipy.stats import norm
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler


import warnings
warnings.filterwarnings('ignore')
import math
from datetime import date, datetime

In [148]:
data_train = pd.read_csv("KS_train_data.csv", delimiter=",")
data_train.set_index('project_id', inplace=True)
data_test = pd.read_csv("KS_test_data.csv", delimiter=";")
data_test.set_index('project_id', inplace=True)
list(data_train.head(0))

['backers_count',
 'blurb',
 'category',
 'converted_pledged_amount',
 'country',
 'created_at',
 'currency',
 'deadline',
 'fx_rate',
 'goal',
 'launched_at',
 'name',
 'pledged',
 'staff_pick',
 'usd_pledged',
 'location',
 'funded',
 'subcategory',
 'project_url',
 'reward_url']

    backers_count: numbers of persons who have contributed monetary to the project
    blurb: short description of the project underneath the project title
    category: each project is classified in a primary category - art, comics, crafts, dance, design, fashion,        film & video, food, games, journalism, music, photography, publishing, technology, theater
    converted_pledged_amount: amount monetary that was donated in total to the project
    country: 2 letter identifier e.g. UK of the country dated in location - restricts search to the country and                                 then cities are displayed as recommendations
    created_at: UNIX date 
    currency: monetary currency which the project accepts for payments and provides information about                        e.g. rewards brackets 
    deadline: UNIX standard time convertor - last day date of the funding period https://www.epochconverter.com/
    fx_rate: rate of conversion from one currency to USD
    goal: monetary value needed to be donated for the kickstarter project to be be a success
    launched_at: UNIX day date of the first day in the funding period - after project approval
    name: project name - product name or short description of goal
    pledged: total money value raised
    staff_pick: boolean value if staff selected a project -  get prime placement on the website, be promoted to     Kickstarter's 2 million followers on Facebook and Twitter, or appear in Kickstarter's "Projects We Love"                     email, which reaches more than 4 million inboxes every week.  
    usd_pledged: pledged value converted into USD dollars
    location: city, country/state for US - explore category shows projects which are in a specific geolocational part of the world. Posibillity of searching for a project in any city of the world as a filter.
    funded: boolean value whether the project goal has been met through funding 
    subcategory: can be categorised as the category or be in a specific subfield of the primary category
    project_url: website url of the project page 
    reward_url: website url of rewards page - there exist tiers that backers which donate a specific amount         of money will receive

In [11]:
list(data_test.head(0))

['blurb',
 'category',
 'country',
 'created_at',
 'currency',
 'deadline',
 'fx_rate',
 'goal',
 'launched_at',
 'name',
 'staff_pick',
 'location',
 'subcategory',
 'project_url',
 'reward_url']

Data Cleaning

In [None]:
data_train.isnull().any()
data_train['blurb'].isna().sum() #2
data_train['name'].isna().sum() #1
pd.isnull(data_train).any(1).nonzero()[0] #array([ 1857,  5423, 27780, 46838, 56883])
#The null values are NaN in the website data and thus cannot be inserted in the dataframe
data_train['fx_rate'].value_counts()
data_train['currency'].value_counts()
#the goal is not in USD 

In [89]:
#transform goal to USD currency:
goal_usd = []
for i in range(len(data_train)):
    value = math.ceil(data_train.iloc[i].fx_rate * data_train.iloc[i].goal)
    goal_usd.append(value)
data_train['goal'] = goal_usd

Feature Selection and Calculation

In [12]:
#blurb_length: cor with funded: #0.032002518832337956
data_train['blurb'].astype(str)
blurb_length = []
for i in range(len(data_train)):
    if data_train.iloc[i].blurb != data_train.iloc[i].blurb:
        blurb_length.append(0)
    else:
        blurb_length.append(len(data_train.iloc[i].blurb))  
data_train['blurb_length'] = blurb_length

In [190]:
#period_lc
#period_dl
#money_pday
launched_at = []
created_at = []
deadline = []
for i in range(len(data_train)):
    launched_at.append(datetime.utcfromtimestamp(data_train.iloc[i].launched_at).strftime('%Y-%m-%d'))
    created_at.append(datetime.utcfromtimestamp(data_train.iloc[i].created_at).strftime('%Y-%m-%d'))
    deadline.append(datetime.utcfromtimestamp(data_train.iloc[i].deadline).strftime('%Y-%m-%d'))

date_format = "%Y-%m-%d"
period_lc = []
period_dl = []
money_pday = []
for i in range(len(data_train)):
    l = launched_at[i]
    l = datetime.strptime(l, date_format)
    c = created_at[i]
    c = datetime.strptime(c, date_format)
    d = deadline[i]
    d = datetime.strptime(d, date_format)
    lc = l - c
    period_lc.append(lc.days)
    dl = d - l
    period_dl.append(dl.days)
    money_pday.append(math.ceil(data_train.iloc[i].goal / dl.days)) 
data_train['period_lc'] = period_lc
data_train['period_dl'] = period_dl
data_train['money_pday'] = money_pday

In [15]:
#name_length: #0.12593483192454363
data_train['name_length'] = data_train['name']
name_length = []
for i in range(len(data_train)):
    if data_train.iloc[i].name_length != data_train.iloc[i].name_length:
        name_length.append(0)
    else:
        name_length.append(len(data_train.iloc[i].name_length)) 
        data_train['name_length'] = name_length
data_train['name_length'] = name_length

In [191]:
#competitors_qtr
#competitors_month
def get_year(time_stamp):
    time_s = datetime.utcfromtimestamp(time_stamp)
    return time_s.year
data_train['launched_year'] = data_train['launched_at'].apply(get_year)

def get_month(time_stamp):
    time_s = datetime.utcfromtimestamp(time_stamp)
    return time_s.month
data_train['launched_month'] = data_train['launched_at'].apply(get_month)

def get_week(time_stamp):
    time_s = datetime.utcfromtimestamp(time_stamp)
    return time_s.isocalendar()[1]
data_train['launched_week'] = data_train['launched_at'].apply(get_week)

def get_quarter(time_stamp):
    time_s = datetime.utcfromtimestamp(time_stamp)
    if (time_s.month > 0 and time_s.month < 4):
        return 1
    if (time_s.month > 3 and time_s.month < 7):
        return 2
    if (time_s.month > 6 and time_s.month < 10):
        return 3
    if (time_s.month > 9 and time_s.month < 13):
        return 4
data_train['launched_quarter'] = data_train['launched_at'].apply(get_quarter)

data_train['goal_cat_perc'] =  data_train.groupby(['category'])['goal'].transform(
                     lambda x: pd.qcut(x, [0, .35, .70, 1.0], labels =[1,2,3]))
data_train['duration_cat_perc'] =  data_train.groupby(['category'])['period_dl'].transform(
                     lambda x: pd.qcut(x, [0, .35, .70, 1.0], labels =False, duplicates='drop'))

data_train['competitors_qtr'] = data_train.groupby(['category','launched_year','launched_quarter','goal_cat_perc'])['funded'].transform('count')
data_train['competitors_month'] = data_train.groupby(['category','launched_year','launched_month','goal_cat_perc'])['funded'].transform('count')

In [41]:
#category_art #category_comics #category_crafts #category_dance #category_design #category_fashion                                                            
#category_film & video #category_food #category_games #category_journalism #category_music                                                              
#category_photography #category_publishing #category_technology #category_theater    
data_train = pd.get_dummies(data_train, columns = ['category'])

Data Transformation for Model

In [17]:
#standardize: staff_pick
boolean_v = []
for i in range(len(data_train)):
    boolean = data_train.iloc[i].staff_pick
    if boolean == True:
        boolean_v.append(1)
    elif boolean == False:
        boolean_v.append(0)
data_train['staff_pick'] = boolean_v

In [21]:
#standardize: funded
boolean_v = []
for i in range(len(data_train)):
    boolean = data_train.iloc[i].funded
    if boolean == True:
        boolean_v.append(1)
    elif boolean == False:
        boolean_v.append(0)
data_train['funded'] = boolean_v

In [139]:
data_f = data_train[['goal', 'staff_pick', 'blurb_length', 'period_lc', 'period_dl', 'money_pday', 'name_length', 'competitors_qtr', 'competitors_month', 'category_art', 'category_comics', 'category_crafts', 'category_dance', 'category_design', 'category_fashion', 'category_film & video', 'category_food', 'category_games', 'category_journalism', 'category_music', 'category_photography', 'category_publishing', 'category_technology', 'category_theater', 'funded']]

In [140]:
#standardize: MinMax
data_fminmax = data_f
dataset_mm = data_fminmax.values
X = dataset[:,0:24]
Y_mm = dataset[:,24]
scaler = MinMaxScaler()
MinMaxScaler(copy=True, feature_range=(0, 1))
MinMaxScaler.fit(scaler, X)
X_mm = scaler.transform(X)

In [141]:
X_train_mm, X_val_and_test_mm, Y_train_mm, Y_val_and_test_mm = train_test_split(X_mm, Y_mm, test_size=0.3)

Create Model

In [92]:
from keras import backend as K
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense

Using TensorFlow backend.


In [94]:
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [137]:
input_l = Input(shape=(24,))
hidden1 = Dense(12, activation='relu')(input_l)
output_l = Dense(1, activation='sigmoid')(hidden1)
model = Model(inputs=input_l, outputs=output_l)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m])

In [142]:
history = model.fit(X_train_mm, Y_train_mm, batch_size=32, epochs=150, verbose = 1)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150


Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150


Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150


Epoch 150/150


In [143]:
results = model.evaluate(X_val_and_test_mm, Y_val_and_test_mm, batch_size = 32, verbose = 1)
print(results)

[0.5173697016398112, 0.7384666800498962, 0.7822245359420776, 0.7478759288787842, 0.829805850982666]


In [None]:
#loss: 0.5179 - accuracy: 0.7380 - f1_m: 0.7853 - precision_m: 0.7447 - recall_m: 0.8422
#data_f = data_train[['goal', 'staff_pick', 'blurb_length', 'period_lc', 'period_dl', 'money_pday', 'name_length', 'competitors_qtr', 'competitors_month', 'category_art', 'category_comics', 'category_crafts', 'category_dance', 'category_design', 'category_fashion', 'category_film & video', 'category_food', 'category_games', 'category_journalism', 'category_music', 'category_photography', 'category_publishing', 'category_technology', 'category_theater', 'funded']]

Test Data Cleaning and Standardization

In [171]:
#standardize: staff_pick
boolean_v = []
for i in range(len(data_test)):
    boolean = data_test.iloc[i].staff_pick
    if boolean == True:
        boolean_v.append(1)
    elif boolean == False:
        boolean_v.append(0)
data_test['staff_pick'] = boolean_v

In [172]:
launched_at = []
created_at = []
deadline = []
for i in range(len(data_test)):
    launched_at.append(datetime.utcfromtimestamp(data_test.iloc[i].launched_at).strftime('%Y-%m-%d'))
    created_at.append(datetime.utcfromtimestamp(data_test.iloc[i].created_at).strftime('%Y-%m-%d'))
    deadline.append(datetime.utcfromtimestamp(data_test.iloc[i].deadline).strftime('%Y-%m-%d'))

date_format = "%Y-%m-%d"
period_lc = []
period_dl = []
money_pday = []
for i in range(len(data_test)):
    l = launched_at[i]
    l = datetime.strptime(l, date_format)
    c = created_at[i]
    c = datetime.strptime(c, date_format)
    d = deadline[i]
    d = datetime.strptime(d, date_format)
    lc = l - c
    period_lc.append(lc.days)
    dl = d - l
    period_dl.append(dl.days)
    money_pday.append(math.ceil(data_test.iloc[i].goal / dl.days)) 
data_test['period_lc'] = period_lc
data_test['period_dl'] = period_dl
data_test['money_pday'] = money_pday

In [178]:
data_test['name_length'] = data_test['name']
name_length = []
for i in range(len(data_test)):
    if data_test.iloc[i].name_length != data_test.iloc[i].name_length:
        name_length.append(0)
    else:
        name_length.append(len(data_test.iloc[i].name_length)) 
data_train['name_length'] = name_length

In [179]:
data_test['blurb'].astype(str)
blurb_length = []
for i in range(len(data_test)):
    if data_test.iloc[i].blurb != data_test.iloc[i].blurb:
        blurb_length.append(0)
    else:
        blurb_length.append(len(data_test.iloc[i].blurb))  
data_test['blurb_length'] = blurb_length

In [193]:
data_test['launched_month'] = data_test['launched_at'].apply(get_month)
data_test['launched_week'] = data_test['launched_at'].apply(get_week)
data_test['launched_quarter'] = data_test['launched_at'].apply(get_quarter)

data_test['goal_cat_perc'] =  data_test.groupby(['category'])['goal'].transform(
                     lambda x: pd.qcut(x, [0, .35, .70, 1.0], labels =[1,2,3]))
data_test['duration_cat_perc'] =  data_test.groupby(['category'])['period_dl'].transform(
                     lambda x: pd.qcut(x, [0, .35, .70, 1.0], labels =False, duplicates='drop'))

data_test['competitors_qtr'] = data_test.groupby(['category','launched_year','launched_quarter','goal_cat_perc'])['funded'].transform('count')
data_test['competitors_month'] = data_test.groupby(['category','launched_year','launched_month','goal_cat_perc'])['funded'].transform('count')

KeyError: 'category'

In [194]:
data_test.head(0)

Unnamed: 0_level_0,blurb,country,created_at,currency,deadline,fx_rate,goal,launched_at,name,staff_pick,location,subcategory,project_url,reward_url,period_lc,period_dl,money_pday,name_length,blurb_length,category_art,category_comics,category_crafts,category_dance,category_design,category_fashion,category_film & video,category_food,category_games,category_journalism,category_music,category_photography,category_publishing,category_technology,category_theater,launched_month,launched_week,launched_quarter
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1


In [180]:
data_test = pd.get_dummies(data_test, columns = ['category'])