In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv("data/combined_csv.csv")

In [None]:
df.info()

In [None]:
df.iloc[:, 3:16].head()

 ## Columns

 * double click for better view

 <br>
 
 0   backers_count             number of backers 
 1   blurb                     short description 
 2   category                  Kickstarter categories 
 3   converted_pledged_amount  pledged amount in USD  
 4   country                   country 
 5   created_at                creation date/time?  
 6   creator                   Kickstarter account that created the project 
 7   currency                  currency the project asks for
 8   currency_symbol           currency symbol 
 9   currency_trailing_code    ?   
 10  current_currency          ?
 11  deadline                  deadline date/time?
 12  disable_communication     don't allow communication/comments   
 13  friends                   drop (not enough data) 
 14  fx_rate                   currency conversion rate
 15  goal                      fixed amount required for funding
 16  id                        Kickstarter project id  
 17  is_backing                drop (not enough data)  
 18  is_starrable              ?  
 19  is_starred                drop (not enough data)  
 20  launched_at               launch date/time?  
 21  location                  location
 22  name                      project name 
 23  permissions               drop (not enough data) 
 24  photo                     maybe drop?
 25  pledged                   pledged amount in original currency
 26  profile                   drop? 
 27  slug                      drop (converted project name) 
 28  source_url                source url - drop 
 29  spotlight                 drop? (probably not influencable by the creators)   
 30  staff_pick                drop? (probably not influencable by the creators)   
 31  state                     !!!!!target!!!!! 
 32  state_changed_at          state changed at date/time?  
 33  static_usd_rate           conversion rate -> drop
 34  urls                      drop 
 35  usd_pledged               drop
 36  usd_type                  drop

In [None]:
df['current_currency'].unique()

In [None]:
no_comm = df[df['disable_communication'] == True]

In [None]:
len(no_comm)

In [None]:
comm = df[df['disable_communication'] == False]

In [None]:
len(comm)

### location example

{"id":2379574,
"name":"Chicago",
"slug":"chicago-il",
"short_name":"Chicago, IL",
"displayable_name":"Chicago, IL",
"localized_name":"Chicago",
"country":"US",
"state":"IL",
"type":"Town",
"is_root":false,
"urls":{"web":{"discover":"https://www.kickstarter.com/discover/places/chicago-il","location":"https://www.kickstarter.com/locations/chicago-il"},"api":{"nearby_projects":"https://api.kickstarter.com/v1/discover?signature=1552595044.c1041c6bca69b0b72738f3b9504ebf921b3e5e0e&woe_id=2379574"}}}'

In [None]:
df['category'].unique()

### category example

{"id":43,
"name":"Rock",
"slug":"music/rock",
"position":17,
"parent_id":14,
"color":10878931,
"urls":{"web":{"discover":"http://www.kickstarter.com/discover/categories/music/rock"}}}'

In [None]:
df['creator'].unique()

### creator example

{"id":1495925645,
"name":"Daniel",
"is_registered":null,
"chosen_currency":null,
"avatar":{"thumb":"https://ksr-ugc.imgix.net/assets/006/041/047/c44d1a95c2139ae46af635c7c6e7ea76_original.jpg?ixlib=rb-1.1.0&w=40&h=40&fit=crop&v=1461362658&auto=format&frame=1&q=92&s=3d655afafac9dbb59c1e675adfa87082","small":"https://ksr-ugc.imgix.net/assets/006/041/047/c44d1a95c2139ae46af635c7c6e7ea76_original.jpg?ixlib=rb-1.1.0&w=160&h=160&fit=crop&v=1461362658&auto=format&frame=1&q=92&s=3973d24f5c3db1ed1d5c84cec8af1d6d","medium":"https://ksr-ugc.imgix.net/assets/006/041/047/c44d1a95c2139ae46af635c7c6e7ea76_original.jpg?ixlib=rb-1.1.0&w=160&h=160&fit=crop&v=1461362658&auto=format&frame=1&q=92&s=3973d24f5c3db1ed1d5c84cec8af1d6d"},"urls":{"web":{"user":"https://www.kickstarter.com/profile/1495925645"},"api":{"user":"https://api.kickstarter.com/v1/users/1495925645?signature=1552621462.382152be8688b069cea0ab5f3a266a1530d83c3f"}}}',

In [None]:
df['photo'].isna().sum()

In [None]:
df['profile'].unique()

### profile example

{"id":822687,
"project_id":822687,
"state":"inactive",
"state_changed_at":1425915845,
"name":null,
"blurb":null,
"background_color":null,
"text_color":null,
"link_background_color":null,
"link_text_color":null,
"link_text":null,
"link_url":null,
"show_feature_image":false,
"background_image_opacity":0.8,
"should_show_feature_image_section":true,
"feature_image_attributes":{"image_urls":{"default":"https://ksr-ugc.imgix.net/assets/011/625/534/5bea1760d7f20943c4cd5e9b4911c1bd_original.jpg?ixlib=rb-1.1.0&crop=faces&w=1552&h=873&fit=crop&v=1463685705&auto=format&frame=1&q=92&s=90c72b785ef97539099dffb8531dcad3","baseball_card":"https://ksr-ugc.imgix.net/assets/011/625/534/5bea1760d7f20943c4cd5e9b4911c1bd_original.jpg?ixlib=rb-1.1.0&crop=faces&w=560&h=315&fit=crop&v=1463685705&auto=format&frame=1&q=92&s=883b3ff098e5fc4cf6fec280665f5fd0"}}}'

## list of features we keep

 - 2   category                  Kickstarter categories  
 - 4   country                   country  
 - 7   currency                  currency the project asks for
 - 31  state                     !!!!!target!!!!! 


 create:
 - goal in usd: fx_rate * goal
    - 14  fx_rate                   currency conversion rate 
    - 15  goal                      fixed amount required for funding (convert with fx_rate)
 
 - length of name: len(name)
    - 22  name                      project name 

- timedelta from launch to deadline


for later
  21  location                  location

## stakeholder

* who? - people/creators who are considering launching a project on Kickstarter
* why? - to find out if it's worth investing the time/money in creating materials/launching a project and which criteria to consider in order to make it successful
* metric? - f_beta (probably imbalanced data)
* model: classifier (binary)


# EDA - Round 1

In [None]:
# load data
df = pd.read_csv("data/combined_csv.csv")
df.info()

In [None]:
df.head()

In [None]:
df.columns

In [3]:
# drop columns we won't be using

df = df[['category', 'country', 'currency', 'deadline',
       'fx_rate', 'goal', 'launched_at', 
       'name', 'state']]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   category     209222 non-null  object 
 1   country      209222 non-null  object 
 2   currency     209222 non-null  object 
 3   deadline     209222 non-null  int64  
 4   fx_rate      209222 non-null  float64
 5   goal         209222 non-null  float64
 6   launched_at  209222 non-null  int64  
 7   name         209222 non-null  object 
 8   state        209222 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 14.4+ MB


In [5]:
## create goal in USD

df['goal_usd'] = df['goal'] * df['fx_rate']
df.head()

# drop fx_rate and goal
df.drop(['goal', 'fx_rate'], inplace=True, axis=1)
df.head()

Unnamed: 0,category,country,currency,deadline,launched_at,name,state,goal_usd
0,"{""id"":266,""name"":""Footwear"",""slug"":""fashion/fo...",US,USD,1552539775,1548223375,Babalus Children's Shoes,live,28000.0
1,"{""id"":273,""name"":""Playing Cards"",""slug"":""games...",US,USD,1504976459,1502384459,The Ofrenda Oracle Deck,successful,1000.0
2,"{""id"":43,""name"":""Rock"",""slug"":""music/rock"",""po...",US,USD,1371013395,1368421395,"Record Electra's Debut Album (Pop, Rock, Class...",successful,15000.0
3,"{""id"":273,""name"":""Playing Cards"",""slug"":""games...",GB,GBP,1489425776,1484245376,The Mist of Tribunal - A Card Game,failed,13083.9361
4,"{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",US,USD,1357763527,1355171527,Help change the face of Brain Impairment,successful,2800.0


In [7]:
# calculate length of name
name_len = []
for label, content in df['name'].iteritems():
    name_len.append(len(content))

df['name_len'] = name_len


# drop name column
df.drop(['name'], inplace=True, axis=1)
df.head()

Unnamed: 0,category,country,currency,deadline,launched_at,state,goal_usd,name_len
0,"{""id"":266,""name"":""Footwear"",""slug"":""fashion/fo...",US,USD,1552539775,1548223375,live,28000.0,24
1,"{""id"":273,""name"":""Playing Cards"",""slug"":""games...",US,USD,1504976459,1502384459,successful,1000.0,23
2,"{""id"":43,""name"":""Rock"",""slug"":""music/rock"",""po...",US,USD,1371013395,1368421395,successful,15000.0,51
3,"{""id"":273,""name"":""Playing Cards"",""slug"":""games...",GB,GBP,1489425776,1484245376,failed,13083.9361,34
4,"{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",US,USD,1357763527,1355171527,successful,2800.0,40


## Split category column to make it usable

In [8]:
# subset category column 
cat_df = df.loc[:,['category']]

# split category column
cat_df[['id','name','slug','position','parent_id','color','urls']] = df['category'].str.split(',',expand=True)

In [9]:
# check number of unique values
cat_df['slug'].nunique()

169

Problem:
- most rows have the format 'main category/ subcategory', but some do not have a subcategory
- this means splitting by '/' is difficult
- if we don't split any further, we have 169 unique values for category, seems okay

In [10]:
# subset column 'slug' as dataframe
cat = cat_df.loc[:,['slug']]

cat['slug'] = cat['slug'].str.replace(r'"slug":', '')
cat['slug'] = cat['slug'].str.replace(r'"', '')
cat['slug']

# rename 'slug' to 'category'
cat.rename(columns={'slug': 'category'}, inplace=True)

# drop column 'category' from original dataframe
df.drop('category', axis=1, inplace=True)

# add new column 'category' to original dataframe
df['category'] = cat
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   country      209222 non-null  object 
 1   currency     209222 non-null  object 
 2   deadline     209222 non-null  int64  
 3   launched_at  209222 non-null  int64  
 4   state        209222 non-null  object 
 5   goal_usd     209222 non-null  float64
 6   name_len     209222 non-null  int64  
 7   category     209222 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 12.8+ MB


In [12]:
#df.iloc[:, 3:19].head()
df.head()

Unnamed: 0,country,currency,deadline,launched_at,state,goal_usd,name_len,category
0,US,USD,1552539775,1548223375,live,28000.0,24,"""slug"":""fashion/footwear"""
1,US,USD,1504976459,1502384459,successful,1000.0,23,"""slug"":""games/playing cards"""
2,US,USD,1371013395,1368421395,successful,15000.0,51,"""slug"":""music/rock"""
3,GB,GBP,1489425776,1484245376,failed,13083.9361,34,"""slug"":""games/playing cards"""
4,US,USD,1357763527,1355171527,successful,2800.0,40,"""slug"":""publishing/nonfiction"""


In [13]:
## create column for duration (launch to deadline)

launched = []
deadline = []

In [16]:
# created_at launched_at deadline
for label, content in df['launched_at'].iteritems():
    launched.append(datetime.datetime.fromtimestamp(content))

In [17]:
# created_at launched_at deadline
for label, content in df['deadline'].iteritems():
    deadline.append(datetime.datetime.fromtimestamp(content))

In [18]:
for i in range(len(df)):
    df['launched_at'].iloc[[i]] = launched[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [22]:
for i in range(len(df)):
    df['deadline'].iloc[[i]] = deadline[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [24]:
df['delta_dead_laun'] = (df['deadline'] - df['launched_at']).astype('timedelta64[h]')

In [21]:
#delta = []

#for i in range(len(launched)):
 #   time = deadline[i] - launched[i]
 #   delta.append(time)

#delta


AttributeError: 'datetime.timedelta' object has no attribute 'astype'

In [25]:
df.head()

Unnamed: 0,country,currency,deadline,launched_at,state,goal_usd,name_len,category,delta_dead_laun
0,US,USD,2019-03-14 06:02:55,2019-01-23 07:02:55,live,28000.0,24,"""slug"":""fashion/footwear""",1199.0
1,US,USD,2017-09-09 19:00:59,2017-08-10 19:00:59,successful,1000.0,23,"""slug"":""games/playing cards""",720.0
2,US,USD,2013-06-12 07:03:15,2013-05-13 07:03:15,successful,15000.0,51,"""slug"":""music/rock""",720.0
3,GB,GBP,2017-03-13 18:22:56,2017-01-12 19:22:56,failed,13083.9361,34,"""slug"":""games/playing cards""",1439.0
4,US,USD,2013-01-09 21:32:07,2012-12-10 21:32:07,successful,2800.0,40,"""slug"":""publishing/nonfiction""",720.0


In [27]:
#export to csv
df.to_csv( "data/df_eda1.csv", index=False, encoding='utf-8-sig')

### Missing Values

In [None]:
df['state'].unique()

In [None]:
live = df.query('state == "live"')
len(live)

In [None]:
canceled = df.query('state == "canceled"')
len(canceled)

In [None]:
suspended = df.query('state == "suspended"')
len(suspended)

In [None]:
successful = df.query('state == "successful"')
len(successful)

In [None]:
failed = df.query('state == "failed"')
len(failed)

In [None]:
# convert 'state' to numerical
# successful: 1
# failed: 0
# drop: live, suspended, canceled

df = df.query('state != "live"')
df = df.query('state != "suspended"')
df = df.query('state != "canceled"')
print(df['state'].unique()) # check that 'state' only contains failed and successful

#df['state'].replace({'failed':0, 'successful':1}, inplace=True)
#print(df['state'].unique()) # check that 'state' only contains 1 and 0


In [None]:
# plot frequency of success and failure

sns.countplot(x='state', data=df)




In [None]:
from sklearn.dummy import DummyClassifier
X = df.drop('state', axis=1)
y = df['state']

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, y)

In [None]:
y_pred_dummy = dummy_clf.predict(X)

In [None]:
dummy_clf.score(X, y)

In [None]:
print(classification_report(y, y_pred_dummy))

In [None]:
cfm_dummy = confusion_matrix(y, y_pred_dummy)
cfm_dummy = cfm_dummy.astype('float') / cfm_dummy.sum(axis=1)[:, np.newaxis] # normalize the data

In [None]:
# view with a heatmap
plt.figure()
sns.heatmap(cfm_dummy, annot=True, annot_kws={"size":30}, 
        cmap='Blues', square=True, fmt='.3f')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion matrix for Dummy Classifier:');


## Baseline Model

* the value of your product

Our product helps potential kickstarter creators assess whether their project will get funding.
It supports them in finding out if it's worth investing the time and money to launch their project and which criteria to consider in order to make it successful.

* what will you predict, based on what, and evaluation metric used

We will predict whether a project will be successfully funded on Kickstarter.
Our model will be based on features like the funding goal, the currency the goal is in and the category of the project.
We will use the F1-score as the evaluation metric in order to get a good balance between precision and recall.
Both false positives (predicting that a project will be funded and the creator investing time and money to fail in the end) and false negatives (predicting that a project will be unsuccessful and preventing the creators from realizing their idea) are equally undesirable in this context.

* short description of baseline, and the score (based on eval metric)

Our baseline model is a dummy classifier predicting the most frequent class (in this case success).
It reaches an F1-score (our metric of interest) of 76% and an accuracy of 61%.