In [1]:
import pandas as pd
import glob

# Disable pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None

# Read every CSV export under Kickstarter_data/ and stack them into one frame.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them to
# keep the row order of the combined frame reproducible between runs.
allfiles = sorted(glob.glob('Kickstarter_data/*.csv'))
if not allfiles:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise ValueError("No CSV files matched 'Kickstarter_data/*.csv'")
df = pd.concat((pd.read_csv(f) for f in allfiles), ignore_index=True)


In [2]:
# Quick sanity checks on the freshly loaded frame.

# Dimensions as (rows, columns)
print(df.shape)

# Column labels
print(df.columns)

# How many projects ended up in each state
print(df['state'].value_counts())

# First and last five rows, to get a feel for the values
print(df.head(5))
print(df.tail(5))

(195614, 37)
Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'fx_rate', 'goal', 'id', 'is_starrable',
       'launched_at', 'name', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_pledged', 'usd_type', 'location',
       'friends', 'is_backing', 'is_starred', 'permissions'],
      dtype='object')
successful    105680
failed         73634
canceled        8823
live            6876
suspended        601
Name: state, dtype: int64
   backers_count                                              blurb  \
0             80  I will be an artist-in-residence at Elsewhere ...   
1             47  We are looking to bring a Visiting Sculptor fr...   
2             80  Surrealistic oil paintings capturing 

In [3]:
# Discard columns that carry irrelevant or duplicated information,
# e.g. the photo URL or the pledged amount in a non-USD currency.
# 'urls' is deliberately kept: it can be used to check the state of live projects.
unnecessary_keys = [
    'currency_symbol',
    'currency_trailing_code',
    'current_currency',
    'id',
    'name',
    'photo',
    'pledged',
    'profile',
    'slug',
    'source_url',
    'static_usd_rate',
    'usd_type',
    'friends',
    'is_backing',
    'is_starred',
    'permissions',
    'converted_pledged_amount',
]

# Remove the columns from df itself and show what is left.
df.drop(columns=unnecessary_keys, inplace=True)
print(df.columns)

Index(['backers_count', 'blurb', 'category', 'country', 'created_at',
       'creator', 'currency', 'deadline', 'disable_communication', 'fx_rate',
       'goal', 'is_starrable', 'launched_at', 'spotlight', 'staff_pick',
       'state', 'state_changed_at', 'urls', 'usd_pledged', 'location'],
      dtype='object')


In [4]:
# Clean some features: these columns are stored as JSON strings
import json

def filter_json(val, key):
    """Decode the JSON text in ``val`` and return the entry stored under ``key``.

    Args:
        val: JSON document as text (anything ``json.loads`` accepts).
        key: Key (or index) to look up in the decoded object.

    Returns:
        The decoded value found at ``key``.

    Errors from ``json.loads`` (e.g. non-text input, malformed JSON) and from
    the lookup itself (e.g. a missing key) are not caught here.
    """
    return json.loads(val)[key]

# Category, creator, location, urls: each cell holds a JSON document, and only
# one field of it is kept.
# Series.map visits just the column being converted; df.apply(..., axis=1)
# would build a full row object for every record only to read one value from it.
df['category'] = df['category'].map(lambda raw: filter_json(raw, 'slug'))
df['creator'] = df['creator'].map(lambda raw: filter_json(raw, 'id'))
# Rows without a location hold a missing value (NaN) rather than JSON text;
# they are mapped to an empty string.
df['location'] = df['location'].map(
    lambda raw: '' if pd.isna(raw) else filter_json(raw, 'displayable_name')
)
df['urls'] = df['urls'].map(lambda raw: filter_json(raw, 'web'))

In [5]:
# Derive the blurb's word count (whitespace-separated tokens).
# Rows whose blurb is not text (e.g. missing values, which are NaN) count as 0 words.
# NOTE: the column name keeps its existing spelling ('blurb_lenght') because it
# ends up as a header in the exported CSV files.
# Series.map reads only the 'blurb' column instead of building every full row.
df['blurb_lenght'] = df['blurb'].map(
    lambda text: len(text.split()) if isinstance(text, str) else 0
)

In [6]:
# Add column 'duration', that contains duration between created_at and deadline in days

import datetime

def count_days(start, end):
    """Return the number of full days between two Unix timestamps.

    Args:
        start: Start of the interval, in seconds since the Unix epoch.
        end: End of the interval, in seconds since the Unix epoch.

    Returns:
        Whole days in ``end - start`` (``timedelta.days``); a partial day is
        not counted.
    """
    # Convert in UTC. A naive fromtimestamp() uses the machine's local time
    # zone, so a DST change inside the interval would shift the difference by
    # an hour and could change the full-day count from one machine to another.
    utc = datetime.timezone.utc
    duration = (datetime.datetime.fromtimestamp(end, tz=utc)
                - datetime.datetime.fromtimestamp(start, tz=utc))
    # count only full days
    return duration.days

# Iterate over just the two timestamp columns; df.apply(..., axis=1) would
# construct a full row object per record only to read these two values.
df['duration'] = [
    count_days(created, deadline)
    for created, deadline in zip(df['created_at'], df['deadline'])
]

In [7]:
# Separate finished and live projects.
# First drop the raw blurb and timestamp columns; earlier cells derived
# 'blurb_lenght' and 'duration' from blurb / created_at / deadline.
df = df.drop(columns=['blurb', 'created_at', 'deadline', 'state_changed_at'])

is_live = df['state'] == 'live'
# .copy() gives each subset its own data, so later edits to `finished` /
# `unfinished` neither depend on nor leak into `df`.
finished = df.loc[~is_live].copy()
unfinished = df.loc[is_live].copy()
print(finished.shape)
print(unfinished.shape)

(188738, 18)
(6876, 18)


In [8]:
# Encode the outcome as a binary label: 1 = successful, 0 = every other state.
# One whole-column assignment replaces the chained form
# finished['state'][mask] = value, which writes through an intermediate object
# and is not guaranteed to modify `finished` (with pandas Copy-on-Write enabled
# it changes nothing).
finished['state'] = (finished['state'] == 'successful').astype(int)
print(finished['state'].value_counts())

1    105680
0     83058
Name: state, dtype: int64


In [9]:
# Write both subsets to CSV files in the working directory, without the index column.
for frame, target in ((finished, 'finished.csv'), (unfinished, 'unfinished.csv')):
    frame.to_csv(target, index=False)