In [1]:
# preparing the environment: pandas/numpy for data handling, seaborn/matplotlib for plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# seed NumPy's global random generator so random draws made later in the
# notebook (e.g. the .sample(100) preview) are repeatable on a fresh run
np.random.seed(0)

In [2]:
# load the raw Kickstarter export and preview its first five rows
df = pd.read_csv(filepath_or_buffer='ks-projects-201801.csv')
df.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


# Missing Values

In [4]:
# number of NaN cells in each column
missing_values_count = df.isna().sum()
print(missing_values_count)

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64


4 values are missing in column "name", 3797 values are missing in column "usd pledged"

In [5]:
# share of missing cells per column, expressed in percent
print(missing_values_count.div(len(df)).mul(100))

ID                  0.000000
name                0.001056
category            0.000000
main_category       0.000000
currency            0.000000
deadline            0.000000
goal                0.000000
launched            0.000000
pledged             0.000000
state               0.000000
backers             0.000000
country             0.000000
usd pledged         1.002744
usd_pledged_real    0.000000
usd_goal_real       0.000000
dtype: float64


In [6]:
# total missing value in percentage
# df.size is the number of cells (rows * columns). It replaces np.product(df.shape):
# np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = df.size
# missing_values_count holds the per-column NaN counts; their sum is the total number of NaN cells
total_missing = missing_values_count.sum()

print((total_missing/total_cells) * 100, '%')

0.06692001552840139 %


0.067% of total values are missing, 0.0011% of values missing within "name", 1% of values missing within "usd pledged"

Let's take a closer look into the entries with missing values

In [8]:
# rows whose "name" is NaN
missing_name = df.loc[df['name'].isna()]
missing_name

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
166851,1848699072,,Narrative Film,Film & Video,USD,2012-02-29,200000.0,2012-01-01 12:35:31,100.0,failed,1,US,100.0,100.0,200000.0
307234,634871725,,Video Games,Games,GBP,2013-01-06,2000.0,2012-12-19 23:57:48,196.0,failed,12,GB,317.73,316.05,3224.97
309991,648853978,,Product Design,Design,USD,2016-07-18,2500.0,2016-06-18 05:01:47,0.0,suspended,0,US,0.0,0.0,2500.0
338931,796533179,,Painting,Art,USD,2011-12-05,35000.0,2011-11-06 23:55:55,220.0,failed,5,US,220.0,220.0,35000.0


Since these are still **real projects**, we are going to keep the projects with missing names

In [9]:
# rows whose "usd pledged" is NaN; preview a random sample of 100 of them
missing_usd_pledged = df.loc[df['usd pledged'].isna()]
missing_usd_pledged.sample(n=100)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
358526,896485300,Protecting Cedar Mesa’s Remnants of the Past –...,Film & Video,Film & Video,USD,2014-12-06,7500.0,2014-11-12 00:08:11,8729.00,undefined,0,"N,0""",,8729.00,7500.00
78899,1401451303,Rebecca Lawrence's Solo Project,Music,Music,USD,2015-04-02,2000.0,2015-03-03 21:20:51,1.00,undefined,0,"N,0""",,1.00,2000.00
91475,1464820550,The Love Will Not Be Defeated Tour,Music,Music,USD,2015-05-01,3000.0,2015-04-01 02:57:39,3045.00,undefined,0,"N,0""",,3045.00,3000.00
18136,1092014622,Becoming a Woman Whose God is Enough - Teachin...,Film & Video,Film & Video,USD,2014-09-15,5325.0,2014-08-15 01:11:22,7525.00,undefined,0,"N,0""",,7525.00,5325.00
233099,25481307,Oliveye's music album debut: Come Undone,Music,Music,CAD,2015-03-23,5000.0,2015-02-11 20:33:43,5167.00,undefined,0,"N,0""",,4130.63,3997.12
3825,1019686062,Fairest the Musical Full Cast Concept Album,Music,Music,USD,2015-06-22,4000.0,2015-05-23 06:35:07,0.00,undefined,0,"N,0""",,0.00,4000.00
13754,1069219698,Everyday Experts,Publishing,Publishing,GBP,2015-09-25,1500.0,2015-08-26 19:45:01,0.00,undefined,0,"N,0""",,0.00,2276.49
28652,1145448158,"Orange Stuff ""Here"", album pre-sale",Music,Music,CAD,2014-10-25,4000.0,2014-09-10 15:37:55,4205.00,undefined,0,"N,0""",,3729.82,3547.99
337884,791103538,Saturn's Rings (A Film by Jake Burgess),Film & Video,Film & Video,CAD,2016-04-17,3000.0,2016-03-03 00:04:02,5750.99,undefined,0,"N,0""",,4516.25,2355.90
128804,1653954505,"""Say Hello"" Music Video",Film & Video,Film & Video,USD,2014-10-15,7635.0,2014-09-13 02:13:31,25.00,undefined,0,"N,0""",,25.00,7635.00


These values appear to be genuinely missing. Since they make up only a very small portion of the data, the affected rows could be dropped before visual analysis

Also, there is something odd about the column, "country", let's take a closer look

In [11]:
# list the distinct values of the categorical "country" column to spot anything unusual
print(df['country'].unique())

['GB' 'US' 'CA' 'AU' 'NO' 'IT' 'DE' 'IE' 'MX' 'ES' 'N,0"' 'SE' 'FR' 'NL'
 'NZ' 'CH' 'AT' 'DK' 'BE' 'HK' 'LU' 'SG' 'JP']


Within "country", there is an odd value - 'N,0"', let's inspect that

In [12]:
# number of rows carrying the odd country value
df.loc[df['country'] == 'N,0"'].shape[0]

3797

In [13]:
# number of rows with NaN in "usd pledged" - compare with the count of odd country values
df.loc[df['usd pledged'].isna()].shape[0]

3797

It's highly likely that the "NaN" values in "usd pledged" co-occur with the 'N,0"' values in "country"; let's test that

In [14]:
# rows with the odd country value, and rows with NaN in "usd pledged"
df_this = df.loc[df['country'] == 'N,0"']
df_that = df.loc[df['usd pledged'].isna()]
# count the positions where the two selections disagree on ID (same ID, same entry);
# the element-wise comparison requires both selections to carry identical row labels
(df_this['ID'] != df_that['ID']).sum()

0

The test confirmed the assumption

According to the data set overview on Kaggle (https://www.kaggle.com/kemical/kickstarter-projects/home), "usd pledged" comes from a conversion through a set of algorithms developed by Kickstarter, and "usd_pledged_real" comes from a conversion through some other algorithm. The fact that only "usd pledged" is missing and not "usd_pledged_real" implies that "usd pledged" is missing due to issues with Kickstarter's conversion algorithm, not due to missing source data (data before conversion). Because "usd_pledged_real" has the same meaning as "usd pledged" and we can just use the former whenever appropriate, we don't necessarily have to remove the entries missing "usd pledged".

However, we might still need to drop something - since this project aims to build a model that predicts the chance of success of a project, we need to drop the **projects that haven't reached an eventual state** yet. Let's inspect the "state" column before making further decisions

In [15]:
# frequency of each project state
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

Judging by the value counts of "state", we will keep the failed, successful and canceled projects. The reasons for dropping the rest are as follows:
- "undefined": no conclusion can be made on the eventual states of the projects
- "live": the projects are still in progress, thus no conclusion can be made on the eventual states of the projects
- "suspended": chances are the suspensions would end and the projects can be live again, thus no conclusion can be made on the eventual states of the projects

In [16]:
# size of the subset (live / undefined / suspended projects) that is about to be removed
df_drop = df[df['state'].isin(['live', 'undefined', 'suspended'])]
amount_drop = df_drop['state'].count()
print(amount_drop, " of entries will be dropped")
print("which accounts for ", amount_drop / len(df) * 100, "% of original data volume")

8207  of entries will be dropped
which accounts for  2.167373983589543 % of original data volume


In [17]:
# remove the live / undefined / suspended projects, reporting the frame length before and after
print("length of original data frame: ", len(df))
indices = df.loc[df['state'].isin(['live', 'undefined', 'suspended'])].index
df = df.drop(index=indices)
print("length of data frame after dropping the undesired entries",len(df))

length of original data frame:  378661
length of data frame after dropping the undesired entries 370454


In [18]:
# remaining project states after the drop
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
Name: state, dtype: int64

Now we have to fix the weird value, 'N,0"', in the "country" column. One strategy is to replace the value based on its currency (infer the country from the currency). However, **one currency might be used by multiple countries**, such as the Euro. Moreover, **some projects from countries with other currencies might pledge in US dollars due to its popularity**. Therefore, this is not the safest strategy

Another strategy is to categorize these entries, in the "country" column, as "unknown". This approach, which makes no assumptions, avoids assignment errors. If the portion of the data with 'N,0"' is small, then this approach is more desirable.

In [19]:
# how many of the remaining rows still carry the odd country value, and what share of the data that is
df_inspect = df.loc[df['country'] == 'N,0"']
print(df_inspect.shape[0], ' projects have this oddity in "country" column')
print('which accounts for only', df_inspect.shape[0] / len(df) * 100, '% of the data volume')

232  projects have this oddity in "country" column
which accounts for only 0.0626258590810195 % of the data volume


Therefore, changing 'N,0"' into "unknown" will be our strategy

In [20]:
# change 'N,0"' into 'unknown' in "country" column
# The replacement is applied to the "country" column only: a frame-wide
# df.replace(...) would also rewrite any cell in another column (e.g. a
# project name) that happens to equal 'N,0"'.
# assign() overwrites the existing column in place, so the column order is unchanged.
df = df.assign(country=df['country'].replace('N,0"', 'unknown'))
df.country.value_counts()

US         289671
GB          33215
CA          14508
AU           7681
DE           4057
FR           2873
NL           2807
IT           2775
ES           2204
SE           1718
MX           1632
NZ           1425
DK           1083
IE            793
CH            736
NO            694
BE            597
AT            576
HK            564
SG            521
unknown       232
LU             61
JP             31
Name: country, dtype: int64

Now let's drop a couple columns, the justifications are as below 
- "goal": the analysis will be using "usd_goal_real", which is "goal" converted into usd 
- "usd pledged": some values missing, also the analysis will be using "usd_pledged_real"
- "pledged": the analysis will be using "usd_pledged_real", which is "pledged" converted into usd

In [21]:
# remove the columns superseded by their "*_real" USD counterparts, then list what is left
df = df.drop(['goal', 'usd pledged', 'pledged'], axis='columns')
df.columns

Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'launched', 'state', 'backers', 'country', 'usd_pledged_real',
       'usd_goal_real'],
      dtype='object')

# Dates
There appear to be two date columns - "deadline" and "launched"; convert them into datetime objects if necessary

In [23]:
# raw form of both date columns: deadline first, then launch date
print(df['deadline'].head(), '\n')
print(df['launched'].head())

0    2015-10-09
1    2017-11-01
2    2013-02-26
3    2012-04-16
4    2015-08-29
Name: deadline, dtype: object 

0    2015-08-11 12:12:28
1    2017-09-02 04:43:57
2    2013-01-12 00:20:50
3    2012-03-17 03:24:11
4    2015-07-04 08:35:03
Name: launched, dtype: object


### Parsing Dates

In [24]:
# deadline: date-only strings such as "2015-10-09"
parsing_deadline = pd.to_datetime(df.deadline, format="%Y-%m-%d")

# parse into year to see differences across years
df['deadline_year'] = parsing_deadline.dt.year

# parse into month to see seasonal differences
df['deadline_month'] = parsing_deadline.dt.month

# launch_date: full timestamps such as "2015-08-11 12:12:28".
# The format has to include the time part; a date-only format ('%Y-%m-%d')
# leaves " 12:12:28" unconverted, which current pandas rejects with a ValueError.
parsing_launch = pd.to_datetime(df.launched, format='%Y-%m-%d %H:%M:%S')

# parse into year to see differences across years
df['launch_year'] = parsing_launch.dt.year

# parse into month to see differences across months
df['launch_month'] = parsing_launch.dt.month

In [25]:
# preview the first five rows, now including the derived year/month columns
df.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,launched,state,backers,country,usd_pledged_real,usd_goal_real,deadline_year,deadline_month,launch_year,launch_month
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,0,GB,0.0,1533.95,2015,10,2015,8
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02 04:43:57,failed,15,US,2421.0,30000.0,2017,11,2017,9
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12 00:20:50,failed,3,US,220.0,45000.0,2013,2,2013,1
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,2012-03-17 03:24:11,failed,1,US,1.0,5000.0,2012,4,2012,3
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,2015-07-04 08:35:03,canceled,14,US,1283.0,19500.0,2015,8,2015,7


In [27]:
# information on df: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 370454 entries, 0 to 378660
Data columns (total 16 columns):
ID                  370454 non-null int64
name                370451 non-null object
category            370454 non-null object
main_category       370454 non-null object
currency            370454 non-null object
deadline            370454 non-null object
launched            370454 non-null object
state               370454 non-null object
backers             370454 non-null int64
country             370454 non-null object
usd_pledged_real    370454 non-null float64
usd_goal_real       370454 non-null float64
deadline_year       370454 non-null int64
deadline_month      370454 non-null int64
launch_year         370454 non-null int64
launch_month        370454 non-null int64
dtypes: float64(2), int64(6), object(8)
memory usage: 48.0+ MB


In [28]:
# Persist the cleaned frame to disk (the row index is written as well, since index is left at its default).
# NOTE(review): the encoding literal carries a trailing space ("utf-8 "); it is kept
# unchanged here - confirm it is unintentional and tidy it to "utf-8".
df.to_csv(path_or_buf="ks-projects-201801-ready.csv", encoding="utf-8 ")