In [3]:
# preparing the environment
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so that random operations which
# fall back on it (e.g. pandas `.sample()` called without `random_state`) give
# the same result on every fresh run of the notebook.
np.random.seed(0)

In [26]:
# Load the projects CSV into a DataFrame and preview its first rows
raw_csv_path = 'ks-projects-201801.csv'
df = pd.read_csv(raw_csv_path)
df.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [11]:
# Missing values: count the NaN entries in each column
missing_values_count = df.isna().sum(axis=0)
print(missing_values_count)

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64


In [14]:
# Per-column share of missing entries, as a percentage of the number of rows
row_total = df.shape[0]
print(missing_values_count.div(row_total).mul(100))

ID                  0.000000
name                0.001056
category            0.000000
main_category       0.000000
currency            0.000000
deadline            0.000000
goal                0.000000
launched            0.000000
pledged             0.000000
state               0.000000
backers             0.000000
country             0.000000
usd pledged         1.002744
usd_pledged_real    0.000000
usd_goal_real       0.000000
dtype: float64

In [16]:
# Overall share of missing cells, as a percentage of every cell in the frame
# (rows * columns).
# `np.prod` replaces `np.product`: the latter was deprecated in NumPy 1.25 and
# removed in NumPy 2.0, so the old call raises AttributeError on current NumPy.
total_cells = np.prod(df.shape)
total_missing = missing_values_count.sum()

(total_missing/total_cells) * 100

0.066920015528401391

Overall, about 0.067% of all cells are missing: roughly 0.0011% of the rows are missing "name" and about 1% are missing "usd pledged".

Let's take a closer look at the entries with missing values.

In [22]:
# Rows whose "name" is missing
name_is_missing = df['name'].isna()
missing_name = df.loc[name_is_missing]
missing_name

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
166851,1848699072,,Narrative Film,Film & Video,USD,2012-02-29,200000.0,2012-01-01 12:35:31,100.0,failed,1,US,100.0,100.0,200000.0
307234,634871725,,Video Games,Games,GBP,2013-01-06,2000.0,2012-12-19 23:57:48,196.0,failed,12,GB,317.73,316.05,3224.97
309991,648853978,,Product Design,Design,USD,2016-07-18,2500.0,2016-06-18 05:01:47,0.0,suspended,0,US,0.0,0.0,2500.0
338931,796533179,,Painting,Art,USD,2011-12-05,35000.0,2011-11-06 23:55:55,220.0,failed,5,US,220.0,220.0,35000.0


In [27]:
# Rows whose "usd pledged" is missing; display a random sample of 100 of them
usd_pledged_is_missing = df['usd pledged'].isna()
missing_usd_pledged = df.loc[usd_pledged_is_missing]
missing_usd_pledged.sample(n=100)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
204724,2042940046,Clockwork Rifle on Vinyl,Music,Music,GBP,2015-05-17,1500.0,2015-04-17 22:15:56,51.00,undefined,0,"N,0""",,79.76,2346.02
175669,1893532960,ANALIA'S INVITATIONS,Publishing,Publishing,USD,2015-04-26,3850.0,2015-03-27 20:58:35,1.00,undefined,0,"N,0""",,1.00,3850.00
307665,637141451,(Meta)-Amorphosis,Publishing,Publishing,GBP,2015-05-23,3000.0,2015-03-24 20:05:33,0.00,undefined,0,"N,0""",,0.00,4577.22
308935,64367021,EP Ray Comedy Music Album!,Music,Music,USD,2015-09-16,2000.0,2015-08-17 19:23:31,0.00,undefined,0,"N,0""",,0.00,2000.00
47937,124435438,Forty-nine fifty,Film & Video,Film & Video,USD,2015-10-09,3200.0,2015-10-02 15:11:15,0.00,undefined,0,"N,0""",,0.00,3200.00
91475,1464820550,The Love Will Not Be Defeated Tour,Music,Music,USD,2015-05-01,3000.0,2015-04-01 02:57:39,3045.00,undefined,0,"N,0""",,3045.00,3000.00
56529,1287578139,New Music Video Production,Music,Music,USD,2015-06-01,4000.0,2015-04-29 03:38:21,4001.00,undefined,0,"N,0""",,4001.00,4000.00
68199,1347068641,Sketch - New Album and Launch!,Music,Music,GBP,2015-04-29,5000.0,2015-03-30 19:24:16,186.00,undefined,0,"N,0""",,287.05,7716.41
184154,1937729243,Animals EP,Music,Music,GBP,2015-03-09,800.0,2015-02-07 15:17:18,1182.00,undefined,0,"N,0""",,1747.10,1182.47
254977,366641984,Good Company CD Project,Music,Music,USD,2014-10-11,1500.0,2014-09-06 02:57:31,1825.00,undefined,0,"N,0""",,1825.00,1500.00


These values appear to be genuinely missing rather than recorded in some other form. Since they make up only a very small portion of the data, the affected rows will be dropped before the visual analysis.

In [33]:
# List the distinct values of the categorical "country" column to spot anything unusual
country_values = df['country'].unique()
print(country_values)

['GB' 'US' 'CA' 'AU' 'NO' 'IT' 'DE' 'IE' 'MX' 'ES' 'N,0"' 'SE' 'FR' 'NL'
 'NZ' 'CH' 'AT' 'DK' 'BE' 'HK' 'LU' 'SG' 'JP']


In [35]:
# "country" contains an odd value - 'N,0"' - so inspect it,
# starting with how many rows carry that value
odd_country_rows = df.loc[df['country'] == 'N,0"']
len(odd_country_rows)

3797

In [37]:
# a familiar number: count the rows where "usd pledged" is missing
df.loc[df['usd pledged'].isna()].shape[0]

3797

In [42]:
# Hypothesis: the NaN values in "usd pledged" occur on exactly the rows whose
# "country" is 'N,0"'. Test it by comparing the two row masks over the whole frame.
odd_country_mask = df['country'] == 'N,0"'
missing_usd_pledged_mask = df['usd pledged'].isnull()
df_this = df[odd_country_mask]
df_that = df[missing_usd_pledged_mask]
# Number of rows where the two conditions disagree; 0 means they always co-occur.
# The previous check, `(df_this.ID != df_that.ID).sum()`, could not count
# disagreements: pandas only compares identically-labeled Series, so it either
# raised ValueError or, when the labels matched (i.e. the very same rows were
# selected), trivially returned 0.
(odd_country_mask != missing_usd_pledged_mask).sum()

0

The test confirms the assumption, so dropping the entries with missing values will also take care of this oddity.

In [45]:
# Drop every row that has at least one missing value, printing the shape before
# and after so the effect of the drop is visible.
# NOTE: this rebinds `df`, so re-running only this cell prints the same shape
# twice; restart the kernel and run all cells to see the pre-drop shape.
print(df.shape)
df = df.dropna()
print(df.shape)

# Verify what the drop is expected to achieve: no missing values remain and the
# odd 'N,0"' country value is gone as well.
assert not df.isnull().values.any(), "missing values remain after dropna()"
assert (df['country'] != 'N,0"').all(), "odd country value survived dropna()"

# write it to a new file
# index=False keeps the row index out of the CSV; otherwise it is written as an
# extra unnamed first column that shows up as "Unnamed: 0" when the file is
# read back with pd.read_csv.
df.to_csv('ks-projects-201801-na-dropped.csv', index=False)

(374860, 15)
(374860, 15)
