## Preparation of Datasets
--- ------------------

### A. Introduction
--- -------------------

In this notebook, we cleanse and prepare the raw Kickstarter dataset. Before running it, please make sure the raw data file (`Kickstarter_File.xlsx`) exists in the `data` folder.

#### B. Imports:
-- ----------

In [1]:
import pandas as pd
# Notebook-wide pandas display settings: do not truncate cell text or hide
# columns (None = no limit), and render at most 100 rows per frame.
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

In [2]:
# Locations of the raw Excel workbook and of its CSV copy.
raw_excel_path = './data/Kickstarter_File.xlsx'
raw_csv_path = './data/dataframe_raw.csv'

# Read the raw dataset from the Excel workbook. This step is slow.
df = pd.read_excel(raw_excel_path)

# Discard rows in which every column is empty, then write the frame out as a
# CSV file. This only needs to happen once, when the CSV copy does not exist yet.
df = df.dropna(how='all')
df.to_csv(raw_csv_path, index=False)

# ONCE the CSV copy exists, the frame can be loaded from it instead, which is
# much faster than reading the Excel workbook:
#df= pd.read_csv('./data/dataframe_raw.csv', low_memory=False)
#df.sample(5)

In [3]:
# Name the index so that it is labelled 'index' in displayed tables.
df.rename_axis('index', inplace=True)
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() -- doing so appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 184187 entries, 0 to 1048574
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   blurb                     184184 non-null  object        
 1   Environmental             2053 non-null    object        
 2   Social                    2053 non-null    object        
 3   state                     184186 non-null  object        
 4   Subcategory               184186 non-null  object        
 5   Unnamed: 5                176465 non-null  object        
 6   converted_pledged_amount  184186 non-null  float64       
 7   country                   184186 non-null  object        
 8   country_displayable_name  184186 non-null  object        
 9   created_at                184186 non-null  datetime64[ns]
 10  currency                  184186 non-null  object        
 11  deadline                  184186 non-null  datetime64[ns]
 12  fx_rat

In [4]:
# Dimensions of the loaded frame as (number of rows, number of columns).
df.shape

(184187, 24)

In [5]:
# Count the missing values in each row and keep the rows that have more than
# two of them, so they can be inspected.
missing_per_row = df.isna().sum(axis=1)
rows_na = df.loc[missing_per_row > 2]
rows_na

Unnamed: 0_level_0,blurb,Environmental,Social,state,Subcategory,Unnamed: 5,converted_pledged_amount,country,country_displayable_name,created_at,currency,deadline,fx_rate,goal,launched_at,duration,name,pledged,slug,staff_pick,state.1,static_usd_rate,usd_exchange_rate,usd_pledged
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2115,"Lisa Townsend Co. premieres -indifference-, a multi-media dance work, this March 22-25 as a CounterPulse Theater Artist in Residence.",,,successful,Dance,,5158.0,US,the United States,2012-01-23 23:57:34,USD,2012-03-16 04:20:37,1.000000,5000.0,2012-02-15 05:20:37,29.958333,Lisa Townsend Company / indifference,5158.00,lisa-townsend-company-indifference,1.0,successful,1.000000,1.000000,5.158000e+03
2121,"The perfect companion for anyone who wants an easy draw-from-the-hip, no strap solution to take your camera anywhere.",,,successful,Photography,,73413.0,US,the United States,2016-09-16 15:47:56,USD,2016-11-17 13:33:25,1.000000,30000.0,2016-10-18 12:33:25,30.041667,SpiderLight Holster,73413.00,spiderlight-holster,0.0,successful,1.000000,1.000000,7.341300e+04
2127,"The Ultimate Everyday Cast Iron - Lighter, Smoother, and Naturally Non-Stick",,,successful,Food,,1247444.0,US,the United States,2020-08-03 22:19:11,USD,2021-03-25 17:00:00,1.000000,10000.0,2021-02-23 15:50:49,30.048044,Prepd Chef Skillet,1247444.50,chef-skillet,1.0,successful,1.000000,1.000000,1.247444e+06
2142,Primera vuelta a España en crowdfunding,,,failed,Journalism,,0.0,ES,Spain,2017-05-01 17:12:25,EUR,2017-06-15 17:12:00,1.207659,1000.0,2017-05-07 18:37:31,38.940613,Primera vuelta a España en crowdfunding,0.00,primera-vuelta-a-espana-en-crowdfunding,0.0,failed,1.099700,1.122250,0.000000e+00
2179,A set of Moon Star Sleeping Creature Enamel Pins with Gold Star Charm included - 1st Gen Edition,,,successful,Art,,4468.0,US,the United States,2021-01-11 03:24:35,USD,2021-03-05 03:26:00,1.000000,500.0,2021-01-14 19:36:35,49.325984,Sleeping Creature Enamel Pins with Star Charm - 1st Gen,4468.00,moon-star-pokemon-enamel-pins-1st-gen-edition,0.0,successful,1.000000,1.000000,4.468000e+03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184014,"Get the 'Stupid Thin Man', 'It Was For You (Dolphin remix) and any future tracks onto Spotify and iTunes etc.",,,successful,Music,,525.0,GB,the United Kingdom,2020-08-10 08:08:16,GBP,2020-10-01 21:00:00,1.406145,200.0,2020-09-02 09:04:34,29.496829,Stupid Thin Man' Remix Release,406.00,stupid-thin-man-remix-release,0.0,successful,1.340831,1.293551,5.443775e+02
184041,Save the Bees - An enamel keychain with an important message. By supporting this kickstarter you are supporting the bees,,,successful,Art,,543.0,AU,Australia,2020-12-05 00:44:35,AUD,2021-02-15 02:42:12,0.772229,650.0,2021-01-19 02:42:12,27.000000,Save the Bees Stained Glass & Enamel Keychain and Pins,701.32,make-100-save-the-bees-stained-glass-and-enamel-keychain,0.0,successful,0.770300,0.775600,5.402266e+02
184078,We aim to help our ecosystems by creating more beekeepers with an authentic and engaging online beekeeping class.,,,successful,Film & Video,,3641.0,US,the United States,2020-12-14 17:00:15,USD,2021-01-16 01:49:21,1.000000,2800.0,2020-12-19 01:49:21,28.000000,Beekeeping Education Video for Beginners,3641.00,beginning-beekeeping-class-video,0.0,successful,1.000000,1.000000,3.641000e+03
184164,• Smart • Light • Breathable • Connected • Comfortable •,,,successful,Technology,,31718.0,US,the United States,2020-11-06 14:05:40,USD,2021-01-29 08:00:02,1.000000,10000.0,2020-12-15 08:00:02,45.000000,"xHale, Forget You're Wearing A Mask",31718.00,xhale-forget-youre-wearing-a-mask,0.0,successful,1.000000,1.000000,3.171800e+04


In [6]:
# Rename the columns to meaningful names.
# Note: 'state' becomes 'is_success' but still holds text labels
# (e.g. 'successful' / 'failed'), not booleans.
df = df.rename(columns={
    'Environmental': 'is_environmental',
    'Social': 'is_social',
    'state': 'is_success',
    'Unnamed: 5': 'main_category',
    'Subcategory': 'sub_category',
    'converted_pledged_amount': 'pledged_amount_usd',
    'goal': 'goal_in_local_currency',
    'duration': 'duration_in_days',
    'name': 'campaign_name',
    'pledged': 'pledged_in_local_currency',
})

# Show the column names (still in their original order) after renaming.
print('column_names:', list(df.columns))

# Reorder the columns: descriptive fields first, then categories, location,
# dates, amounts and rates, with the outcome column last.
# Selecting with an explicit list raises a KeyError if an expected column is missing.
df = df[[
    'campaign_name',
    'blurb',
    'slug',
    'main_category',
    'sub_category',
    'is_environmental',
    'is_social',
    'country',
    'country_displayable_name',
    'created_at',
    'launched_at',
    'deadline',
    'duration_in_days',
    'currency',
    'goal_in_local_currency',
    'pledged_in_local_currency',
    'usd_pledged',
    'pledged_amount_usd',
    'staff_pick',
    'state.1',
    'fx_rate',
    'static_usd_rate',
    'usd_exchange_rate',
    'is_success',
]]

# Display a few random rows to check the result (the only displayed output of
# this cell; an earlier mid-cell df.sample(2) was discarded and has been removed).
df.sample(4)

column_names: ['blurb', 'is_environmental', 'is_social', 'is_success', 'sub_category', 'main_category', 'pledged_amount_usd', 'country', 'country_displayable_name', 'created_at', 'currency', 'deadline', 'fx_rate', 'goal_in_local_currency', 'launched_at', 'duration_in_days', 'campaign_name', 'pledged_in_local_currency', 'slug', 'staff_pick', 'state.1', 'static_usd_rate', 'usd_exchange_rate', 'usd_pledged']


Unnamed: 0_level_0,campaign_name,blurb,slug,main_category,sub_category,is_environmental,is_social,country,country_displayable_name,created_at,launched_at,deadline,duration_in_days,currency,goal_in_local_currency,pledged_in_local_currency,usd_pledged,pledged_amount_usd,staff_pick,state.1,fx_rate,static_usd_rate,usd_exchange_rate,is_success
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
119202,Turtle Bay Playing Cards,"A truly elegant, fully custom/marked deck of playing cards, meant for the art of magic, cardistry, and gameplay. Printed by Cartamundi.",turtle-bay-playing-cards-0,Games,Playing Cards,,,US,the United States,2020-06-12 22:34:03,2020-09-19 16:00:44,2020-10-19 16:00:44,30.0,USD,5000.0,7029.0,7029.0,7029.0,0.0,successful,1.0,1.0,1.0,successful
32517,Kyle Riggins - Senior Film,My senior film project.,kyle-riggins-senior-film,Film & Video,Horror,,,US,the United States,2016-11-09 03:11:00,2016-11-14 01:35:25,2016-12-14 01:35:25,30.0,USD,500.0,500.0,500.0,500.0,0.0,successful,1.0,1.0,1.0,successful
10670,ItsMyChance.com,ItsMyChance.com - Real prizes awarded weekly to players with the highest score!,itsmychancecom,Games,Mobile Games,,,US,the United States,2014-12-05 21:07:07,2014-12-31 17:52:16,2015-01-30 17:52:16,30.0,USD,25000.0,305.0,305.0,305.0,0.0,failed,1.0,1.0,1.0,failed
51534,Wax Rewind - The Video Cassette Candle,Get super nostalgic with this wax candle to scale replica of a VHS Tape!,wax-rewind-the-video-cassette-candle,Crafts,Candles,,,US,the United States,2019-04-27 18:38:32,2019-04-29 15:15:49,2019-05-29 15:15:49,30.0,USD,15000.0,82.0,82.0,82.0,0.0,failed,1.0,1.0,1.0,failed


In [7]:
#Drop columns which do not add value to the analysis
#---------------------------------------------------
#1. 'country' and 'country_displayable_name'.
# We need only one of these; but save the country codes for later reference
# as a tab-separated lookup table (one row per distinct name/code pair).
country_codes = (
    df[['country_displayable_name', 'country']]
    .drop_duplicates()
    .reset_index(drop=True)
)
country_codes.to_csv('./data/country_codes.txt', sep='\t', index=False)

#---------------------------------------------------
#2. 'created_at', 'launched_at', 'deadline', 'duration_in_days'
# created_at and launched_at are treated as interchangeable here, on the
# assumption that they are at most a few days apart and therefore too close to
# affect the results we look for. 'duration_in_days' provides the difference in
# days between launched_at and deadline, and we keep this parameter (for now).
# NOTE(review): some sample rows show created_at and launched_at months apart --
# confirm that dropping both dates is acceptable.

#-------------------------------------------------
#3. 'currency', 'goal_in_local_currency', 'pledged_in_local_currency', 'usd_pledged',
# 'pledged_amount_usd' (raw name: 'converted_pledged_amount'),
# 'fx_rate', 'static_usd_rate', 'usd_exchange_rate'
# The goal is given only in local currency, whereas the pledged amount is given
# in both local currency and usd.
# We add a new column, 'goal_usd', which gives the goal in usd as well. It is
# obtained by multiplying 'goal_in_local_currency' by the provided
# 'usd_exchange_rate' (Logic: the author's converted pledged amount is the product
# of 'usd_exchange_rate' and 'pledged_in_local_currency').
df['goal_usd'] = df['goal_in_local_currency'] * df['usd_exchange_rate']
# We retain, in the end, 'goal_usd' and 'pledged_amount_usd' and drop the other
# currency, exchange-rate, goal and pledged-amount columns in local currency.

#-------------------------------------------------
#4. 'staff_pick' and 'state.1'
# These columns are dropped, since 'state.1' is a repetition of the column
# 'is_success' and 'staff_pick' does not seem to add value to the analysis at hand.

#-------------------------------------------------
#5. 'slug' and 'campaign_name'
# 'slug' is a repetition of 'campaign_name'; it is dropped.

#-------------------------------------------------
#Drop unwanted columns
columns_to_drop = [
    'country_displayable_name',
    'slug',
    'created_at',
    'launched_at',
    'deadline',
    'currency',
    'goal_in_local_currency',
    'pledged_in_local_currency',
    'usd_pledged',
    'staff_pick',
    'state.1',
    'fx_rate',
    'static_usd_rate',
    'usd_exchange_rate',
]
# errors='ignore' skips any listed column that is not present, which is what the
# former per-column "if column in df.columns" loop did.
df = df.drop(columns=columns_to_drop, errors='ignore')

#-------------------------------------------------
#Reorder columns
print('column_names:', list(df.columns))
df = df[[
    'campaign_name',
    'blurb',
    'main_category',
    'sub_category',
    'is_environmental',
    'is_social',
    'country',
    'duration_in_days',
    'goal_usd',
    'pledged_amount_usd',
    'is_success',
]]
#Round floating number values to 2 decimal places
df = df.round(2)
df.sample(3)

column_names: ['campaign_name', 'blurb', 'main_category', 'sub_category', 'is_environmental', 'is_social', 'country', 'duration_in_days', 'pledged_amount_usd', 'is_success', 'goal_usd']


Unnamed: 0_level_0,campaign_name,blurb,main_category,sub_category,is_environmental,is_social,country,duration_in_days,goal_usd,pledged_amount_usd,is_success
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
136520,"Amen Code - Life, Death, Judgment, and the Hidden Heavens",The Advanced Teachings of Jesus Christ - Online Video Series,Film & Video,Webseries,,,US,60.0,11300.0,31.0,failed
69907,CLOSER TO HOME: A Community Filmmaking Project,CLOSER TO HOME is a narrative feature made with teenagers in the San Fernando Valley through a process we call community filmmaking,Film & Video,Narrative Film,,,US,21.0,21000.0,21405.0,successful
173432,Chimayó Street Grill's Patio Rebuild,"We are renovating our patio dining area to be ready for another beautiful summer! People love our food, now they need a place to enjoy!",Food,Restaurants,,,US,30.0,5000.0,1.0,failed


In [8]:
# Persist the processed frame: remove rows in which every remaining column is
# empty, then write the result to CSV without the index column.
stripped_features_path = './data/dataframe_stripped_features.csv'
df = df.dropna(how='all')
df.to_csv(stripped_features_path, index=False)