# Data Import

In [88]:
import pandas as pd

import datetime as dt

In [89]:
# Emilys datapath

data_path = ("C:\\Users\\EAFle\\U3S4_BW\\GitHub_Repo\\Kickstarter\\data-modeling-1\\KickstarterCleanedv3.csv")

# Sams datapath

#data_path = ("YOUR LOCAL FILE HERE")

raw_df = pd.read_csv(data_path, index_col='id')

In [90]:
raw_df.head()

Unnamed: 0_level_0,Unnamed: 0,backers_count,blurb,category,country,created_at,deadline,goal,launched_at,name,...,spotlight,staff_pick,state,state_changed_at,usd_exchange_rate,usd_pledged,blurb_length,goal_in_usd,campaign_duration,sub_category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1629235715,0,63,A Steve Lafler graphic novel,comics,US,2020-08-27,2020-09-24,599.0,2020-09-01,1956 Book One: Sweet Sweet Little Ramona,...,1,0,1,2020-09-24,1.0,1942.0,5.0,599.0,23,graphic novels
1593905291,1,132,What happens when two drug-fueled lowlifes fin...,comics,US,2017-01-16,2017-03-16,2000.0,2017-02-14,Modern Animals,...,1,0,1,2017-03-16,1.0,3097.0,20.0,2000.0,30,graphic novels
1341470613,2,6,A pillow meant for two.,crafts,US,2015-02-01,2015-03-05,500.0,2015-02-03,Couples Couch Pillow,...,0,0,0,2015-03-05,1.0,211.0,5.0,500.0,30,diy
510157690,3,16,Professional conservation of the 1880's mural ...,art,US,2018-10-24,2019-04-24,17000.0,2019-03-10,"Downtown Mural Restoration (Ann Arbor, MI)",...,0,0,0,2019-04-24,1.0,1368.0,18.0,17000.0,45,painting
147824964,4,44,"We are trying to raise $2,500 for an art proje...",art,US,2012-06-28,2013-04-05,2500.0,2013-02-04,The Trade Parade of Ecuador,...,1,0,1,2013-04-05,1.0,2506.0,14.0,2500.0,60,performance art


# Exploratory Data Analysis

In [91]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 218361 entries, 1629235715 to 807310529
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         218361 non-null  int64  
 1   backers_count      218361 non-null  int64  
 2   blurb              218353 non-null  object 
 3   category           218361 non-null  object 
 4   country            218361 non-null  object 
 5   created_at         218361 non-null  object 
 6   deadline           218361 non-null  object 
 7   goal               218361 non-null  float64
 8   launched_at        218361 non-null  object 
 9   name               218361 non-null  object 
 10  pledged            218361 non-null  float64
 11  spotlight          218361 non-null  int64  
 12  staff_pick         218361 non-null  int64  
 13  state              218361 non-null  int64  
 14  state_changed_at   218361 non-null  object 
 15  usd_exchange_rate  218361 non-null  flo

In [92]:
from pandas_profiling import ProfileReport

In [93]:
# raw_df.profile_report()

# Dataframe Cleaning

In [94]:
# Applying the necessary data cleaning and featuring engineering
# in one wrangle function.

def wrangle(df):

    # Defining new reader friendly column names

    column_names = ['Kickstarter_id', 'Backers Count', 'Campaign Description', 'Primary Category', \
        'Country', 'Campaign Created', 'Deadline', 'Campaign Goal', 'Campaign Start', \
        'Product Name', 'Amount Pledged', 'Product Spotlight', 'Staff Pick', 'Status', \
        'Goal Reached Date', 'USD Exchange Rate', 'USD Pledged', 'Description Length', 'Goal in USD', \
        'Campaign Length', 'Subcategory']

    df.columns = column_names


    # Capitalizing the category and subcategory data points for an easier read

    df['Primary Category'] = df['Primary Category'].str.title()
    df['Subcategory'] = df['Subcategory'].str.title()


    # Converting date columns to datetime dtype
    date_columns = ['Campaign Created', 'Deadline', 'Campaign Start', 'Goal Reached Date']
    df[date_columns] = df[date_columns].apply(pd.to_datetime)


    # Dropping columns and rows not applicable to a US audience

    non_essential_columns = ['Kickstarter_id', 'USD Exchange Rate', 'USD Pledged', 'Goal in USD']
    df.drop(columns=non_essential_columns, axis=1, inplace=True)

    df.drop(df.loc[df['Country']!='US'].index, inplace=True)


    # Filling absent subcategory values with primary category values
    # We don't want to discredit a product because it cannot qualify
    # For a granular subcategory

    df['Subcategory'].fillna(df['Primary Category'], inplace=True)


    # Dropping rows with no campaign description
    # As they may not be serious campaigns

    df.dropna(axis=0, inplace=True)


    # Creating features to indicate goal and pledge amounts per day and per backer
    # These features can assist campaign success measurements

    df['Goal Amount Per Backer'] = round(df['Campaign Goal']/df['Backers Count'])
    df['Pledge Amount Per Backer'] = round(df['Amount Pledged']/df['Backers Count'])
    df['Goal Amount Per Day'] = round(df['Campaign Goal']/df['Campaign Length'])
    df['Pledge Amount Per Day'] = round(df['Amount Pledged']/df['Campaign Length'])

    return df

In [95]:
df = wrangle(raw_df)

df.head()

Unnamed: 0_level_0,Backers Count,Campaign Description,Primary Category,Country,Campaign Created,Deadline,Campaign Goal,Campaign Start,Product Name,Amount Pledged,...,Staff Pick,Status,Goal Reached Date,Description Length,Campaign Length,Subcategory,Goal Amount Per Backer,Pledge Amount Per Backer,Goal Amount Per Day,Pledge Amount Per Day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1629235715,63,A Steve Lafler graphic novel,Comics,US,2020-08-27,2020-09-24,599.0,2020-09-01,1956 Book One: Sweet Sweet Little Ramona,1942.0,...,0,1,2020-09-24,5.0,23,Graphic Novels,10.0,31.0,26.0,84.0
1593905291,132,What happens when two drug-fueled lowlifes fin...,Comics,US,2017-01-16,2017-03-16,2000.0,2017-02-14,Modern Animals,3097.0,...,0,1,2017-03-16,20.0,30,Graphic Novels,15.0,23.0,67.0,103.0
1341470613,6,A pillow meant for two.,Crafts,US,2015-02-01,2015-03-05,500.0,2015-02-03,Couples Couch Pillow,211.0,...,0,0,2015-03-05,5.0,30,Diy,83.0,35.0,17.0,7.0
510157690,16,Professional conservation of the 1880's mural ...,Art,US,2018-10-24,2019-04-24,17000.0,2019-03-10,"Downtown Mural Restoration (Ann Arbor, MI)",1368.0,...,0,0,2019-04-24,18.0,45,Painting,1062.0,86.0,378.0,30.0
147824964,44,"We are trying to raise $2,500 for an art proje...",Art,US,2012-06-28,2013-04-05,2500.0,2013-02-04,The Trade Parade of Ecuador,2506.0,...,0,1,2013-04-05,14.0,60,Performance Art,57.0,57.0,42.0,42.0


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148206 entries, 1629235715 to 807310529
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   Backers Count             148206 non-null  int64         
 1   Campaign Description      148206 non-null  object        
 2   Primary Category          148206 non-null  object        
 3   Country                   148206 non-null  object        
 4   Campaign Created          148206 non-null  datetime64[ns]
 5   Deadline                  148206 non-null  datetime64[ns]
 6   Campaign Goal             148206 non-null  float64       
 7   Campaign Start            148206 non-null  datetime64[ns]
 8   Product Name              148206 non-null  object        
 9   Amount Pledged            148206 non-null  float64       
 10  Product Spotlight         148206 non-null  int64         
 11  Staff Pick                148206 non-null  int64     

## Feature Engineering


In [98]:
# Creating features to calculate word count for each text column

df['Product Name Word Count'] = len(df['Product Name'].split())

df['Description Word Count'] = len(df['Campaign Description'].split())


AttributeError: 'Series' object has no attribute 'split'