## Split up the category column to make it usable

In [13]:
# load required packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# read the combined CSV file into the working dataframe `df`
df = pd.read_csv(filepath_or_buffer="data/combined_csv.csv")

In [15]:
# keep only the raw 'category' column, as an independent one-column dataframe
cat_df = df[['category']].copy()

### category example

- {"id":43,
- "name":"Rock",
- "slug":"music/rock",
- "position":17,
- "parent_id":14,
- "color":10878931,
- "urls":{"web":{"discover":"http://www.kickstarter.com/discover/categories/music/rock"}}}

In [16]:
# cut every raw category string at its commas; expand=True spreads the pieces
# over separate columns, which are then stored in cat_df under the labels below.
# The pieces keep their JSON syntax (e.g. '"slug":"music/rock"').
# NOTE(review): this assumes every record lists these seven keys in this order
# and that no value contains a comma -- TODO confirm, otherwise the labels
# will not line up with the pieces.
category_fields = ['id', 'name', 'slug', 'position', 'parent_id', 'color', 'urls']
category_parts = df['category'].str.split(',', expand=True)
cat_df[category_fields] = category_parts

In [17]:
# number of distinct values in the 'slug' column
cat_df.loc[:, 'slug'].nunique()

169

In [18]:
# list the distinct values of the 'slug' column
cat_df.loc[:, 'slug'].unique()

array(['"slug":"fashion/footwear"', '"slug":"games/playing cards"',
       '"slug":"music/rock"', '"slug":"publishing/nonfiction"',
       '"slug":"music/classical music"', '"slug":"music"',
       '"slug":"theater/immersive"', '"slug":"fashion/accessories"',
       '"slug":"food/restaurants"', '"slug":"art/mixed media"',
       '"slug":"music/world music"', '"slug":"theater/experimental"',
       '"slug":"theater/spaces"', '"slug":"photography/fine art"',
       '"slug":"food/small batch"', '"slug":"theater/plays"',
       '"slug":"art/sculpture"', '"slug":"photography"',
       '"slug":"fashion"', '"slug":"music/electronic music"',
       '"slug":"technology/software"', '"slug":"fashion/apparel"',
       '"slug":"art/performance art"', '"slug":"fashion/jewelry"',
       '"slug":"dance/workshops"', '"slug":"theater"',
       '"slug":"theater/musical"', '"slug":"photography/photobooks"',
       '"slug":"photography/nature"', '"slug":"photography/animals"',
       '"slug":"fashion/ready

Problem:
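- most rows have the format 'main category/subcategory', but some rows contain only a main category (no subcategory)
- this means that splitting by '/' would give a different number of parts per row, which makes a further split difficult
- if we don't split any further, we have 169 unique values for category, which seems acceptable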


In [19]:
# carve the 'slug' column out into its own one-column dataframe
cat = cat_df[['slug']].copy()
cat.info()
cat.loc[:, 'slug']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   slug    209222 non-null  object
dtypes: object(1)
memory usage: 1.6+ MB


0               "slug":"fashion/footwear"
1            "slug":"games/playing cards"
2                     "slug":"music/rock"
3            "slug":"games/playing cards"
4          "slug":"publishing/nonfiction"
                       ...               
209217      "slug":"games/tabletop games"
209218    "slug":"music/electronic music"
209219       "slug":"technology/hardware"
209220    "slug":"film & video/festivals"
209221                "slug":"journalism"
Name: slug, Length: 209222, dtype: object

### Clean-up category entries

- remove the double quotes (`"`)
- remove the `"slug":` prefix

In [21]:
# Strip the JSON key prefix and the double quotes so that only the bare slug
# text remains (e.g. '"slug":"music/rock"' -> 'music/rock').
# regex=False forces plain literal matching, so the result does not depend on
# the default value of `regex`, which differs between pandas major versions.
cat['slug'] = (
    cat['slug']
    .str.replace('"slug":', '', regex=False)
    .str.replace('"', '', regex=False)
)
cat['slug']


0               fashion/footwear
1            games/playing cards
2                     music/rock
3            games/playing cards
4          publishing/nonfiction
                   ...          
209217      games/tabletop games
209218    music/electronic music
209219       technology/hardware
209220    film & video/festivals
209221                journalism
Name: slug, Length: 209222, dtype: object

In [35]:
# give the cleaned column its final name: 'slug' becomes 'category'
cat = cat.rename(columns={'slug': 'category'})

In [36]:
# remove the raw 'category' column from the original dataframe
df = df.drop(columns='category')

In [37]:
# attach the cleaned values to the original dataframe as column 'category'
df['category'] = cat['category']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 37 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   backers_count             209222 non-null  int64  
 1   blurb                     209214 non-null  object 
 2   converted_pledged_amount  209222 non-null  int64  
 3   country                   209222 non-null  object 
 4   created_at                209222 non-null  int64  
 5   creator                   209222 non-null  object 
 6   currency                  209222 non-null  object 
 7   currency_symbol           209222 non-null  object 
 8   currency_trailing_code    209222 non-null  bool   
 9   current_currency          209222 non-null  object 
 10  deadline                  209222 non-null  int64  
 11  disable_communication     209222 non-null  bool   
 12  friends                   300 non-null     object 
 13  fx_rate                   209222 non-null  f

## split location column

### location example

- {"id":2379574,
- "name":"Chicago",
- "slug":"chicago-il",
- "short_name":"Chicago, IL",
- "displayable_name":"Chicago, IL",
- "localized_name":"Chicago",
- "country":"US",
- "state":"IL",
- "type":"Town",
- "is_root":false,
- "urls":{"web":{"discover":"https://www.kickstarter.com/discover/places/chicago-il",
- "location":"https://www.kickstarter.com/locations/chicago-il"},
- "api":{"nearby_projects":"https://api.kickstarter.com/v1/discover?signature=1552595044.c1041c6bca69b0b72738f3b9504ebf921b3e5e0e&woe_id=2379574"}}}

In [42]:
# subset location column 
#loc_df = df.loc[:,['location']]

# NOTE: the comma split below is left commented out because it does not work:
# some values contain commas themselves (e.g. 'Chicago, IL'), so the number of
# pieces produced by the split does not match the 13 column names listed, and
# pandas raises "ValueError: Columns must be same length as key".
# split location column
#loc_df[['id','name','slug','short_name','displayable_name','localized_name','country_name', 'state', 'type', 'is_root', 'urls', 'location_name', 'api']] = df['location'].str.split(',',expand=True)

ValueError: Columns must be same length as key