In [1]:
import pandas as pd

### Categorical variables

In [2]:
# Let's use pandas to create Categorical Series. One way is by 
# specifying dtype="category" when constructing a Series:

s = pd.Series(["a","b","c","a"], dtype="category")
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [3]:
# Another way is to convert an existing Series or column to a 
# category dtype:

df = pd.DataFrame({"A":["a","b","c","a"]})
df["B"] = df["A"].astype('category')
df

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [4]:
# You can also pass a pandas.Categorical object to a Series 

raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"],
                          ordered=False)

In [5]:
s = pd.Series(raw_cat)
s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b, c, d]

### Dummy variables

In [6]:
# Let's use pd.get_dummies to convert categorical variables into dummy 
# variables. First let's create a small DataFrame with categorical variables. 

df = pd.DataFrame({'key': list('bbacab'), 'data1': range(6)})

In [7]:
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [8]:
# Now, let's convert the categorical variables into dummy variables. 

pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [9]:
pd.concat([df.drop('key',axis=1),pd.get_dummies(df['key'])], axis = 1)

Unnamed: 0,data1,a,b,c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [10]:
import os
os.getcwd()


crunch = pd.read_csv('/Users/HudsonCavanagh/Documents/crunchbase_monthly_export.csv')
cb = pd.DataFrame(crunch)
cb.head(50)

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18
0,/organization/canal-do-credito,Canal do Credito,http://www.canaldocredito.com.br,|Credit|Technology|Services|Finance|,Credit,750000,,BRA,,Rio de Janeiro,Belo Horizonte,1,,,,,1/1/10,1/1/10,
1,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,Entertainment,1750000,acquired,USA,NY,New York City,New York,1,6/1/12,2012-06,2012-Q2,2012.0,6/30/12,6/30/12,
2,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,operating,USA,CA,Los Angeles,Los Angeles,2,,,,,6/4/10,9/23/10,
3,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,Education,40000,operating,EST,,Tallinn,Tallinn,1,10/26/12,2012-10,2012-Q4,2012.0,8/9/12,8/9/12,
4,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,Apps,1500000,operating,GBR,,London,London,1,4/1/11,2011-04,2011-Q2,2011.0,4/1/11,4/1/11,
5,/organization/n-plusn,#NAME?,http://plusn.com,|Software|,Software,600000,operating,USA,NY,New York City,New York,1,1/1/12,2012-01,2012-Q1,2012.0,8/29/12,8/29/12,
6,/organization/club-domains,.Club Domains,http://nic.club/,|Software|,Software,7000000,,USA,FL,Ft. Lauderdale,Oakland Park,1,10/10/11,2011-10,2011-Q4,2011.0,5/31/13,5/31/13,
7,/organization/fox-networks,.Fox Networks,http://www.dotfox.com,|Advertising|,Advertising,4912393,closed,ARG,,Buenos Aires,Buenos Aires,1,,,,,1/16/07,1/16/07,
8,/organization/0-6-com,0-6.com,http://www.0-6.com,|Curated Web|,Curated Web,2000000,operating,,,,,1,1/1/07,2007-01,2007-Q1,2007.0,3/19/08,3/19/08,
9,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,|Software|,Software,-,operating,USA,IL,"Springfield, Illinois",Champaign,1,1/1/10,2010-01,2010-Q1,2010.0,7/24/14,7/24/14,


In [11]:
market_dummy = pd.get_dummies(cb[' market '])
cb_market = pd.concat([cb, market_dummy], axis=1)

# pd.concat([df.drop('key',axis=1),pd.get_dummies(df['key'])], axis = 1)
cols = cb_market.columns
print(cols)

Index([u'permalink', u'name', u'homepage_url', u'category_list', u' market ',
       u' funding_total_usd ', u'status', u'country_code', u'state_code',
       u'region',
       ...
       u' Web Hosting ', u' Web Tools ', u' Weddings ', u' Wholesale ',
       u' Wine And Spirits ', u' Wireless ', u' iOS ', u' iPad ', u' iPhone ',
       u' mHealth '],
      dtype='object', length=566)


In [12]:
cb_market['funding_total_usd'] = cb_market[' funding_total_usd '].apply(lambda x: str(x))
cb_market['funding_total_usd'] = cb_market['funding_total_usd'].apply(lambda x: x.strip(','))
cb_market['funding_total_usd'] = cb_market['funding_total_usd'].apply(lambda x: x.strip())
cb_market['funding_total_usd']

# cb_market['funding_total_usd'] = cb_market['funding_total_usd'].apply(lambda x: int(x))

# print(type(cb_market['funding_total_usd'][4]))

0            750,000
1          1,750,000
2          4,000,000
3             40,000
4          1,500,000
5            600,000
6          7,000,000
7          4,912,393
8          2,000,000
9                  -
10         1,700,000
11            40,000
12                 -
13                 -
14         1,750,000
15                 -
16         2,050,000
17            40,000
18           500,000
19                 -
20         2,535,000
21         4,462,651
22         1,869,079
23        10,000,000
24         3,000,000
25         3,000,000
26         1,250,000
27        35,000,000
28            50,000
29         1,600,000
            ...     
44920        651,000
44921      3,500,000
44922        190,000
44923        100,000
44924        870,000
44925      3,845,100
44926     45,750,000
44927              -
44928      9,000,000
44929      3,384,225
44930        800,000
44931         75,000
44932     12,039,999
44933      2,257,464
44934     38,900,000
44935              -
44936        