In [57]:
# data analysis imports
import pymysql as msql
import pandas as pd
import numpy as np

# ml imports
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import pdb

% matplotlib inline

In [None]:
"""
Tables
------
cb_acquisitions
cb_degrees
cb_funding_rounds
cb_funds
cb_ipos
cb_milestones
cb_objects
cb_offices
cb_people
cb_relationships

Missing
-------
cb_investments
"""

### Helper Methods

In [8]:
def establish_connection():
    """
        Helper to establish connection with local mysql database. Returns cursor object.

        Connection settings default to the local development database and can be
        overridden through the environment variables STARTUPMI_DB_HOST,
        STARTUPMI_DB_USER, STARTUPMI_DB_PASSWORD and STARTUPMI_DB_NAME, so that
        real credentials do not have to be stored in the notebook.

        Returns
        -------
        cursor : {pymysql.cursors.DictCursor}
            cursor that yields each row as a dict. The connection itself is not
            returned and stays open; pymysql cursors keep a reference to it as
            ``cursor.connection``, which can be used to close it after ingestion.
    """
    import os  # local import keeps the helper self-contained

    conn = msql.connect(
        host=os.environ.get("STARTUPMI_DB_HOST", "localhost"),
        user=os.environ.get("STARTUPMI_DB_USER", "root"),
        # NOTE(review): fallback password kept only for backward compatibility;
        # prefer setting STARTUPMI_DB_PASSWORD instead of relying on it
        password=os.environ.get("STARTUPMI_DB_PASSWORD", "startupmi"),
        db=os.environ.get("STARTUPMI_DB_NAME", "startupmi"),
        cursorclass=msql.cursors.DictCursor,
    )
    return conn.cursor()

In [19]:
def fetch_dataframe(cursor):
    """
        Build a pandas dataframe from every row still pending on the cursor.

        Parameters
        ----------
        cursor : {pymysql.connection.cursor}
            cursor on which a query has already been executed

        Returns
        -------
        {pandas.DataFrame}
            dataframe constructed from the result of ``cursor.fetchall()``
    """
    records = cursor.fetchall()
    frame = pd.DataFrame(records)
    return frame

### Data Ingestion
Create dataframes for all of the db tables.

In [156]:
# Open the database connection once; `crs` is the cursor reused by the ingestion queries below.
crs = establish_connection()

In [30]:
def _query_to_frame(query):
    """Run `query` on the shared cursor `crs` and return its result set as a dataframe."""
    crs.execute(query)
    return fetch_dataframe(crs)


# one dataframe per database table, loaded in the same order as before
df_acquisitions = _query_to_frame("select * from cb_acquisitions")
df_degrees = _query_to_frame("select * from cb_degrees")
df_funrnds = _query_to_frame("select * from cb_funding_rounds")
df_funds = _query_to_frame("select * from cb_funds")
df_ipos = _query_to_frame("select * from cb_ipos")
df_milestones = _query_to_frame("select * from cb_milestones")
df_objects = _query_to_frame("select * from cb_objects")
df_offices = _query_to_frame("select * from cb_offices")
df_people = _query_to_frame("select * from cb_people")
df_relationships = _query_to_frame("select * from cb_relationships")

### Transformations
Apply transformations to each of the dataframes for dtype compatibility. Create additional dataframes where appropriate.

In [158]:
def acquisition_transform(df):
    """
        Method to transform acquisition dataframe for compatibility with remaining data set.
        The dataframe is modified in place and nothing is returned. Columns that have
        already been removed are skipped, so re-running the cell on a dataframe that
        was transformed before does not raise.

        Parameters
        ----------
        df : {pandas.DataFrame}
            acquisitions dataframe; must contain a ``price_amount`` column
    """
    # we trust master source (crunchbase) and we're not scraping source data
    # uninterested in creation/update dates
    drop_cols = ["source_url", "source_description", "created_at", "updated_at"]
    # only drop what is still there: notebook cells are routinely executed twice
    present = [col for col in drop_cols if col in df.columns]
    if present:
        df.drop(present, inplace=True, axis=1)

    # convert all decimal values to float for sklearn compatibility
    # (missing values are left untouched)
    decimal_cols = ["price_amount"]
    for col in decimal_cols:
        df[col] = df[col].apply(lambda x: float(x) if x is not None else x)

In [159]:
# Peek at the first two acquisitions rows as loaded from the database.
df_acquisitions.head(2)

Unnamed: 0,acquired_at,acquired_object_id,acquiring_object_id,acquisition_id,created_at,id,price_amount,price_currency_code,source_description,source_url,term_code,updated_at
0,2007-05-30,c:10,c:11,1,2007-05-31 22:19:54,1,20000000,USD,Fox Interactive confirms purchase of Photobuck...,http://venturebeat.com/2007/05/30/fox-interact...,,2008-05-21 19:23:44
1,2007-07-01,c:72,c:59,7,2007-07-03 08:14:50,2,60000000,USD,Deal is Confirmed: Google Acquired GrandCentral,http://www.techcrunch.com/2007/07/02/deal-is-c...,cash,2011-05-06 21:51:05


In [160]:
# Mutates df_acquisitions in place: drops the source/timestamp columns and casts price_amount to float.
acquisition_transform(df_acquisitions)

In [161]:
# Same preview after the transform: the four dropped columns should no longer appear.
df_acquisitions.head(2)

Unnamed: 0,acquired_at,acquired_object_id,acquiring_object_id,acquisition_id,id,price_amount,price_currency_code,term_code
0,2007-05-30,c:10,c:11,1,1,20000000,USD,
1,2007-07-01,c:72,c:59,7,2,60000000,USD,cash


In [162]:
# Peek at the first five degrees rows.
df_degrees.head(5)

Unnamed: 0,created_at,degree_type,graduated_at,id,institution,object_id,subject,updated_at
0,2008-02-19 03:17:36,MBA,,1,,p:6117,,2008-02-19 03:17:36
1,2008-02-19 17:58:31,BA,1990-01-01,2,"Washington University, St. Louis",p:6136,"English, French",2008-02-25 00:23:55
2,2008-02-19 17:58:31,MS,1992-01-01,3,Boston University,p:6136,Mass Communication,2008-02-25 00:23:55
3,2008-02-19 23:40:40,MS,2006-01-01,4,University of Greenwich,p:6005,Internet Technology,2008-02-25 00:23:55
4,2008-02-20 05:28:09,BCS,,5,Rice University,p:5832,"Computer Science, Psychology",2008-02-20 05:28:09


In [165]:
# Scratch single-column dataframe for experimenting; nothing else in this section reads `test`.
test = pd.DataFrame([1,2,3,4], columns=["col1"])

In [166]:
# Display the scratch dataframe.
test

Unnamed: 0,col1
0,1
1,2
2,3
3,4


In [190]:
def _split_subjects(raw):
    """Turn one raw `subject` cell into a list of stripped, non-empty subject names."""
    if isinstance(raw, (list, tuple)):
        # already split, e.g. the transform ran on this dataframe before
        return list(raw)
    if raw is None or not hasattr(raw, "split"):
        # missing value (None, NaN, ...)
        return []
    pieces = (piece.strip() for piece in raw.split(","))
    return [piece for piece in pieces if piece]


def degrees_transform(df, subject_ohe=False):
    """
        Method to transform degrees dataframe. All transformations are conducted on
        the dataframe that is passed in (in place); nothing is returned.

        Parameters
        ----------
        df : {pandas.DataFrame}
            degrees dataframe

        subject_ohe : {bool}
            one hot encode subjects. default to False as degree subject might not necessarily be
            strong indicator of people success. When True, one ``is_<subject>`` column
            (1/0) is added per distinct subject and the ``subject`` column is dropped.
    """
    # uninterested in creation and update dates
    drop_cols = ["created_at", "updated_at"]
    # only drop what is still there so that re-running the cell does not raise
    present = [col for col in drop_cols if col in df.columns]
    if present:
        df.drop(present, inplace=True, axis=1)

    # TODO: significant date ranges for features

    if "subject" not in df.columns:
        # subjects were already one hot encoded by an earlier run
        return

    # operate on the `df` argument, not on the notebook-level df_degrees
    df["subject"] = df["subject"].apply(_split_subjects)

    if subject_ohe:
        # binary features for subjects (multiple subjects are grouped together);
        # sorted so that the order of the new columns is reproducible
        subject_store = sorted({subj for subjects in df["subject"] for subj in subjects})

        # OHE for each subject
        for subj in subject_store:
            df["is_%s" % subj] = df["subject"].apply(lambda x, subj=subj: 1 if subj in x else 0)

        df.drop("subject", inplace=True, axis=1)

In [None]:
# Transform the degrees dataframe in place; subject_ohe is left at its default (False).
degrees_transform(df_degrees)

In [191]:
# Preview after degrees_transform: timestamps are gone and `subject` holds lists of subject names.
df_degrees.head(2)

Unnamed: 0,degree_type,graduated_at,id,institution,object_id,subject
0,MBA,,1,,p:6117,[]
1,BA,1990-01-01,2,"Washington University, St. Louis",p:6136,"[English, French]"


In [74]:
# Peek at the first two rows loaded from cb_funds.
df_funds.head(2)

Unnamed: 0,created_at,fund_id,funded_at,id,name,object_id,raised_amount,raised_currency_code,source_description,source_url,updated_at
0,2008-12-17 03:07:16,1,2008-12-16,1,Second Fund,f:371,300000000,USD,peHub,http://www.pehub.com/26194/dfj-dragon-raising-...,2008-12-17 03:07:16
1,2008-12-18 22:04:42,4,2008-12-17,4,Sequoia Israel Fourth Fund,f:17,200750000,USD,Sequoia Israel Raises Fourth Fund,http://www.pehub.com/26725/sequoia-israel-rais...,2008-12-18 22:04:42


In [75]:
# Peek at the first two rows loaded from cb_funding_rounds.
df_funrnds.head(2)

Unnamed: 0,created_at,created_by,funded_at,funding_round_code,funding_round_id,funding_round_type,id,is_first_round,is_last_round,object_id,...,post_money_valuation_usd,pre_money_currency_code,pre_money_valuation,pre_money_valuation_usd,raised_amount,raised_amount_usd,raised_currency_code,source_description,source_url,updated_at
0,2007-07-04 04:52:57,initial-importer,2006-12-01,b,1,series-b,1,0,0,c:4,...,,,,,8500000,8500000,USD,,http://www.marketingvox.com/archives/2006/12/2...,2008-02-27 23:14:29
1,2007-05-27 06:08:18,initial-importer,2004-09-01,angel,2,angel,2,0,1,c:5,...,,USD,,,500000,500000,USD,,,2013-06-28 20:07:23


In [66]:
# Full column list of the funding rounds dataframe (head() elides the middle columns).
df_funrnds.columns

Index([              u'created_at',               u'created_by',
                      u'funded_at',       u'funding_round_code',
               u'funding_round_id',       u'funding_round_type',
                             u'id',           u'is_first_round',
                  u'is_last_round',                u'object_id',
                   u'participants', u'post_money_currency_code',
           u'post_money_valuation', u'post_money_valuation_usd',
        u'pre_money_currency_code',      u'pre_money_valuation',
        u'pre_money_valuation_usd',            u'raised_amount',
              u'raised_amount_usd',     u'raised_currency_code',
             u'source_description',               u'source_url',
                     u'updated_at'],
      dtype='object')

In [67]:
# Peek at the first rows loaded from cb_ipos (head() defaults to five rows).
df_ipos.head()

Unnamed: 0,created_at,id,ipo_id,object_id,public_at,raised_amount,raised_currency_code,source_description,source_url,stock_symbol,updated_at,valuation_amount,valuation_currency_code
0,2008-02-09 05:17:45,1,1,c:1654,1980-12-19,,USD,,,NASDAQ:AAPL,2012-04-12 04:02:59,,USD
1,2008-02-09 05:25:18,2,2,c:1242,1986-03-13,,,,,NASDAQ:MSFT,2010-12-11 12:39:46,,USD
2,2008-02-09 05:40:32,3,3,c:342,1969-06-09,,,,,NYSE:DIS,2010-12-23 08:58:16,,USD
3,2008-02-10 22:51:24,4,4,c:59,2004-08-25,,,,,NASDAQ:GOOG,2011-08-01 20:47:08,,USD
4,2008-02-10 23:28:09,5,5,c:317,1997-05-01,,,,,NASDAQ:AMZN,2011-08-01 21:11:22,100000000000.0,USD


In [70]:
# Peek at the first two rows loaded from cb_milestones.
df_milestones.head(2) # seems irrelevant

Unnamed: 0,created_at,description,id,milestone_at,milestone_code,object_id,source_description,source_url,updated_at
0,2008-06-18 08:14:06,Survives iPhone 3G Stevenote,1,2008-06-09,other,c:12,"Twitter Fails To Fail, Community Rejoices",http://www.techcrunch.com/2008/06/10/twitter-f...,2008-06-18 08:14:06
1,2008-06-18 08:46:28,Twhirl announces support for Seesmic video pla...,2,2008-06-17,other,c:3138,Seesmic Now Available In Twhirl,http://www.inquisitr.com/1103/seesmic-now-avai...,2008-06-18 08:46:28


In [76]:
# Peek at the first two rows loaded from cb_objects.
df_objects.head(2) # largest table

Unnamed: 0,category_code,city,closed_at,country_code,created_at,created_by,description,domain,entity_id,entity_type,...,parent_id,permalink,region,relationships,short_description,state_code,status,tag_list,twitter_username,updated_at
0,web,Seattle,,USA,2007-05-25 06:51:27,initial-importer,Technology Platform Company,wetpaint-inc.com,1,Company,...,,/company/wetpaint,Seattle,17,,WA,operating,"wiki, seattle, elowitz, media-industry, media-...",BachelrWetpaint,2013-04-13 03:29:00
1,games_video,Culver City,,USA,2007-05-31 21:11:51,initial-importer,,flektor.com,10,Company,...,,/company/flektor,Los Angeles,6,,CA,acquired,"flektor, photo, video",,2008-05-23 23:23:14


In [77]:
# Full column list of the objects dataframe (head() elides the middle columns).
df_objects.columns

Index([      u'category_code',                u'city',           u'closed_at',
              u'country_code',          u'created_at',          u'created_by',
               u'description',              u'domain',           u'entity_id',
               u'entity_type',    u'first_funding_at', u'first_investment_at',
        u'first_milestone_at',          u'founded_at',      u'funding_rounds',
         u'funding_total_usd',        u'homepage_url',                  u'id',
        u'invested_companies',   u'investment_rounds',     u'last_funding_at',
        u'last_investment_at',   u'last_milestone_at',         u'logo_height',
                  u'logo_url',          u'logo_width',          u'milestones',
                      u'name',     u'normalized_name',            u'overview',
                 u'parent_id',           u'permalink',              u'region',
             u'relationships',   u'short_description',          u'state_code',
                    u'status',            u'tag_list

In [104]:
# Peek at the first two rows loaded from cb_offices.
df_offices.head(2) # probably irrelevant outside of state, region, and city

Unnamed: 0,address1,address2,city,country_code,created_at,description,id,latitude,longitude,object_id,office_id,region,state_code,updated_at,zip_code
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,,1,47.603122,-122.333253,c:1,1,Seattle,WA,,98104
1,4900 Hopyard Rd,Suite 310,Pleasanton,USA,,Headquarters,2,37.692934,-121.904945,c:3,3,SF Bay,CA,,94588


In [107]:
# Peek at the first two rows loaded from cb_people.
df_people.head(2)

Unnamed: 0,affiliation_name,birthplace,first_name,id,last_name,object_id
0,Blue Nile,,Ben,1,Elowitz,p:2
1,Wetpaint,,Kevin,2,Flaherty,p:3


In [109]:
# Peek at the first two rows loaded from cb_relationships.
df_relationships.head(2)

Unnamed: 0,created_at,end_at,id,is_past,person_object_id,relationship_id,relationship_object_id,sequence,start_at,title,updated_at
0,2007-05-25 07:03:54,,1,0,p:2,1,c:1,8,,Co-Founder/CEO/Board of Directors,2013-06-03 09:58:46
1,2007-05-25 07:04:16,,2,1,p:3,2,c:1,279242,,VP Marketing,2010-05-21 16:31:34


In [None]:
# remove urls as we're not scraping data

In [None]:
# overview in objects table can be tokenized
# remove stop words thus having additional keywords that 'describe' company
# might need custom one hot encoding

In [None]:
# can use objects table as primary data store for companies (apply entity_type == "Company" filter)

In [None]:
# create single dataset for people

In [None]:
"""
Table Mapping


"""