In [1]:
import datetime as dt

# data analysis imports
import pymysql as msql
import pandas as pd
import numpy as np

# ml imports
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer

import pdb

% matplotlib inline

In [None]:
"""
Tables
------
cb_acquisitions
cb_degrees
cb_funding_rounds
cb_funds
cb_ipos
cb_milestones
cb_objects
cb_offices
cb_people
cb_relationships

Missing
-------
cb_investments
"""

### Helper Methods

In [2]:
def establish_connection():
    """
        Helper to establish connection with local mysql database. Returns cursor object.
    """
    conn = msql.connect(host="localhost", user="root", password="startupmi", db="startupmi", 
                        cursorclass=msql.cursors.DictCursor)
    return conn.cursor()

In [3]:
def fetch_dataframe(cursor):
    """
        Helper to fetch from database and return pandas dataframe.
        
        Parameters
        ----------
        cursor : {pymysql.connection.cursor}
            primary cursor obj
    """
    return pd.DataFrame(cursor.fetchall())

### Data Ingestion
Create dataframes for all of the db tables.

In [4]:
crs = establish_connection()

In [5]:
crs.execute("select * from cb_objects")
df_objects = fetch_dataframe(crs)

In [6]:
crs.execute("select * from cb_investments_1")
df_investments = fetch_dataframe(crs)

In [30]:
crs.execute("select * from cb_acquisitions")
df_acquisitions = fetch_dataframe(crs)

crs.execute("select * from cb_degrees")
df_degrees = fetch_dataframe(crs)

crs.execute("select * from cb_funding_rounds")
df_funrnds = fetch_dataframe(crs)

crs.execute("select * from cb_funds")
df_funds = fetch_dataframe(crs)

crs.execute("select * from cb_investments_1")
df_investments = fetch_dataframe(crs)

crs.execute("select * from cb_ipos")
df_ipos = fetch_dataframe(crs)

crs.execute("select * from cb_milestones")
df_milestones = fetch_dataframe(crs)

crs.execute("select * from cb_objects")
df_objects = fetch_dataframe(crs)

crs.execute("select * from cb_offices")
df_offices = fetch_dataframe(crs)

crs.execute("select * from cb_people")
df_people = fetch_dataframe(crs)

crs.execute("select * from cb_relationships")
df_relationships = fetch_dataframe(crs)

In [172]:
crs.execute("select * from cb_relationships")
df_relationships = fetch_dataframe(crs)

### Transformations
Apply transformations to each of the transformations for dtype compatibility. Create additional dataframes where appropriate.

In [158]:
def acquisition_transform(df):
    """
        Method to transform acquisition dataframe for compatibility with remaining data set.
        
        Parameters
        ----------
        df : {pandas.DataFrame}
            acquisitons dataframe
    """
    # we trust master source (crunchbase) and we're not scraping source data
    # uninterested in creation/update dates
    drop_cols = ["source_url", "source_description", "created_at", "updated_at"]
    for col in drop_cols:
        df.drop(col, inplace=True, axis=1)
    
    # convert all decimal values to float for sklearn compatibility
    decimal_cols = ["price_amount"]
    for col in decimal_cols:
        df[col] = df[col].apply(lambda x: float(x) if x is not None else x)

In [160]:
acquisition_transform(df_acquisitions)

In [190]:
def degrees_transform(df, subject_ohe=False):
    """
        Method to transform degrees dataframe. All transformations are conducted on original dataframe.
        
        Parameters
        ----------
        df : {pandas.DataFrame}
            degrees dataframe
        
        subject_ohe : {bool}
            one hot encode subjects. default to False as degree subject might not necessarily be
            strong indicator of people success.
    """
    # uninterested in creation and update dates
    drop_cols = ["created_at", "updated_at"]
    for col in drop_cols:
        df.drop(col, inplace=True, axis=1)
    
    # significant date ranges for features
    
    df_degrees["subject"] = \
            df_degrees["subject"].apply(lambda x: [sub.strip() for sub in x.split(",")] if x is not None else [])
    
    if subject_ohe:
        # binary features for subjects (multiple subjects are grouped together)
        subject_store = set(sum(df_degrees.subject.tolist(), []))

        # OHE for each subject
        for subj in subject_store:
            df["is_%s" % subj] = df["subject"].apply(lambda x: 1 if subj in x else 0)

        df.drop("subject", inplace=True, axis=1)

In [None]:
degrees_transform(df_degrees)

In [None]:
def funds_transform(df):
    drop_cols = ["created_at", "updated_at", "source_description", "source_url"]
    for col in drop_cols:
        df.drop(col, inplace=True, axis=1)
    
    decimal_cols = ["raised_amount"]
    for col in decimal_cols:
        df[col] = df[col].apply(lambda x: float(x) if x is not None else x)

In [None]:
funds_transform(df_funds)

In [None]:
df_funds.head(2)

In [75]:
df_funrnds.head(2)

Unnamed: 0,created_at,created_by,funded_at,funding_round_code,funding_round_id,funding_round_type,id,is_first_round,is_last_round,object_id,...,post_money_valuation_usd,pre_money_currency_code,pre_money_valuation,pre_money_valuation_usd,raised_amount,raised_amount_usd,raised_currency_code,source_description,source_url,updated_at
0,2007-07-04 04:52:57,initial-importer,2006-12-01,b,1,series-b,1,0,0,c:4,...,,,,,8500000,8500000,USD,,http://www.marketingvox.com/archives/2006/12/2...,2008-02-27 23:14:29
1,2007-05-27 06:08:18,initial-importer,2004-09-01,angel,2,angel,2,0,1,c:5,...,,USD,,,500000,500000,USD,,,2013-06-28 20:07:23


In [66]:
df_funrnds.columns

Index([              u'created_at',               u'created_by',
                      u'funded_at',       u'funding_round_code',
               u'funding_round_id',       u'funding_round_type',
                             u'id',           u'is_first_round',
                  u'is_last_round',                u'object_id',
                   u'participants', u'post_money_currency_code',
           u'post_money_valuation', u'post_money_valuation_usd',
        u'pre_money_currency_code',      u'pre_money_valuation',
        u'pre_money_valuation_usd',            u'raised_amount',
              u'raised_amount_usd',     u'raised_currency_code',
             u'source_description',               u'source_url',
                     u'updated_at'],
      dtype='object')

In [223]:
empty_funds = df_company[df_company.funding_total_usd.apply(lambda x: x is None)]

In [224]:
empty_funds["object_id"] = empty_funds["id"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [262]:
empty_funds[["object_id", "name", "funding_total_usd", "funding_rounds"]].join(
    df_funds[["object_id"]], on="object_id", how="inner", lsuffix="_1")

Unnamed: 0,object_id_1,name,funding_total_usd,funding_rounds,object_id


In [106]:
df_company = df_objects[(df_objects.status != "operating") & (df_objects.entity_type == "Company")]

In [107]:
df_company.head(2)

Unnamed: 0,category_code,city,closed_at,country_code,created_at,created_by,description,domain,entity_id,entity_type,...,parent_id,permalink,region,relationships,short_description,state_code,status,tag_list,twitter_username,updated_at
1,games_video,Culver City,,USA,2007-05-31 21:11:51,initial-importer,,flektor.com,10,Company,...,,/company/flektor,Los Angeles,6,,CA,acquired,"flektor, photo, video",,2008-05-23 23:23:14
2,games_video,San Mateo,,USA,2007-08-06 23:52:45,initial-importer,,there.com,100,Company,...,,/company/there,SF Bay,12,,CA,acquired,"virtualworld, there, teens",,2013-11-04 02:09:48


In [None]:
"""
invested_companies => number of companies invested in; can cross referene with cb_investments

milestones => look at number and frequency of milestones

closed_at => remove from feature set as it is a perfect indicator of the closed class

state_code => unnecessary level of granularity

city => unnecessary level of granularity
"""

In [108]:
df_company.columns

Index([      u'category_code',                u'city',           u'closed_at',
              u'country_code',          u'created_at',          u'created_by',
               u'description',              u'domain',           u'entity_id',
               u'entity_type',    u'first_funding_at', u'first_investment_at',
        u'first_milestone_at',          u'founded_at',      u'funding_rounds',
         u'funding_total_usd',        u'homepage_url',                  u'id',
        u'invested_companies',   u'investment_rounds',     u'last_funding_at',
        u'last_investment_at',   u'last_milestone_at',         u'logo_height',
                  u'logo_url',          u'logo_width',          u'milestones',
                      u'name',     u'normalized_name',            u'overview',
                 u'parent_id',           u'permalink',              u'region',
             u'relationships',   u'short_description',          u'state_code',
                    u'status',            u'tag_list

In [109]:
# store names for future reference
name_index = df_company[["id", "normalized_name"]]
df_company.drop("normalized_name", inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [110]:
drop_cols = ["entity_type", "description", "first_milestone_at", "logo_url", "parent_id", "updated_at",
                 "created_at", "homepage_url", "investment_rounds", "last_milestone_at",
                 "logo_width", "permalink", "short_description", "closed_at", "created_by", "entity_id", "logo_height",
                 "twitter_username", "name", "domain"]
df_company_mod = df_company.drop(drop_cols, axis=1)

In [111]:
df_company_mod.columns

Index([      u'category_code',                u'city',        u'country_code',
          u'first_funding_at', u'first_investment_at',          u'founded_at',
            u'funding_rounds',   u'funding_total_usd',                  u'id',
        u'invested_companies',     u'last_funding_at',  u'last_investment_at',
                u'milestones',            u'overview',              u'region',
             u'relationships',          u'state_code',              u'status',
                  u'tag_list'],
      dtype='object')

In [112]:
df_investments["investor_class"] = df_investments.investor_object_id.apply(lambda x: x.split(":")[0])

In [113]:
# map investor class to new col in company df
inv_class_map = {"c":"num_comp_investors", "f":"num_fin_investors", "p":"num_ppl_investors"}

for cls in inv_class_map:
    inter_count = pd.DataFrame(
        df_investments[df_investments.investor_class == cls].groupby("funded_object_id").count()["investor_object_id"]).reset_index()
    inter_count.columns = ["object_id", inv_class_map[cls]]
    df_company_mod = df_company_mod.merge(inter_count, left_on="id", right_on="object_id", how="left")
    df_company_mod.drop("object_id", inplace=True, axis=1)
    
    # assign 0 for companies with 0 count
    df_company_mod[inv_class_map[cls]] = df_company_mod[inv_class_map[cls]].apply(lambda x: 0 if np.isnan(x) else x)

df_company_mod["total_investors"] = \
    df_company_mod["num_comp_investors"] + df_company_mod["num_fin_investors"] + df_company_mod["num_ppl_investors"]

> <ipython-input-113-5907344f22ed>(7)<module>()
-> inter_count = pd.DataFrame(
(Pdb) n
> <ipython-input-113-5907344f22ed>(8)<module>()
-> df_investments[df_investments.investor_class == cls].groupby("funded_object_id").count()["investor_object_id"]).reset_index()
(Pdb) inter_count.head()
  object_id  num_ppl_investors
0    c:1001                  2
1  c:100189                  1
2    c:1007                  4
3   c:10076                  2
4   c:10082                  2
(Pdb) n
> <ipython-input-113-5907344f22ed>(9)<module>()
-> inter_count.columns = ["object_id", inv_class_map[cls]]
(Pdb) inter_count.head()
  funded_object_id  investor_object_id
0           c:1001                   2
1         c:100189                   1
2           c:1007                   4
3          c:10076                   2
4          c:10082                   2
(Pdb) n
> <ipython-input-113-5907344f22ed>(10)<module>()
-> df_company_mod = df_company_mod.merge(inter_count, left_on="id", right_on="object_id", how=

In [175]:
df_company_mod.head()

Unnamed: 0,category_code,city,country_code,first_funding_at,first_investment_at,founded_at,funding_rounds,funding_total_usd,id,invested_companies,...,num_comp_investors,num_fin_investors,delta_funding,age,success,ppl_inv_successes,comp_inv_successes,fin_inv_successes,total_investor_successes,total_investors
0,games_video,Culver City,USA,,,,0,0,c:10,,...,0,0,,,1,0,0,0,0,0
1,games_video,San Mateo,USA,,,,0,0,c:100,,...,0,0,,,1,0,0,0,0,0
2,web,Mountain View,USA,2008-02-26,,2007-10-01,1,5000000,c:1001,,...,0,1,0.0,-3491.0,1,18,0,48,66,3
3,games_video,,,,,2008-08-22,0,0,c:10012,,...,0,0,,-3165.0,0,0,0,0,0,0
4,mobile,Palo Alto,USA,2008-09-01,,2008-03-01,1,0,c:10014,,...,1,0,0.0,-3339.0,1,0,3,0,3,1


In [115]:
# null values across all columns
nan_count = pd.DataFrame(df_company_mod.isnull().sum())
nan_count.columns = ["nan_count"]
nan_count = nan_count.T

In [116]:
nan_count

Unnamed: 0,category_code,city,country_code,first_funding_at,first_investment_at,founded_at,funding_rounds,funding_total_usd,id,invested_companies,...,milestones,overview,region,relationships,state_code,status,tag_list,num_ppl_investors,num_comp_investors,num_fin_investors
nan_count,2991,3662,3256,8013,12775,5728,7940,8549,0,12774,...,6693,1074,0,4658,6008,0,2394,0,0,0


In [118]:
def delta_cols(dt_1, dt_2):
    """
        Compute days delta for date/time columns.
    """
    if dt_1 is None or dt_2 is None:
        return np.nan
    return (dt_2 - dt_1).days

In [120]:
# temporal features
td_dt = dt.date.today()

df_company_mod["age"] = df_company_mod.apply(lambda x: delta_cols(x["founded_at"], td_dt), axis=1)

df_company_mod["delta_funding"] = df_company_mod.apply(lambda x: delta_cols(x["last_funding_at"], x["first_funding_at"]),axis=1)

# skip investments for now

In [122]:
# clean continuous cols
df_company_mod["funding_rounds"] = df_company_mod["funding_rounds"].apply(lambda x: 0 if np.isnan(x) else x)

df_company_mod["funding_total_usd"] = df_company_mod["funding_total_usd"].apply(lambda x: 0.0 if x is None else x)

df_company_mod["milestones"] = df_company_mod["milestones"].apply(lambda x: 0 if np.isnan(x) else x)

In [141]:
df_company_mod["success"] = df_company_mod["status"].apply(lambda x: 1 if x == "acquired" or x == "ipo" else 0)

In [169]:
# feature - number of successes (ipo or acquisition) across all investors

inv_cls_map = {"f": "fin_inv_successes", "c": "comp_inv_successes", "p": "ppl_inv_successes"}

for cls, colnm in inv_cls_map.iteritems():
    # group by investor id and funded object id
    inv_cls_filt = df_investments[df_investments.investor_class == cls]

    inv_fnd_grp = inv_cls_filt.groupby(["investor_object_id", "funded_object_id"]).count()
    old_cols = inv_fnd_grp.columns
    inv_fnd_grp = inv_fnd_grp.reset_index()
    inv_fnd_grp.columns = ["investor_id", "object_id"] + [col for col in old_cols]
    inv_fnd_grp.drop(old_cols, axis=1, inplace=True)

    # label funded objects as successful
    inv_fnd_grp = inv_fnd_grp.merge(df_company_mod[["id","success"]], left_on="object_id", right_on="id", how="inner")

    # group by investor id and sum successes
    inv_fnd_grp = inv_fnd_grp.groupby("investor_id").sum()["success"].reset_index()
    inv_fnd_grp.columns = ["investor_id", "num_success"]

    # group investments by funded companies
    fnd_inv_grp = inv_cls_filt.groupby(["funded_object_id", "investor_object_id"]).count()
    old_cols = fnd_inv_grp.columns
    fnd_inv_grp = fnd_inv_grp.reset_index()
    fnd_inv_grp.columns = ["object_id", "investor_id"] + [col for col in old_cols]
    fnd_inv_grp.drop(old_cols, axis=1, inplace=True)

    # merge with investor-success group
    fnd_inv_grp = fnd_inv_grp.merge(inv_fnd_grp, on="investor_id", how="inner")

    # group by funded companies and sum
    fnd_inv_grp = fnd_inv_grp.groupby("object_id").sum()["num_success"].reset_index()
    fnd_inv_grp.columns = ["object_id", colnm]
    
    df_company_mod = df_company_mod.merge(fnd_inv_grp, left_on="id", right_on = "object_id", how="left")
    df_company_mod.drop("object_id", inplace=True, axis=1)

    # fill nan values
    df_company_mod[colnm] = df_company_mod[colnm].apply(lambda x: 0 if np.isnan(x) else x)
    
# compute total number of successes across all types of investors
df_company_mod["total_investor_successes"] = \
    df_company_mod["fin_inv_successes"] + df_company_mod["comp_inv_successes"] + df_company_mod["ppl_inv_successes"]

In [117]:
print df_company_mod[df_company_mod.id == "c:10"].iloc[0]["overview"]
print df_company_mod[df_company_mod.id == "c:10"].iloc[0]["tag_list"]

Flektor is a rich-media mash-up platform that enables consumers to create, remix and share photos and videos on the internet without the need for advanced video-editing skills or software.

Fox Interactive Media, a division of News Corporation, announced that it had completed the purchase of Flektor on May 30, 2007.  The estimated puchase price is $15-20 million.
flektor, photo, video


In [200]:
# experimental - topic features from overview (no topic binning)
exper = df_company_mod.head()
exper = exper.reset_index()

In [183]:
v = TfidfVectorizer(stop_words="english")

In [184]:
x = v.fit_transform(exper["overview"])

In [188]:
x = x.toarray()

In [203]:
ov_feats = pd.DataFrame(x,columns=v.get_feature_names())
ov_feats = ov_feats.reset_index()

In [204]:
exper = exper.merge(ov_feats, on="index", how="inner")
exper.drop("index", inplace=True, axis=1)

In [21]:
# set target var
target_map = {class_var:idx for idx, class_var in enumerate(set(df_company_mod["status"].tolist()))}
df_company_mod["class"] = df_company_mod["status"].apply(lambda x: target_map[x])
df_company_mod.drop("status", inplace=True, axis=1)

In [35]:
# check unique cities and countries
# volume indicates whether column should be kept OHE
print len(set(df_company.city.tolist()))
print len(set(df_company.country_code.tolist()))
print len(set(df_company.state_code.tolist()))
print len(set(df_company.region.tolist()))

2132
84
52
1285


In [None]:
# OHE categorical columns

# drop_cols = ["state_code", "region", "city"]
# df_company_mod.drop(drop_cols, inplace=True, axis=1)

categorical_cols = ["category_code", "country_code"]
df_encode = pd.get_dummies(df_company_mod, columns=categorical_cols)

In [104]:
df_offices.head(2) # probably irrelevant outside of state, region, and city

Unnamed: 0,address1,address2,city,country_code,created_at,description,id,latitude,longitude,object_id,office_id,region,state_code,updated_at,zip_code
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,,1,47.603122,-122.333253,c:1,1,Seattle,WA,,98104
1,4900 Hopyard Rd,Suite 310,Pleasanton,USA,,Headquarters,2,37.692934,-121.904945,c:3,3,SF Bay,CA,,94588


In [107]:
df_people.head(2)

Unnamed: 0,affiliation_name,birthplace,first_name,id,last_name,object_id
0,Blue Nile,,Ben,1,Elowitz,p:2
1,Wetpaint,,Kevin,2,Flaherty,p:3


In [109]:
df_relationships.head(2)

Unnamed: 0,created_at,end_at,id,is_past,person_object_id,relationship_id,relationship_object_id,sequence,start_at,title,updated_at
0,2007-05-25 07:03:54,,1,0,p:2,1,c:1,8,,Co-Founder/CEO/Board of Directors,2013-06-03 09:58:46
1,2007-05-25 07:04:16,,2,1,p:3,2,c:1,279242,,VP Marketing,2010-05-21 16:31:34


In [None]:
# remove urls as we're not scraping data

In [None]:
# overview in objects table can be tokenized
# remove stop words thus having additional keywords that 'describe' company
# might need custom one hot encoding

In [None]:
# can use objects table as primary data store for companies (apply entity_type == "Company" filter)

In [None]:
# create single dataset for people

In [None]:
"""
Table Mapping
"""