In [4]:
%matplotlib inline
from bs4 import BeautifulSoup
import urllib2
import urllib
import json
import csv
import time
import pickle
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
import sys
sns.set_style("whitegrid")
sns.set_context("poster")

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

Helpers for backing up the data we collect. If the Twitter scrape stops for any reason, we want everything scraped so far to already be saved to disk.

In [5]:
def backup(obj, info):
    """Pickle `obj` to 'twit_data<info>.pkl' in the current working directory.

    obj  -- any picklable object (here, the dictionary of scraped tweets).
    info -- value converted with str() and appended to the 'twit_data' file stem.
    """
    target = ''.join(["twit_data", str(info), '.pkl'])
    with open(target, 'wb') as out_file:
        # Highest protocol available to the running interpreter.
        pickle.dump(obj, out_file, pickle.HIGHEST_PROTOCOL)
        
def load_obj(name):
    """Return the object unpickled from the file '<name>.pkl'.

    name -- path of the pickle file without its '.pkl' extension.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were written by this notebook (or another trusted source).
    """
    handle = open(name + '.pkl', 'rb')
    try:
        return pickle.load(handle)
    finally:
        handle.close()

In [6]:
# Read the funding table, then blank out both missing cells and the
# literal 'Unknown' placeholder so "no value" is always the empty string.
funding_df = (
    pd.read_csv("funding.csv")
    .fillna('')
    .replace(['Unknown'], [''])
)
funding_df.head(15)

Unnamed: 0,Description,Market,Names,No_Stage_Amount,No_Stage_Date,Pitch,Seed_Amount,Seed_Date,Series_A_Amount,Series_A_Date,Series_B_Amount,Series_B_Date,Series_C_Amount,Series_C_Date,Series_D_Amount,Series_D_Date,Stage
0,,Cable,Epic-Sciences,,,,,,,,"$13,000,000","Nov 13, 2012","$30,000,000","Jul 30, 2014",,,Series C
1,,All Students,Apreso-Classroom,,,,,,,,"$15,000,000","Oct 14, 2008",,,,,Series B
2,Visualead (视觉码) creates better interactions be...,Bridging Online and Offline,Visualead,,,Effective and Secure Offline to Mobile experie...,"$750,000","Mar 25, 2012","$1,600,000","Aug 15, 2013",,"Jan 20, 2015",,,,,Series B
3,,Food Processing,Onshift,"$7,000,000","Feb 3, 2014",,,,,,"$3,000,000","Feb 2, 2012",,,,,Series C
4,,-,Xendex-Holding,,,,,,,"Jun 25, 2008","$3,500,000","Nov 30, 2009",,,,,Series A
5,Palo Alto-based FilmLoop has reportedly layed ...,Software,filmloop,,,,,,"$5,600,000","Feb 1, 2005","$7,000,000","May 1, 2006",,,,,Series A
6,,,Mochi-Media,,,,,,"$4,000,000","Mar 12, 2008","$10,000,000","Jun 18, 2008",,,,,
7,,-,SkyRecon-Systems,,,,,,"$3,730,000","Sep 12, 2005","$6,500,000","Mar 12, 2007",,,,,Series A
8,Challenge your brain with games designed by ne...,,Lumos-Labs,,,Creator of Lumosity,"$450,000","Jun 11, 2007","$3,100,000","Jun 3, 2008",,,"$32,500,000","Jun 16, 2011","$31,500,000","Aug 22, 2012",
9,"ROBLOX is a comprehensive 3D creation, publish...",Blockchains,ROBLOX,,,User-Generated Online Gaming Platform,,,"$2,200,000","Aug 14, 2009","$4,000,000","Jun 14, 2011",,,,,Series A


In [7]:
# Three-letter month abbreviation -> month number (1-12).
months = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}

def find_dates(date_str):
    """Compute the search window that precedes a funding date.

    date_str -- date text such as 'Aug 15, 2013': a three-letter month
                abbreviation first and a four-digit year last.

    Returns a (start_date, end_date) tuple of 'YYYY-M-15' strings, where
    end_date is the 15th of the month before the funding month and
    start_date is the same day ten years earlier. Returns None when the
    string is shorter than 8 characters, the month abbreviation is not
    recognised, or the last four characters are not all digits.
    """
    if len(date_str) < 8:
        return None
    yr = date_str[-4:]
    mon_num = months.get(date_str[:3])
    if not yr.isdigit() or mon_num is None:
        return None

    end_year = int(yr)
    mon_num -= 1
    if mon_num == 0:
        # The month before January is December of the *previous* year;
        # without the year adjustment the window would end after the round.
        mon_num = 12
        end_year -= 1

    end_date = str(end_year) + '-' + str(mon_num) + '-15'
    start_date = str(end_year - 10) + '-' + str(mon_num) + '-15'
    return (start_date, end_date)

In [8]:
def _extract_round(df, letter):
    """Return the rows of `df` that have an amount for funding series `letter`.

    df     -- frame with 'Names', 'Series_<letter>_Amount' and
              'Series_<letter>_Date' columns; an empty string means "no value".
    letter -- series letter, e.g. 'A'.

    The result has columns Names, Series_Amount, Series_Date and Series_Type
    (set to 'Series_<letter>') and keeps the original row index.
    """
    amount_col = 'Series_%s_Amount' % letter
    date_col = 'Series_%s_Date' % letter
    # .loc + .copy() yields an independent frame, so the column assignments
    # below never write into a slice of the source frame.
    rounds = df.loc[df[amount_col] != '', ['Names', amount_col, date_col]].copy()
    rounds.columns = ['Names', 'Series_Amount', 'Series_Date']
    rounds['Series_Type'] = 'Series_' + letter
    return rounds

series_A = _extract_round(funding_df, 'A')
series_B = _extract_round(funding_df, 'B')
series_C = _extract_round(funding_df, 'C')
series_D = _extract_round(funding_df, 'D')

# Stack the per-series frames into one long table: one row per (company, round).
funding_collapsed = pd.concat([series_A, series_B, series_C, series_D])
funding_collapsed.head()

Unnamed: 0,Names,Series_Amount,Series_Date,Series_Type
2,Visualead,"$1,600,000","Aug 15, 2013",Series_A
5,filmloop,"$5,600,000","Feb 1, 2005",Series_A
6,Mochi-Media,"$4,000,000","Mar 12, 2008",Series_A
7,SkyRecon-Systems,"$3,730,000","Sep 12, 2005",Series_A
8,Lumos-Labs,"$3,100,000","Jun 3, 2008",Series_A


This is the list of startup funding rounds and their respective dates. We now want to query Twitter for the last 200 tweets before the date of each funding round. The code below does this using a Selenium web driver. We tried using the Twitter API, but it was unsuccessful because Twitter only allows you to scrape the last couple of days.

In [69]:
def _scroll_to_bottom(browser, max_scrolls):
    """Scroll the loaded page down until it stops moving or max_scrolls is reached."""
    y_pos, y_pos_old = 0, -1
    for _ in range(max_scrolls):
        if y_pos == y_pos_old:
            # The previous scroll did not move the page, so stop early.
            break
        time.sleep(1)
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        y_pos_old = y_pos
        y_pos = int(browser.execute_script("return window.scrollY;"))


def _extract_tweets(html_source):
    """Return str() of every <li> element whose id starts with 'stream-item-tweet'."""
    soup = BeautifulSoup(html_source, "lxml")
    return [str(item) for item in soup.find_all("li")
            if item.get('id') is not None
            and item.get('id').startswith('stream-item-tweet')]


comps = [name.replace('-', '') for name in list(funding_collapsed['Names'])]
series = list(funding_collapsed['Series_Type'])
dates = list(funding_collapsed['Series_Date'])

# Keep results already in the kernel (e.g. restored with load_obj after an
# earlier partial run); otherwise start empty so the cell works on a fresh kernel.
try:
    comp_tweets
except NameError:
    comp_tweets = {}
try:
    errors
except NameError:
    errors = []

# This was parallelized across multiple notebooks run by multiple people:
# each notebook handles its own [START_INDEX, END_INDEX) slice of the rounds.
START_INDEX = 158
END_INDEX = 1000
MAX_SCROLLS = 11      # upper bound on scroll-to-bottom attempts per search page
BACKUP_EVERY = 50     # checkpoint comp_tweets every this many indices

browser = webdriver.Firefox()
max_iters = len(comps)
start = time.time()

try:
    # Clamp the slice so it can never run past the end of the data.
    for i in range(START_INDEX, min(END_INDEX, max_iters)):
        # Bind these before the try so the error handler always reports the
        # round that actually failed, not values left over from a prior pass.
        comp_name = comps[i]
        round_fund = series[i]
        date = dates[i]
        try:
            if i % BACKUP_EVERY == 0:
                mins = (time.time() - start) / 60
                print('Percent progress ' + str(100.0 * i / max_iters) + ' running for ' + str(mins) + ' mins')
                backup(comp_tweets, i)

            start_end_dates = find_dates(date)
            if start_end_dates is None:
                errors.append((comp_name, round_fund, date))
                continue
            start_date, end_date = start_end_dates

            rounds_for_comp = comp_tweets.setdefault(comp_name, {})
            if round_fund in rounds_for_comp:
                # This (company, round) pair was already handled; log and skip it.
                errors.append((comp_name, round_fund, date))
                continue
            rounds_for_comp[round_fund] = []

            url = "https://twitter.com/search?q=" + comp_name + "%20since%3A" + start_date + "%20until%3A" + end_date + "&src=typd&lang=en"

            browser.get(url)
            _scroll_to_bottom(browser, MAX_SCROLLS)
            rounds_for_comp[round_fund].extend(_extract_tweets(browser.page_source))

        except KeyboardInterrupt:
            # Manual stop: save everything scraped so far, then leave the loop
            # cleanly rather than killing the kernel.
            backup(comp_tweets, str(i) + '_interrupted')
            break
        except Exception:
            # Best effort: record the failing round and carry on with the next.
            errors.append((comp_name, round_fund, date))
finally:
    # Always release the Firefox instance, even after an interrupt.
    browser.quit()

Percent progress 3.33667000334 running for 2.48515653213 mins
Percent progress 4.17083750417 running for 13.5679881175 mins
Percent progress 5.00500500501 running for 29.8278889497 mins
Percent progress 5.83917250584 running for 47.3346038342 mins
Percent progress 6.67334000667 running for 60.4890218178 mins
Percent progress 7.50750750751 running for 90.4858969013 mins
Percent progress 8.34167500834 running for 114.817298734 mins
Percent progress 9.17584250918 running for 144.381359633 mins
Percent progress 10.01001001 running for 158.611524598 mins
Percent progress 10.8441775108 running for 177.963723048 mins
Percent progress 11.6783450117 running for 201.168078498 mins
Percent progress 12.5125125125 running for 210.130816066 mins
Percent progress 13.3466800133 running for 231.395539149 mins
Percent progress 14.1808475142 running for 249.466241948 mins
Percent progress 15.015015015 running for 274.641155465 mins
Percent progress 15.8491825158 running for 296.378485099 mins


In [70]:
# Final backup of the data: backup() pickles comp_tweets to 'twit_data1.pkl'
# (file name is "twit_data" + str(1) + ".pkl") in the working directory.
backup(comp_tweets, 1)