# How to crawl data from Twitter without using the Twitter API
* Connect to the link
* Parse the HTML
* Store data

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [8]:
# Companies to search for. startID_list is parallel to company_list: the
# entry at the same position is the tweet ID used to start paging through
# that company's search results.
company_list = [
    "microsoft",
    "apple",
]
startID_list = [
    "869627961883803648",  # microsoft
    "864110182892261379",  # apple
]

**Connect to the link**
1. Open Developer Tools (right-click the page and choose 'Inspect' in Chrome).
2. Click 'Network' to find the real link which includes the information you need. Usually it is different from the website in your browser. In this case, the real link starts with 'timeline?'.
3. Click 'Headers' to find 'Request Headers' and 'Query String Parameters', which you will need when you set header and param.
4. Copy all the information in 'Request Headers' to header={}, except 'cookie'. Note that 'user-agent' is the most important element in this method.
5. Copy all the information in 'Query String Parameters' to param={}.
6. Set response.


In [3]:
# First test with: neurodiversity microsoft since:2015-01-01 until:2017-05-31
# https://twitter.com/search?l=en&q=neurodiversity%20microsoft%20%20since%3A2015-01-01%20until%3A2017-05-31&src=typd&lang=en
def connectTwitter(company,endID,startID,timeout=30):
    """Fetch and parse one page of Twitter search results for a company.

    The search query is fixed to ``since:2015-01-01 until:2017-05-31
    neurodiversity <company>``; paging is controlled through the
    ``max_position`` query parameter built from ``endID`` and ``startID``.

    Parameters
    ----------
    company : str
        Company name appended to the search query.
    endID : str
        Tweet ID placed first in ``max_position``. The caller passes the ID
        returned for the previous page, or ``startID`` for the first page.
    startID : str
        Tweet ID placed second in ``max_position``.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that a stalled connection raises
        instead of blocking the crawl forever. Defaults to 30 seconds.

    Returns
    -------
    tuple
        ``(soup, next_endID)`` where ``soup`` is the parsed page and
        ``next_endID`` is the ``data-item-id`` of the last ``<li>`` inside the
        first ``div.stream`` that carries one. ``(None, None)`` is returned
        when the page has no ``div.stream`` or no item with an ID, which the
        caller uses as the end-of-results signal.
    """
    # Use HTTPS directly (the referer below is HTTPS as well) instead of
    # relying on a redirect from the plain-HTTP URL.
    url = 'https://twitter.com/search'
    param = {
        'vertical':'default',
        'q':'since:2015-01-01 until:2017-05-31 neurodiversity '+company,# since:2015-01-01 until:2017-05-31
        'l':'en',
        'src':'typd',
        'include_available_features':'1',
        'include_entities':'1',
        'lang':'en',
        'max_position':'TWEET-'+endID+'-'+startID+'-BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA',
        'reset_error_state':'false'
    } # TWEET-857-857 means starting with the first page.
    header = {
        'accept':'application/json, text/javascript, */*; q=0.01',
        'accept-encoding':'gzip, deflate, sdch, br',
        'accept-language':'en,zh-CN;q=0.8,zh;q=0.6,en-US;q=0.4',
        'dnt':'1',
        'referer':'https://twitter.com/search?l=&q=&src=typd&vertical=&max_position=&',# Remove the information that
                                                                                       # param has included.
        'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'x-requested-with':'XMLHttpRequest',
        'x-twitter-active-user':'yes'
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url,params=param, headers=header, timeout=timeout)
    html = response.content.decode('utf-8')
    soup = BeautifulSoup(html,'lxml')

    try:
        # [0] raises IndexError when the page contains no div.stream at all.
        stream_items = soup.find_all('div',class_='stream')[0].find_all('li')
        # Not every <li> carries a data-item-id; keep only those that do.
        tweets_id = [item.get('data-item-id') for item in stream_items]
        tweets_id = [t for t in tweets_id if t is not None]
        # [-1] raises IndexError when no item on the page has an ID.
        next_endID = tweets_id[-1]
    except IndexError:
        # Either case above means there is nothing more to page through.
        return None,None

    return soup,next_endID

**Parse HTML and store data in a DataFrame**


In [4]:
# DOM Parser
def parser_DF(soup):
    """Extract the tweets of one parsed search-result page into a DataFrame.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page, as returned by ``connectTwitter``.

    Returns
    -------
    pandas.DataFrame
        One row per tweet ``<li>`` with the columns ``username``,
        ``verified`` (1 or 0), ``DateTime``, ``content``, ``likes``,
        ``retweet``, ``replies`` and ``image`` (URL or ``None``). The counts
        are kept as the attribute strings found in the HTML.
    """
    tweets_html = soup.find_all('li',class_='js-stream-item stream-item stream-item ') # note the trailing space

    def stat_count(tweet_html, action):
        # The counters sit in a visually hidden span whose inner span carries
        # the number in its data-tweet-stat-count attribute.
        wrapper = tweet_html.find('span',class_='ProfileTweet-action--'+action+' u-hiddenVisually')
        return wrapper.find('span').get('data-tweet-stat-count')

    # get user name
    tweets_username = [tweet_html.find('div').get('data-screen-name')
                           for tweet_html in tweets_html]
    # get DateTime
    tweets_datetime = [tweet_html.find('small',class_='time').find('a').get('title')
                           for tweet_html in tweets_html]
    # get content
    tweets_content = [tweet_html.find('p',class_='TweetTextSize js-tweet-text tweet-text').text.strip()
                          for tweet_html in tweets_html]
    # get replies, retweets, likes
    tweets_replies = [stat_count(tweet_html, 'reply') for tweet_html in tweets_html]
    tweets_retweet = [stat_count(tweet_html, 'retweet') for tweet_html in tweets_html]
    tweets_likes = [stat_count(tweet_html, 'favorite') for tweet_html in tweets_html]

    # get image link (None when the tweet has no single-photo media block)
    tweets_image = []
    for tweet_html in tweets_html:
        photo = tweet_html.find('div',class_='AdaptiveMedia-singlePhoto')
        tweets_image.append(photo.find('div').get('data-image-url') if photo is not None else None)

    # get verified account: 1 when the verified icon is present, else 0.
    # (A leftover debug lookup of tweets_html[6] used to live here and raised
    # IndexError on every page with fewer than seven tweets.)
    tweets_verified = [0 if tweet_html.find('span',class_='Icon Icon--verified') is None else 1
                           for tweet_html in tweets_html]

    # store data to dataframe
    tweets = [('username',tweets_username),
              ('verified',tweets_verified),
              ('DateTime',tweets_datetime),
              ('content',tweets_content),
              ('likes',tweets_likes),
              ('retweet',tweets_retweet),
              ('replies',tweets_replies),
              ('image',tweets_image)]

    # DataFrame.from_items was removed in pandas 1.0; build from a dict and
    # pass the column order explicitly so it is the same on every version.
    column_order = [name for name, _ in tweets]
    tweets_df = pd.DataFrame(dict(tweets), columns=column_order)

    return tweets_df

Call Method

In [9]:
# Page through the search results of every company. Each page's last tweet ID
# becomes the endID of the next request; connectTwitter returns (None, None)
# once there is nothing left, and that final empty result is not stored.
companies_html = [] # global variable
for i, company in enumerate(company_list):
    start_id = startID_list[i]
    soup_list = []
    id_list = []

    # First page: startID and endID are the same.
    end_id = start_id
    while True:
        page_soup, next_id = connectTwitter(company, end_id, start_id)
        if next_id is None:
            break
        soup_list.append(page_soup)
        id_list.append(next_id)
        end_id = next_id

    companies_html.append(soup_list)

In [10]:
# Parse every stored page and merge the pages of each company into a single
# DataFrame, which is also written to '<company>.csv'.
companies_df_list = [] # global variable
for j in range(len(companies_html)):
    tweets_df_list = [parser_DF(soup) for soup in companies_html[j]]
    if tweets_df_list:
        company_df = pd.concat(tweets_df_list,ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list. A company without any
        # result page gets an empty frame so the remaining companies are still
        # processed and companies_df_list stays aligned with company_list.
        company_df = pd.DataFrame()
    companies_df_list.append(company_df)
    company_df.to_csv(company_list[j]+'.csv')

In [11]:
# Show the merged tweets for the first entry of company_list ('microsoft').
companies_df_list[0]

Unnamed: 0,username,verified,DateTime,content,likes,retweet,replies,image
0,nicolelweston,0,11:45 AM - 30 May 2017,More companies are reforming their HR processe...,0,0,0,
1,zipmydress,0,3:32 PM - 23 May 2017,ZMD is @Microsoft #Inclusion Conference! Ferna...,0,0,0,
2,MikeCamel,0,8:33 AM - 4 May 2017,"Attending a session on ""Neurodiversity in the ...",1,0,0,
3,MSFTEnable,1,1:19 AM - 26 Apr 2017,"""Neurodiversity in the workforce"" with Stu Sha...",5,4,0,
4,100PercentWine,0,2:59 PM - 24 Apr 2017,@Microsoft's neurodiversity program for people...,4,4,0,
5,derickfisher,0,2:09 PM - 22 Apr 2017,@microsoft and other have been adapting to #ne...,0,0,0,
6,morl8tr,0,11:18 AM - 13 Apr 2017,"SAP, HPE, Microsoft, and EY sharing know-how a...",7,0,0,https://pbs.twimg.com/media/C9T_NuVUwAIy7PC.jpg
7,JenBeason,0,10:32 AM - 13 Apr 2017,So cool that @HPE @sap @microsoft + more comin...,4,0,1,https://pbs.twimg.com/media/C9T0rLaUMAE1PyA.jpg
8,leecm363,0,9:39 AM - 12 Apr 2017,The Neurodiversity Movement Finds a Friend in ...,3,4,0,
9,PRFund,0,8:50 AM - 1 Apr 2017,#Dyslexia and Neurodiversity in the High Tech ...,2,0,0,


In [12]:
# Show the merged tweets for the second entry of company_list ('apple').
companies_df_list[1]

Unnamed: 0,username,verified,DateTime,content,likes,retweet,replies,image
0,JosCuadros,0,7:10 PM - 7 Mar 2017,#Apple #Uber #Amazon #BrillantMind #Mind #Neur...,1,0,0,
1,LeesaVlahosMP,1,9:10 AM - 25 Oct 2016,@Apple support vision hearing physical motor n...,0,1,0,
2,dontdismyabilit,0,9:28 AM - 3 Sep 2016,Doesn't matter how great you are if no one can...,3,7,0,https://pbs.twimg.com/media/CrcVFSRWcAApAbE.jpg
3,Rom_Duck,0,8:38 AM - 11 May 2016,#FollowMe Rom_duck #entrepreneurship #happyTi...,1,0,0,
4,kit_boyer,0,9:32 PM - 4 Apr 2016,Apple's new film shows how tech transforms liv...,0,0,0,https://pbs.twimg.com/media/CfQJKdaWQAI0m61.jpg
5,jrtgirl35,0,1:37 PM - 2 Apr 2016,@tim_cook thankyou for promoting #AutismAccept...,0,0,0,
6,KatKnoernschild,0,10:50 AM - 2 Apr 2016,#AutismAwareness ad by #Apple 'Dylan's Voice'....,0,0,0,
7,excludenone,1,5:34 PM - 16 Jan 2016,@HuffingtonPost how does Apple measure up in #...,0,0,0,
8,AutismAdvoCat,0,8:46 AM - 18 Oct 2015,Interesting \nApple working with Autism\nresea...,1,1,0,
9,TraysJourney,0,7:56 PM - 6 Sep 2015,Hi @brittopopart your apple sculpture was in a...,0,0,0,
