# How to crawl data from Twitter without using Twitter API
* Connect to the link
* Parse the HTML
* Store data

**Connect to the link**
1. Open Developer Tools (right-click the page and choose 'Inspect' in Chrome).
2. Click 'Network' to find the actual request URL that returns the information you need. It is usually different from the URL shown in your browser's address bar. In this case, the request starts with 'timeline?'.
3. Click 'Headers' to find 'Request Headers' and 'Query String Parameters', which you will need when you set header and param.
4. Copy all the information in 'Request Headers' to header={}, except 'cookie'. Note that 'user-agent' is the most important element in this method.
5. Copy all the information in 'Query String Parameters' to param={}.
6. Send the request and store the response.


In [64]:
# First test: search "neurodiversity microsoft" since:2015-01-01 until:2017-05-31
# https://twitter.com/search?l=en&q=neurodiversity%20microsoft%20%20since%3A2015-01-01%20until%3A2017-05-31&src=typd&lang=en
import requests
url = 'http://twitter.com/search'
# Query-string parameters, copied from 'Query String Parameters' in Developer Tools.
param = {
    'vertical':'default',
    'q':'neurodiversity microsoft since:2015-01-01 until:2017-05-31',
    'l':'en',
    'src':'typd',
    'include_available_features':'1',
    'include_entities':'1',
    'lang':'en',
    'max_position':'TWEET-869627961883803648-869627961883803648-BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA',
    'reset_error_state':'false'
}
# NOTE(review): per the original note, a max_position of the form TWEET-<id>-<id>
# (the same ID twice) starts at the first page -- confirm.

# Request headers, copied from 'Request Headers' in Developer Tools (without 'cookie').
# NOTE(review): 'accept-encoding' advertises 'sdch' and 'br'; confirm the installed
# requests/urllib3 can decode whatever encoding the server actually picks.
header = {
    'accept':'application/json, text/javascript, */*; q=0.01',
    'accept-encoding':'gzip, deflate, sdch, br',
    'accept-language':'en,zh-CN;q=0.8,zh;q=0.6,en-US;q=0.4',
    'dnt':'1',
    # The referer keeps only the parameter names; their values are already sent via param.
    'referer':'https://twitter.com/search?l=&q=&src=typd&vertical=&max_position=&',
    'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'x-requested-with':'XMLHttpRequest',
    'x-twitter-active-user':'yes'
}
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url,params=param, headers=header, timeout=30)
html = response.content.decode('utf-8')

**Parse HTML**

In [65]:
from bs4 import BeautifulSoup
# Parse the downloaded page with the lxml parser.
soup = BeautifulSoup(markup=html, features='lxml')
# Uncomment to inspect the parsed document:
# print(soup.prettify())

In [66]:
# Tips
# find --> returns the first match; find_all --> returns every match
# find --> looks up a tag;          get      --> reads an attribute
# get tweet ID
# Look the 'stream' container up explicitly so that a page without one yields an
# empty list instead of leaving tweets_id undefined for the print below.
stream_divs = soup.find_all('div',class_='stream')
tweets_html_id = stream_divs[0].find_all('li') if stream_divs else []
# Only some of the 'li' elements carry 'data-item-id'; drop the ones that do not.
tweets_id = [item_id
             for item_id in (li.get('data-item-id') for li in tweets_html_id)
             if item_id is not None]

print(tweets_id)
# Reading 'data-item-id' from 'div' elements instead would give duplicate tweet IDs,
# because some users retweet others.

['869625787506466817', '867146272263811072', '860155419876106240', '857147106062200832', '856628684752551936', '855891433613656068', '852586842352361472', '852575268938166272', '852199404274634753', '848200843912523776', '844267248306991104', '835202643278639104', '830460184606756865', '817085767138709504']


In [None]:
# get tweets
# Note the trailing space inside the class string.
tweets_html = soup.find_all(
    name='li',
    class_='js-stream-item stream-item stream-item ',
)

In [173]:
# get user name
# The screen name is read from the first 'div' inside each tweet item.
tweets_username = [
    first_div.get('data-screen-name')
    for first_div in (item.find('div') for item in tweets_html)
]
print(tweets_username)

['100PercentWine', 'derickfisher', 'morl8tr', 'JenBeason', 'leecm363', 'PRFund', 'FRCteam4911', 'RyanDCook', 'onBoardCanada', 'MicrosoftSV', 'Jill_T_Elliott', 'draorose1']


In [178]:
# get DateTime
# The timestamp text is the 'title' attribute of the link inside <small class="time">.
tweets_datetime = [
    time_link.get('title')
    for time_link in (item.find('small', class_='time').find('a') for item in tweets_html)
]
print(tweets_datetime)

['2:59 PM - 24 Apr 2017', '2:09 PM - 22 Apr 2017', '11:18 AM - 13 Apr 2017', '10:32 AM - 13 Apr 2017', '9:39 AM - 12 Apr 2017', '8:50 AM - 1 Apr 2017', '12:19 PM - 21 Mar 2017', '11:00 AM - 24 Feb 2017', '8:55 AM - 11 Feb 2017', '11:10 AM - 5 Jan 2017', '8:31 AM - 28 Dec 2016', '11:43 AM - 26 Dec 2016']


In [196]:
# get content
# Tweet text with surrounding whitespace stripped.
tweets_content = [
    paragraph.text.strip()
    for paragraph in (
        item.find('p', class_='TweetTextSize js-tweet-text tweet-text')
        for item in tweets_html
    )
]
print(tweets_content)

["@Microsoft's neurodiversity program for people living with #Autism. Uses unique talents to make a better workplace. http://bit.ly/2cryBSV", '@microsoft and other have been adapting to #neurodiversity http://s.hbr.org/2oWyVC6', 'SAP, HPE, Microsoft, and EY sharing know-how about neurodiversity employment programs. Palo Alto. #AutismatWork.pic.twitter.com/cHbbAQTSPl', 'So cool that @HPE @sap @microsoft + more coming together to discuss importance of #inclusivehiring! #WeAreOne #AutismatWork #neurodiversity pic.twitter.com/C21PYNijRp', 'The Neurodiversity Movement Finds a Friend in Microsoft, But Tough Questions About Employment Lie Ahead | Transmosis http://buff.ly/2dPJDDL', '#Dyslexia and Neurodiversity in the High Tech Workforce ‚Äì Stu Shader Microsoft Silicon Valley http://ow.ly/2Wx630akcL7\xa0 #PRF', '@Microsoft we are looking forward to a continued partnership with @UnifiedRobotics as well!  #neurodiversity #inclusionhttps://twitter.com/timshriver/status/844137867257380865\xa0‚Ä¶

In [214]:
# get replies, retweets, likes
# Each count is the 'data-tweet-stat-count' attribute of the 'span' nested in the
# matching visually-hidden action 'span'.
tweets_replies = [
    counter.get('data-tweet-stat-count')
    for counter in (
        item.find('span', class_='ProfileTweet-action--reply u-hiddenVisually').find('span')
        for item in tweets_html
    )
]
print(tweets_replies)

tweets_retweet = [
    counter.get('data-tweet-stat-count')
    for counter in (
        item.find('span', class_='ProfileTweet-action--retweet u-hiddenVisually').find('span')
        for item in tweets_html
    )
]
print(tweets_retweet)

tweets_likes = [
    counter.get('data-tweet-stat-count')
    for counter in (
        item.find('span', class_='ProfileTweet-action--favorite u-hiddenVisually').find('span')
        for item in tweets_html
    )
]
print(tweets_likes)

['0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0']
['4', '0', '0', '0', '4', '0', '1', '2', '3', '2', '1', '0']
['4', '0', '7', '4', '3', '2', '3', '1', '8', '7', '2', '1']


In [225]:
# get image link
# Tweets without a single-photo container get None so the list stays aligned with tweets_html.
tweets_image_html = [item.find('div', class_='AdaptiveMedia-singlePhoto') for item in tweets_html]
tweets_image = [
    None if photo is None else photo.find('div').get('data-image-url')
    for photo in tweets_image_html
]
print(tweets_image)

[None,
 None,
 'https://pbs.twimg.com/media/C9T_NuVUwAIy7PC.jpg',
 'https://pbs.twimg.com/media/C9T0rLaUMAE1PyA.jpg',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [276]:
# get verified account
# 1 when the tweet markup contains the verified icon, otherwise 0.
# (The stray tweets_html[6] lookup was removed: it had no effect and raised
# IndexError whenever the page held fewer than seven tweets.)
tweets_verified_html = [tweet_html.find('span',class_='Icon Icon--verified') for tweet_html in tweets_html]
tweets_verified = [0 if badge is None else 1 for badge in tweets_verified_html]
print(tweets_verified)

[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]


**Store Data**

In [261]:
# Dump the raw tweet elements to inspect their markup.
print(tweets_html)

[<li class="js-stream-item stream-item stream-item " data-item-id="856628684752551936" data-item-type="tweet" id="stream-item-tweet-856628684752551936">
<div class="tweet js-stream-tweet js-actionable-tweet js-profile-popup-actionable dismissible-content original-tweet js-original-tweet has-cards " data-card2-type="summary_large_image" data-component-context="tweet" data-conversation-id="856628684752551936" data-disclosure-type="" data-follows-you="false" data-has-cards="true" data-item-id="856628684752551936" data-mentions="Microsoft" data-name="DrinkWineHelpPeople" data-permalink-path="/100PercentWine/status/856628684752551936" data-reply-to-users-json='[{"id_str":"2835526080","screen_name":"100PercentWine","name":"DrinkWineHelpPeople","emojified_name":{"text":"DrinkWineHelpPeople","emojified_text_as_html":"DrinkWineHelpPeople"}},{"id_str":"74286565","screen_name":"Microsoft","name":"Microsoft","emojified_name":{"text":"Microsoft","emojified_text_as_html":"Microsoft"}}]' data-screen-

In [283]:
# merge data to dataframe
import pandas as pd
# (column name, values) pairs; every list has one entry per tweet.
tweets = [('username',tweets_username),
          ('verified',tweets_verified),
          ('DateTime',tweets_datetime),
          ('content',tweets_content),
          ('likes',tweets_likes),
          ('retweet',tweets_retweet),
          ('replies',tweets_replies),
          ('image',tweets_image)]

# DataFrame.from_items was removed in pandas 1.0. Build from a dict instead and
# pass columns= so the column order is the order of the pairs above.
tweets_df = pd.DataFrame(dict(tweets), columns=[name for name, _ in tweets])

In [285]:
# Save the collected tweets to 'tweets_df.csv'.
tweets_df.to_csv('tweets_df.csv')

In [None]:
########################################################################################################################

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# One search keyword per company; connectTwitter appends it to the query.
company_list = ['microsoft']
# Tweet IDs paired by position with company_list; used to build max_position
# for that company's requests.
startID_list = ['869627961883803648']

In [76]:
# First test: search "neurodiversity microsoft" since:2015-01-01 until:2017-05-31
# https://twitter.com/search?l=en&q=neurodiversity%20microsoft%20%20since%3A2015-01-01%20until%3A2017-05-31&src=typd&lang=en

def connectTwitter(company,endID,startID,timeout=30):
    """Fetch one page of Twitter search results for 'neurodiversity <company>'.

    Args:
        company: Keyword appended to the fixed search query.
        endID: Tweet ID placed first in the ``max_position`` cursor.
        startID: Tweet ID placed second in the ``max_position`` cursor.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            a stalled connection would block the crawl forever.

    Returns:
        A ``(soup, next_endID)`` tuple. ``soup`` is the parsed response and
        ``next_endID`` is the last ``data-item-id`` found in its first
        'stream' div. Both are ``None`` when the response has no 'stream'
        div or that div holds no item IDs.
    """
    url = 'http://twitter.com/search'
    param = {
        'vertical':'default',
        'q':'since:2015-01-01 until:2017-05-31 neurodiversity '+company,
        'l':'en',
        'src':'typd',
        'include_available_features':'1',
        'include_entities':'1',
        'lang':'en',
        'max_position':'TWEET-'+endID+'-'+startID+'-BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA',
        'reset_error_state':'false'
    }
    # NOTE(review): per the original note, passing the same ID as endID and
    # startID starts at the first page -- confirm.
    header = {
        'accept':'application/json, text/javascript, */*; q=0.01',
        'accept-encoding':'gzip, deflate, sdch, br',
        'accept-language':'en,zh-CN;q=0.8,zh;q=0.6,en-US;q=0.4',
        'dnt':'1',
        # The referer keeps only the parameter names; their values are already sent via param.
        'referer':'https://twitter.com/search?l=&q=&src=typd&vertical=&max_position=&',
        'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'x-requested-with':'XMLHttpRequest',
        'x-twitter-active-user':'yes'
    }
    response = requests.get(url,params=param, headers=header, timeout=timeout)
    html = response.content.decode('utf-8')
    soup = BeautifulSoup(html,'lxml')

    # Explicit emptiness checks replace the former catch-all on IndexError.
    stream_divs = soup.find_all('div',class_='stream')
    if not stream_divs:
        return None,None

    # Only some 'li' elements carry 'data-item-id'; ignore the rest.
    tweets_id = [li.get('data-item-id') for li in stream_divs[0].find_all('li')]
    tweets_id = [t for t in tweets_id if t is not None]
    if not tweets_id:
        return None,None

    # The last ID on this page becomes the cursor for the next page.
    return soup,tweets_id[-1]

In [90]:
# Automatically get lastID and go to next page
# companies_html gets one list of parsed pages per company, in company_list order.
companies_html = []
for company, start_id in zip(company_list, startID_list):
    soup_list = []  # parsed pages for this company, in crawl order
    id_list = []    # last tweet ID of each stored page
    end_id = start_id  # first page: startID and endID are the same
    while True:
        page_soup, next_id = connectTwitter(company, end_id, start_id)
        # None marks a page without tweet IDs, which ends the crawl. An ID that
        # was already seen means no progress, so stop rather than loop forever.
        if next_id is None or next_id in id_list:
            break
        soup_list.append(page_soup)
        id_list.append(next_id)
        end_id = next_id

    companies_html.append(soup_list)

In [None]:
def parser_DF(soup=None):
    """Turn one parsed search-result page into a DataFrame of tweets.

    Args:
        soup: Parsed page to read tweets from, e.g. one of the pages
            collected in ``companies_html``. When omitted, the
            notebook-level ``html`` string is parsed, which is what the
            function did before this parameter existed.

    Returns:
        A DataFrame with one row per tweet and the columns username,
        verified, DateTime, content, likes, retweet, replies and image.
        A page without tweets gives an empty DataFrame with those columns.
    """
    if soup is None:
        soup = BeautifulSoup(html,'lxml')
    # Note the trailing space inside the class string.
    tweets_html = soup.find_all('li',class_='js-stream-item stream-item stream-item ')

    columns = ['username','verified','DateTime','content',
               'likes','retweet','replies','image']
    data = {
        # get user name
        'username': [tweet_html.find('div').get('data-screen-name')
                     for tweet_html in tweets_html],
        # get verified account: 1 when the verified icon is present, else 0.
        # (The stray tweets_html[6] lookup was removed: it raised IndexError
        # for pages with fewer than seven tweets.)
        'verified': [0 if tweet_html.find('span',class_='Icon Icon--verified') is None else 1
                     for tweet_html in tweets_html],
        # get DateTime
        'DateTime': [tweet_html.find('small',class_='time').find('a').get('title')
                     for tweet_html in tweets_html],
        # get content
        'content': [tweet_html.find('p',class_='TweetTextSize js-tweet-text tweet-text').text.strip()
                    for tweet_html in tweets_html],
        # get replies, retweets, likes
        'likes': [_tweet_stat_count(tweet_html,'favorite') for tweet_html in tweets_html],
        'retweet': [_tweet_stat_count(tweet_html,'retweet') for tweet_html in tweets_html],
        'replies': [_tweet_stat_count(tweet_html,'reply') for tweet_html in tweets_html],
        # get image link
        'image': [_tweet_image_url(tweet_html) for tweet_html in tweets_html],
    }
    # DataFrame.from_items was removed in pandas 1.0; columns= pins the column order.
    return pd.DataFrame(data, columns=columns)


def _tweet_stat_count(tweet_html, action):
    """Return the hidden counter value for 'reply', 'retweet' or 'favorite'."""
    holder = tweet_html.find('span',class_='ProfileTweet-action--'+action+' u-hiddenVisually')
    return holder.find('span').get('data-tweet-stat-count')


def _tweet_image_url(tweet_html):
    """Return the tweet's single-photo URL, or None when it has no such photo."""
    photo = tweet_html.find('div',class_='AdaptiveMedia-singlePhoto')
    if photo is None:
        return None
    return photo.find('div').get('data-image-url')