In [0]:
# !pip3 install tweepy

Defaulting to user installation because normal site-packages is not writeable
Collecting tweepy
  Downloading tweepy-3.8.0-py2.py3-none-any.whl (28 kB)
Collecting PySocks>=1.5.7
  Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Installing collected packages: PySocks, tweepy
Successfully installed PySocks-1.7.1 tweepy-3.8.0
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.


In [0]:
import os
import urllib
from datetime import datetime
import sys
import pandas as pd

try:
    import json
except ImportError:
    import simplejson as json

# Import the tweepy library
import tweepy

import logging
# Root logging configuration: emit DEBUG and above, with a millisecond
# timestamp, the level name and the (padded) thread name on every record.
logging.basicConfig(
    format='[%(asctime)s.%(msecs)03d] [%(levelname)s] (%(threadName)-9s) %(message)s',
    datefmt='%m-%d %H:%M:%S',
    level=logging.DEBUG,
)

# Logger named after the current module.
logger = logging.getLogger(__name__)



In [0]:
def get_tweets_by_hashtag(auth, textToSearch, num_of_results):
    """Search English tweets matching a query and return them as plain dicts.

    Args:
        auth: tweepy auth handler passed to ``tweepy.API``.
        textToSearch: query string handed to the search endpoint as ``q``.
        num_of_results: maximum number of tweets pulled from the cursor.

    Returns:
        A list of dicts with the keys ``created_at``, ``full_text``, ``name``,
        ``user_id`` and ``tweet_id``. For retweets the fields describe the
        original (retweeted) status and an ``expanded_url`` key is added,
        holding the first media URL of that status or ``None`` when it
        carries no media.
    """
    api = tweepy.API(auth)
    # `count` is the page-size parameter of the v1.1 search endpoint; the
    # legacy `rpp` name is not recognised there.
    cursor = tweepy.Cursor(api.search, q=textToSearch, lang='en',
                           tweet_mode='extended', count=100)

    tweets_list = []
    for tweet_info in cursor.items(num_of_results):
        msg = {}
        if hasattr(tweet_info, 'retweeted_status'):
            # Describe the original tweet rather than the retweet wrapper.
            tweet = tweet_info.retweeted_status
            # Not every tweet has media attached; avoid a KeyError/IndexError.
            media = tweet.entities.get('media') or []
            msg['expanded_url'] = media[0]['expanded_url'] if media else None
        else:
            tweet = tweet_info

        msg.update({
            'created_at': tweet.created_at,
            'full_text': tweet.full_text,
            'name': tweet.user.name,
            'user_id': tweet.user.id,
            'tweet_id': tweet.id,
        })
        tweets_list.append(msg)

    return tweets_list
    

In [0]:
def get_tweets_by_textsearch(auth, textToSearch, num_of_results):
    """Run a single search request restricted to tweets with native video.

    Args:
        auth: tweepy auth handler holding the Twitter credentials.
        textToSearch: free-text query; ' filter:native_video' is appended.
        num_of_results: value passed to the search call as ``count``.

    Returns:
        Whatever ``api.search`` yields under the JSON parser -- presumably the
        decoded response payload with a 'statuses' entry (TODO confirm).
    """
    # API client that waits on rate limits (with notification), enables
    # compression and parses responses with tweepy's JSONParser.
    api = tweepy.API(
        auth,
        wait_on_rate_limit=True,
        wait_on_rate_limit_notify=True,
        compression=True,
        parser=tweepy.parsers.JSONParser(),
    )

    video_query = textToSearch + ' filter:native_video'
    return api.search(q=video_query, count=num_of_results)

In [0]:
def prepare_dict(tweet, real_fact):
    """Flatten a tweet object into a dict suitable for a DataFrame row.

    Args:
        tweet: object exposing ``entities`` (a dict), ``created_at``,
            ``full_text``, ``user.name``, ``user.id``, ``id`` and ``lang``.
        real_fact: truthy when the tweet was classified as a fact-check post
            by the caller. NOTE(review): the stored ``'real_fact'`` value is
            the negation of this flag -- this mirrors the existing behaviour;
            confirm the intended meaning.

    Returns:
        A dict with the keys ``expanded_url``, ``created_at``, ``full_text``,
        ``name``, ``user_id``, ``tweet_id``, ``real_fact`` and ``lang``.
        When ``real_fact`` is truthy, ``expanded_url`` is the first media
        entity's URL, or ``None`` if the tweet has no media; otherwise it is
        the tweet's list of URL entities.
    """
    is_fact_check = bool(real_fact)

    if is_fact_check:
        # Tweets without attached media have no 'media' entity at all, so
        # look it up defensively instead of raising KeyError/IndexError.
        media = tweet.entities.get('media') or []
        expanded_url = media[0]['expanded_url'] if media else None
    else:
        expanded_url = tweet.entities.get('urls', [])

    return {
        'expanded_url': expanded_url,
        'created_at': tweet.created_at,
        'full_text': tweet.full_text,
        'name': tweet.user.name,
        'user_id': tweet.user.id,
        'tweet_id': tweet.id,
        'real_fact': not is_fact_check,
        'lang': tweet.lang,
    }

In [0]:
def get_tweets_by_screen_name(auth, screen_name, num_of_results):
    """Download a user's timeline page by page and convert it to row dicts.

    Args:
        auth: tweepy auth handler passed to ``tweepy.API``.
        screen_name: Twitter handle whose timeline is fetched.
        num_of_results: forwarded as ``count`` on every timeline request, i.e.
            it acts as the requested page size, not as a cap on the total
            number of tweets returned (the original comment notes 200 is the
            maximum allowed count).

    Returns:
        A list with one ``prepare_dict`` result per downloaded tweet. English
        tweets whose text contains 'MyGovFactCheck' are passed to
        ``prepare_dict`` with ``True``; all others with ``False``. The list is
        empty when the timeline has no tweets.
    """
    api = tweepy.API(auth)

    # Holds every tweepy tweet downloaded so far.
    alltweets = []

    # Initial request for the most recent tweets.
    new_tweets = api.user_timeline(screen_name=screen_name, count=num_of_results,
                                   tweet_mode="extended")
    alltweets.extend(new_tweets)

    # Keep paging backwards until a request comes back empty. Checking the
    # last page before indexing also covers an empty first response, which
    # previously raised IndexError on alltweets[-1].
    while new_tweets:
        # Id of the oldest tweet so far, less one, so pages do not overlap.
        oldest = alltweets[-1].id - 1
        print(f"getting tweets before {oldest}")

        # All subsequent requests use max_id to prevent duplicates.
        new_tweets = api.user_timeline(screen_name=screen_name, count=num_of_results,
                                       max_id=oldest, tweet_mode="extended")
        alltweets.extend(new_tweets)

        print(f"...{len(alltweets)} tweets downloaded so far")

    # Turn each tweet into a flat dict, flagging the fact-check posts.
    return [
        prepare_dict(tweet, tweet.lang == 'en' and 'MyGovFactCheck' in tweet.full_text)
        for tweet in alltweets
    ]

In [0]:
def search_api(textToSearch, num_of_results, tw_type, datefrom, dateto):
    """Authenticate against Twitter and dispatch to the matching fetcher.

    Args:
        textToSearch: query text or screen name, forwarded unmodified.
        num_of_results: forwarded to the selected fetcher.
        tw_type: selects the fetcher by substring, checked in this order:
            'hashtag', 'screen_name', 'textsearch'. ``None`` or an
            unrecognised value selects nothing.
        datefrom: accepted for interface compatibility; currently unused.
        dateto: accepted for interface compatibility; currently unused.

    Returns:
        The selected fetcher's result, or an empty list when ``tw_type``
        matches none of the supported kinds.

    Raises:
        RuntimeError: if a supported ``tw_type`` is requested but any of the
            environment variables TWITTER_CONSUMER_KEY,
            TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN or
            TWITTER_ACCESS_SECRET is missing or empty.
    """
    # The query is forwarded as-is: request parameters are URL-encoded when
    # the HTTP request is built, so quoting here would double-encode
    # characters such as '#'.
    print(textToSearch)

    kind = tw_type or ''
    if not any(key in kind for key in ('hashtag', 'screen_name', 'textsearch')):
        return []

    # Credentials come from the environment so that secrets never live in the
    # notebook source or its saved outputs.
    credential_vars = (
        'TWITTER_CONSUMER_KEY',
        'TWITTER_CONSUMER_SECRET',
        'TWITTER_ACCESS_TOKEN',
        'TWITTER_ACCESS_SECRET',
    )
    missing = [name for name in credential_vars if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            'Missing Twitter credentials; set environment variable(s): '
            + ', '.join(missing)
        )

    # Set up tweepy to authenticate with the Twitter credentials.
    auth = tweepy.OAuthHandler(os.environ['TWITTER_CONSUMER_KEY'],
                               os.environ['TWITTER_CONSUMER_SECRET'])
    auth.set_access_token(os.environ['TWITTER_ACCESS_TOKEN'],
                          os.environ['TWITTER_ACCESS_SECRET'])

    if 'hashtag' in kind:
        return get_tweets_by_hashtag(auth, textToSearch, num_of_results)
    if 'screen_name' in kind:
        return get_tweets_by_screen_name(auth, textToSearch, num_of_results)
    return get_tweets_by_textsearch(auth, textToSearch, num_of_results)

In [0]:
# Value handed to search_api as num_of_results.
MAX_TWEETS = 5000

# Alternative queries kept for reference:
# tweets_list = search_api("#MyGovFactCheck", MAX_TWEETS, 'hashtag',  '2019-01-01', '2019-03-01')
# tweets_list = search_api("mygovindia", MAX_TWEETS, None,  '2019-01-01', '2019-03-01')

# Fetch the timeline of the 'mygovindia' account.
tweets_list = search_api(
    textToSearch="mygovindia",
    num_of_results=MAX_TWEETS,
    tw_type='screen_name',
    datefrom='2019-01-01',
    dateto='2019-03-01',
)


In [0]:
# Build the frame in one pass: newest tweets first, English only, rows with
# an empty expanded_url dropped, and a fresh 0..n-1 index.
df = (
    pd.DataFrame(tweets_list)
    .sort_values(by='created_at', ascending=False)
    .loc[lambda frame: frame.lang == 'en']
    .loc[lambda frame: frame.expanded_url.str.len() != 0]
    .reset_index(drop=True)
)

# Preview the first ten rows and report how many rows remain.
print(df.head(10))
print(len(df))

           created_at                                       expanded_url  \
0 2020-05-19 13:18:12  [{'url': 'https://t.co/voPQh6JXvn', 'expanded_...   
1 2020-05-19 10:53:55  [{'url': 'https://t.co/NtOxikeknW', 'expanded_...   
2 2020-05-19 09:15:33  [{'url': 'https://t.co/tYTfLBU5ka', 'expanded_...   
3 2020-05-19 09:07:16  [{'url': 'https://t.co/D5UL7YGQLx', 'expanded_...   
4 2020-05-19 09:01:14  [{'url': 'https://t.co/D5UL7YpfTZ', 'expanded_...   
5 2020-05-19 08:56:41  [{'url': 'https://t.co/D5UL7YGQLx', 'expanded_...   
6 2020-05-19 08:49:20  [{'url': 'https://t.co/D5UL7YGQLx', 'expanded_...   
7 2020-05-19 08:44:34  [{'url': 'https://t.co/voPQh6JXvn', 'expanded_...   
8 2020-05-19 08:40:47  [{'url': 'https://t.co/D5UL7YGQLx', 'expanded_...   
9 2020-05-19 08:32:27  [{'url': 'https://t.co/D5UL7YGQLx', 'expanded_...   

                                           full_text lang        name  \
0  Vande Bharat Mission: Operating Samudra Setu o...   en  MyGovIndia   
1  Delhi Police h

In [0]:
# Number of rows whose real_fact flag is False.
print(len(df.loc[df['real_fact'] == False]))
# Full text of the first row in the frame.
print(df['full_text'].iloc[0])

42
Vande Bharat Mission: Operating Samudra Setu on Forefront Evacuating Indians from Maldives. https://t.co/voPQh6JXvn #IndiaFightsCorona https://t.co/MEQ70kow7b


In [0]:
# Persist the frame to 'mygovfactcheck_data.csv' (relative path, so it lands
# in the current working directory).
df.to_csv('mygovfactcheck_data.csv')