In [1]:
import twitter
import urllib.request
import os
import config as cf

In [2]:
# Build the python-twitter client from the four credentials kept in the
# local `config` module (imported as `cf`), so no secret lives in the notebook.
api = twitter.Api(
    consumer_key=cf.credentials["consumer_key"],
    consumer_secret=cf.credentials["consumer_secret"],
    access_token_key=cf.credentials["access_token"],
    access_token_secret=cf.credentials["access_token_secret"],
)

In [3]:
# Search parameters. Every value is kept as a string because it is substituted
# verbatim into the raw query string built in the next cell.
hashtag = "rickyrenuncia"               # hashtag text to search for
result_type = "mixed"                   # one of: mixed, recent, popular
include_entities = "true"
with_twitter_user_id = "true"           # ask for user information as well
since, until = "2019-07-20", "2019-07-23"   # start / end date of the search window
count = "100"                           # number of tweets to return per page

In [4]:
# Assemble the raw query string as '&'-separated key=value pairs, in the
# order the search cell expects them.
query_params = [
    ('q', hashtag),
    ('result_type', result_type),
    ('include_entities', include_entities),
    ('with_twitter_user_id', with_twitter_user_id),
    ('since', since),
    ('until', until),
    ('count', count),
]

query = '&'.join('{}={}'.format(key, value) for key, value in query_params)

In [5]:
# Display the assembled query string for a visual check before any request is sent.
query

'q=rickyrenuncia&result_type=mixed&include_entities=true&with_twitter_user_id=true&since=2019-07-20&until=2019-07-23&count=100'

To use max_id correctly, an application’s first request to a timeline endpoint should only specify a count. When processing this and subsequent responses, keep track of the lowest ID received. This ID should be passed as the value of the max_id parameter for the next request, which will only return Tweets with IDs lower than or equal to the value of the max_id parameter.

In [6]:
# Page backwards through the search results. Every request after the first
# carries a max_id so that only tweets older than the ones already collected
# are returned.
MAX_PAGES = 180  # upper bound on requests; presumably matches the search rate limit -- TODO confirm

all_results = []
max_id = None  # lowest tweet ID received so far (None until the first page arrives)

for _ in range(MAX_PAGES):

    # Derive each page's query from the untouched base `query`, so re-running
    # this cell never stacks several max_id parameters onto it.
    if max_id is None:  # first call
        page_query = query
    else:
        # max_id is inclusive ("lower than or equal"), so request max_id - 1;
        # otherwise the oldest tweet already collected is returned again.
        page_query = '{query}&max_id={max_id}'.format(query=query, max_id=max_id - 1)

    results = api.GetSearch(raw_query=page_query)

    if not results:
        # Search exhausted: stop here instead of calling min() on an empty
        # page (which raises ValueError) or spending further requests.
        break

    all_results.extend(results)
    max_id = min(result.id for result in results)

In [15]:
# Report the lowest tweet ID reached by the search loop.
print("last max_id =",max_id)

last max_id = 1152379193541431296


In [12]:
# Total number of tweets gathered in all_results by the search loop.
len(all_results)

17500

In [13]:
# Directory (relative to the notebook) where downloaded tweet media is stored.
folder_name = "downloaded_media"

In [14]:
# Download every media attachment of the collected tweets into `folder_name`,
# saving each file as <media id><extension of the media URL>.

# Create the target folder on a first run; os.listdir would fail without it.
os.makedirs(folder_name, exist_ok=True)

# IDs of media already on disk (file name without its extension). A set keeps
# the "already downloaded?" check cheap, and splitext also copes with file
# names that contain no dot at all.
downloaded_img_ids = {os.path.splitext(file)[0] for file in os.listdir(folder_name)}

for tweet in all_results:

    # tweet.media is None (or empty) for tweets without attachments.
    for media in tweet.media or []:

        media_id = str(media.id)

        if media_id in downloaded_img_ids:  # don't re-download images
            continue

        media_url = media.media_url
        file_type = os.path.splitext(media_url)[1]

        urllib.request.urlretrieve(media_url, os.path.join(folder_name, media_id + file_type))

        # Remember the ID so the same media attached to another tweet is skipped.
        downloaded_img_ids.add(media_id)

The `Status` class represents the Status structure used by the twitter API. It has the following attributes (all are None by default):
* contributors
* coordinates
* created_at: Date the tweet was published. e.g. 'Mon Jul 22 17:17:39 +0000 2019'
* current_user_retweet
* favorite_count: # of times the tweet has been liked (heart button).
* favorited: Boolean
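* full_text: Full, untruncated text of the post (appears to be populated only when tweets are requested in extended mode).
* geo
* hashtags: List of hashtags the post has. Each element is an object of the class `Hashtag`.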
* id: ID which identifies tweet e.g. 1153353426291572736
* id_str: id as a string. e.g. '1153353426291572736'
* in_reply_to_screen_name
* in_reply_to_status_id
* in_reply_to_user_id
* lang: language of post. e.g. 'en'
* location
* media: List of objects of class `Media`, containing information of media present in the tweet, with URL to access the media. e.g. `[Media(ID=1153108337967468544, Type=video, DisplayURL='pic.twitter.com/IsRTsXfDs9')]`
* place
* possibly_sensitive: Boolean. e.g. False
* quoted_status: Twitter status shared by poster (retweet), if one was shared. It is an object of the class `Status`.
* quoted_status_id: id which identifies the shared post
* quoted_status_id_str: id of shared post as a string
* retweet_count: Number of times post was retweeted
* retweeted: Boolean
* retweeted_status
* scopes
* source: Device/source where post was shared. e.g. `<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>`
* text: Text of the post, followed by a link to the shared tweet if the post has one. e.g. 'The people have spoken. #RickyRenuncia https://t.co/7UmyDL3Ry4'
* truncated: Boolean
* urls: list of urls mentioned in post, will usually be retweeted post. Each element is an object of the class `URL`. e.g. `[URL(URL=https://t.co/7UmyDL3Ry4, ExpandedURL=https://twitter.com/davidbegnaud/status/1153344412107399169)]`
* user: object of class `User`, has info of tweet author (user ID, screen name, etc)
* user_mentions: list
* withheld_copyright 
* withheld_in_countries
* withheld_scope

The `Hashtag` class represents a twitter hashtag. It has the following attributes:
* text: hashtag text (e.g. RickyRenuncia)

The `URL` class represents a URL contained in a tweet. It has the following attributes:
* expanded_url
* url

The `User` class represents the User structure. It has the following attributes:
* contributors_enabled
* created_at
* default_profile
* default_profile_image
* description
* email
* favourites_count
* followers_count
* following
* friends_count
* geo_enabled
* id
* id_str
* lang
* listed_count
* location
* name
* notifications
* profile attributes ...
* protected
* screen_name
* status
* statuses_count
* time_zone
* url
* utc_offset
* verified
* withheld_in_countries
* withheld_scope

The `Media` class represents the media component of a tweet. It has the following attributes:
* display_url
* expanded_url
* ext_alt_text
* id
* media_url
* media_url_https
* sizes
* type
* url 
* video_info