# Find Tweets about Manchester United

In [1]:
import json
import os
import time

import pandas as pd
import requests

# Collect recent tweets matching `search_term`, plus the profiles of their authors,
# from the Twitter API v2 Recent Search endpoint. The results end up in two
# DataFrames, `df_tweets` and `df_users`, each indexed by the tweet / user id.

# Read the API credential from the environment rather than committing it to the
# notebook. Any token that has ever been saved in a shared notebook should be
# treated as leaked and revoked.
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN', '')
if not bearer_token:
  raise RuntimeError('Set the TWITTER_BEARER_TOKEN environment variable before running this cell.')
headers = {'Authorization':('Bearer '+ bearer_token)}

n = 10000                             # The total number of tweets we want
max_results = 10                      # The number of tweets to pull per request; must be between 10 and 100
total_retrieved = 0                   # To keep track of when to stop
next_token = ""                       # Must be empty on first iteration
search_term = "manchester%20united"   # Already URL-encoded. To form an advanced query, see here: https://twitter.com/search-advanced?lang=en

# Column layout of the two result tables
tweet_columns = ['tweet_id', 'author_id', 'retweet_count', 'like_count', 'text', 'language', 'created_at', 'source', 'possibly_sensitive', 'image_url']
user_columns = ['user_id', 'username', 'created_at', 'description', 'profile_image_url', 'protected', 'verified', 'followers_count', 'following_count', 'tweet_count', 'listed_count']

# Everything in the URL except the pagination token is the same for every request,
# so build it once. These are the extra parameters we add to the querystring; we
# won't store them all though; just want you to see what's possible.
base_url = (
  f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}'
  '&expansions=geo.place_id,author_id,attachments.media_keys'
  '&tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld'
  '&media.fields=media_key,type,url&user.fields=created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld'
)

# Accumulate rows in dicts keyed by id: a repeated id overwrites the earlier row,
# and the DataFrames are built once at the end instead of being grown row by row.
tweet_rows = {}
user_rows = {}

try:
  # stop when we have n results
  while total_retrieved < n:

    # the first time through the loop, we do not need the next_token parameter
    url = base_url if next_token == "" else f'{base_url}&next_token={next_token}'

    # make the request to the Twitter API Recent Search endpoint
    response = requests.get(url, headers=headers, timeout=30)
    if not response.ok:
      # Rate-limit / auth / query errors would otherwise show up later as a confusing KeyError
      raise RuntimeError(f'Twitter API request failed ({response.status_code}): {response.text}')
    try:
      json_data = json.loads(response.text)
    except ValueError:
      print(response.text)  # show the raw body so the failure can be diagnosed
      raise

    tweets = json_data.get('data', [])        # absent when the page has no results
    includes = json_data.get('includes', {})  # expanded media / user objects for this page

    # Map media_key -> url for photos only; videos are ignored
    photo_urls = {
      media['media_key']: media.get('url', "")
      for media in includes.get('media', [])
      if media.get('type') == 'photo'
    }

    for tweet in tweets:
      metrics = tweet.get('public_metrics', {})

      # Only the first attached media item is considered; image_url stays empty
      # when there is no attachment or the first attachment is not a photo
      media_keys = tweet.get('attachments', {}).get('media_keys', [])
      image_url = photo_urls.get(media_keys[0], "") if media_keys else ""

      # Optional fields are read with .get() so one tweet lacking a field
      # does not abort the whole collection run
      tweet_rows[tweet['id']] = {
        'tweet_id': tweet['id'],
        'author_id': tweet.get('author_id'),
        'retweet_count': metrics.get('retweet_count'),         #label
        'like_count': metrics.get('like_count'),               #label
        'text': tweet.get('text'),                             #text
        'language': tweet.get('lang'),                         #categorical
        'created_at': tweet.get('created_at'),                 #categorical
        'source': tweet.get('source'),                         #categorical
        'possibly_sensitive': tweet.get('possibly_sensitive'), #categorical
        'image_url': image_url,                                #image
      }

    # get user info (done before the pagination check so the last page's users are kept)
    for user in includes.get('users', []):
      user_metrics = user.get('public_metrics', {})
      user_rows[user['id']] = {
        'user_id': user['id'],
        'username': user.get('username'),
        'created_at': user.get('created_at'),
        'description': user.get('description'),
        'profile_image_url': user.get('profile_image_url'),
        'protected': user.get('protected'),
        'verified': user.get('verified'),
        'followers_count': user_metrics.get('followers_count'),
        'following_count': user_metrics.get('following_count'),
        'tweet_count': user_metrics.get('tweet_count'),
        'listed_count': user_metrics.get('listed_count'),
      }

    # keep track of how many results have been obtained so far; count what was
    # actually returned, since a page may hold fewer than max_results tweets
    total_retrieved += len(tweets)

    # keep track of where to start next time, but quit if there are no more results
    next_token = json_data.get('meta', {}).get('next_token', "")
    if not tweets or next_token == "":
      break

    # sleep to avoid hitting the rate limit (skipped when no further request will be made)
    if total_retrieved < n:
      time.sleep(10)
finally:
  # Build the DataFrames even if the loop was interrupted or a request failed,
  # so whatever was collected up to that point is still available.
  df_tweets = pd.DataFrame(list(tweet_rows.values()), columns=tweet_columns, index=list(tweet_rows.keys()))
  df_users = pd.DataFrame(list(user_rows.values()), columns=user_columns, index=list(user_rows.keys()))

print('Got all the tweets!')

# Set df indexes. I'm not doing that here as it messes up the indexes when sending it to a sqlite database
# df_tweets.set_index('tweet_id', inplace=True)
# df_users.set_index('user_id', inplace=True)

# df_tweets.to_csv('tweets.csv')
# df_users.to_csv('users.csv')


Got all the tweets!


In [2]:
# Store the tweet and user DataFrames in a new sqlite3 database
import pyodbc
import sqlite3

# Persist both DataFrames to a local SQLite file, replacing any tables left over
# from a previous run. index=False keeps the DataFrame index out of the tables;
# the ids are already stored as the regular tweet_id / user_id columns.
conn_write = sqlite3.connect('twitter.db')
try:
  df_tweets.to_sql(name='tweets', con=conn_write, if_exists='replace', index=False)
  df_users.to_sql(name='users', con=conn_write, if_exists='replace', index=False)
finally:
  # Always release the database file handle, even if one of the writes fails
  conn_write.close()

print('Succesfully wrote to DB!')

Succesfully wrote to DB!
