In [None]:
import tweepy
import json
import requests
from nltk.corpus import stopwords # stopword examples, 'its', 'on', 'the', etc <---- will be helpful later
# most pythonistas will rename pandas as pd, numpy as np, and datetime as dt for short (you don't have to)
import pandas as pd
import numpy as np
import datetime as dt

In [None]:
# Location of the JSON file holding your API credentials
# (example.json shows the expected layout) -- point this at your own file.
credentials = "keys.json"

# Read the whole file and parse its JSON content into api_tokens.
with open(credentials, mode="r") as keys:
    api_tokens = json.loads(keys.read())

In [None]:
# Pull the five credentials out of the parsed JSON, one variable per key.
bearer_token, api_key, api_secret, access_token, access_secret = [
    api_tokens[field]
    for field in ("bearer_token", "api_key", "api_secret", "access_token", "access_secret")
]

In [None]:
# Collect the credentials under the argument names tweepy.Client expects,
# then build the client from them. return_type is set to requests.Response,
# and later cells call .json() on what the client returns.
client_options = {
    "bearer_token": bearer_token,
    "consumer_key": api_key,
    "consumer_secret": api_secret,
    "access_token": access_token,
    "access_token_secret": access_secret,
    "return_type": requests.Response,
}
client = tweepy.Client(**client_options)

Let's look up the 100 most recent tweets using #WPP
<br/>We're going to use the search_recent_tweets() function and 5 parameters, they are:
<br/><b>query: </b>The search query, <a href="https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query">learn more about building queries here</a>
<br/><b>max_results: </b>The maximum number of search results to be returned by a request. A number between 10 and 100. By default, a request response will return 10 results.
<br/><b>tweet_fields: </b><a href="https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/tweet">See all tweet fields here</a>
<br/><b>user_fields: </b><a href="https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/user">See all user fields here</a>
<br/><b>expansions: </b>This field will allow us to include the user_field values. <a href="https://docs.tweepy.org/en/latest/expansions_and_fields.html">Learn more about expansions here</a>

In [None]:
# Search for #WPP while ignoring retweets ("-is:retweet"), asking for up to
# 100 results together with the listed tweet and user fields.
search_query = "#WPP -is:retweet"
requested_tweet_fields = ['author_id', 'created_at', 'text', 'source', 'lang', 'geo']
requested_user_fields = ['name', 'username', 'location', 'verified']

tweets = client.search_recent_tweets(
    query=search_query,
    max_results=100,
    tweet_fields=requested_tweet_fields,
    user_fields=requested_user_fields,
    expansions='author_id',
)

In [None]:
# decode the JSON body of the response; later cells index the result by key
tweets_dict = tweets.json()

In [None]:
# display the whole decoded response
tweets_dict

In [None]:
# Split the response in two: 'data' (which leaves the metadata behind) and the
# user records stored under 'includes' -> 'users'.
tweets_data, tweets_users = tweets_dict['data'], tweets_dict['includes']['users']

In [None]:
# display the entries taken from the response's 'data' key
tweets_data

In [None]:
# display the entries taken from the response's 'includes' -> 'users' key
tweets_users

In [None]:
# Show how many tweets and how many user records came back, one count per line.
print(len(tweets_data), len(tweets_users), sep="\n")

In [None]:
# When the two counts differ, it tells us some users posted more than one #WPP tweet.
# Flatten each list of JSON records into its own pandas DataFrame.
df_data, df_users = (
    pd.json_normalize(records) for records in (tweets_data, tweets_users)
)

In [None]:
# display the tweets DataFrame
df_data

In [None]:
# display the users DataFrame
df_users

In [None]:
# The two DataFrames are linked by the author: the tweets frame has an
# 'author_id' column, while the users frame keeps the same value under 'id'.
# Rename the users column so both frames share the 'author_id' key for the merge.
df_users.rename({'id': 'author_id'}, axis='columns', inplace=True)
df_users

In [None]:
# With a shared 'author_id' column, join each tweet to its author's record.
df_merged = pd.merge(df_data, df_users, on='author_id')
df_merged

In [None]:
# the created_at time is a little difficult to read so let's try to fix that
# This first attempt is expected to fail: created_at still holds strings, and the
# .dt accessor only works on datetime-like values. The error is caught and printed
# so the failure is visible without stopping a full "Run All" of the notebook.
try:
    df_merged["created_at"] = df_merged["created_at"].dt.strftime('%d-%m-%Y')
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# The formatting attempt did not work,
# so let's check the type of the first "created_at" value.
first_created_at = df_merged["created_at"].iloc[0]
print(type(first_created_at))

In [None]:
# The values are strings, so parse them into datetimes first and then
# render them back as 'day-month-year hour:minute' text.
df_merged['created_at'] = (
    pd.to_datetime(df_merged['created_at'])
    .dt.strftime('%d-%m-%Y %H:%M')
)
df_merged

In [None]:
# better!
# I also don't care about the ids anymore, so let's get rid of those.
# DataFrame.drop returns a new DataFrame rather than changing df_merged, so the
# result is assigned back -- otherwise the id columns would still be present
# when the data is saved later on.
# errors='ignore' lets this cell be re-run after the columns are already gone.
df_merged = df_merged.drop(columns=['author_id', 'id'], errors='ignore')
df_merged

In [None]:
# Save the merged data in pickled format so it does not have to be fetched
# again if the machine crashes. The file name embeds the current date and time.

import pickle
timestamp = dt.datetime.now().strftime("%Y-%m-%d_%I-%M-%S_%p")
path = f"twitter{timestamp}.pkl"
df_merged.to_pickle(path)

In [None]:
# Compute a collection of all words from all tweets, leaving out stopwords,
# links and the search hashtag itself.
#
# stopwords.words() is evaluated once, up front, and stored in a set.
# Calling it inside the loop rebuilt the full stopword list for every single
# word, which is what made this cell take a minute or two to run.
# NOTE: the stopword check is case-sensitive, exactly as before.

stop_words = set(stopwords.words())

words = []
for text in df_merged['text']:
    for word in text.split():
        if word in stop_words or 'http' in word or word == '#WPP':
            continue  # skip stopwords, links and the #WPP hashtag
        words.append(word)
words

In [None]:
# Count how often each word occurs and print the ten most frequent ones.

from collections import Counter

c = Counter(words)
top_ten = c.most_common(10)
print(top_ten)  # list of (word, count) pairs

In [None]:
from prettytable import PrettyTable

# Print the ten most frequent words as a two-column text table.
c = Counter(words)
pt = PrettyTable(field_names=['Word', 'Count'])
for word_and_count in c.most_common()[:10]:
    pt.add_row(word_and_count)

# Column alignment: words on the left, counts on the right.
pt.align['Word'] = 'l'
pt.align['Count'] = 'r'
print(pt)