# **Using Tweepy to extract recent tweets from any account**



*   Note: you will need a Twitter developer account and the API keys and access tokens that come with it.



# **Libraries**

In [None]:
# global purpose libraries
from datetime import date, datetime, timedelta
import random
import re
import time
import pandas as pd
import numpy as np
from collections import Counter

# scraping libraries
import tweepy

# NLP
from textblob import TextBlob, Word
import pandas as pd
import spacy
from spacy.lang.en import English
from spacy.tokens import Doc, Span

# **Example: Scraping the HSE Twitter account**

In [None]:
# uploading the file that holds my Twitter dev keys
# NOTE: the next cell opens it as "tweepy.csv", so the uploaded file must carry that name
from google.colab import files
files.upload()

In [None]:
# reading the credentials file we just uploaded and turning it into a list of keys.
# Expected layout: one comma-separated line holding the four credentials in the order
# the Twitter class reads them (consumer key, consumer secret, access token,
# access token secret). Values may be wrapped in double quotes.
# If several non-blank lines are present, the last one wins.
keys = []
with open("tweepy.csv") as f:
    lines = f.readlines()
    for line in lines:
      # a blank (e.g. trailing) line must not overwrite the keys we already parsed
      if not line.strip():
        continue
      # strip() removes the newline that would otherwise stay glued to the last key,
      # as well as any spaces left around the commas
      keys = [l.replace('"', "").strip() for l in line.split(",")]

In [None]:
# where the magic happens

class Twitter:
  """Small wrapper around Tweepy that pulls the recent timeline of one account.

  Attributes:
    Handle: screen name of the account to read; handed unchanged to the API.
    Keys: sequence whose first four items are, in order, the consumer key,
      the consumer secret, the access token and the access token secret.
  """

  # number of tweets requested when the caller does not give an explicit limit
  DEFAULT_LIMIT = 700

  # attribute names exported for every tweet; also the dataframe column order
  COLUMNS = ["created", "text", "favorites", "retweets", "source"]

  def __init__(self, handle, keys):
    self.Handle = handle
    self.Keys = keys

  # adding in our credentials
  def getAccess(self):
    """Build an authenticated `tweepy.API` client from `self.Keys`.

    Returns:
      A `tweepy.API` instance created with `wait_on_rate_limit=True`.

    Raises:
      IndexError: if fewer than four credentials were supplied.
    """
    if len(self.Keys) < 4:
      raise IndexError(
          "expected 4 Twitter credentials (consumer key, consumer secret, "
          "access token, access token secret), got %d" % len(self.Keys)
      )
    # only the first four entries are used; anything after them is ignored
    consumer_key, consumer_secret, access_token, access_token_secret = self.Keys[:4]
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    return tweepy.API(auth, wait_on_rate_limit=True)

  # function to extract data from tweet object
  def getTweetAttributes(self, limit=None):
    """Fetch the account's timeline and keep a handful of fields per tweet.

    Args:
      limit: value passed to `Cursor.items()`; defaults to `DEFAULT_LIMIT`
        (700) when omitted.

    Returns:
      A list of dicts with the keys "created", "text", "favorites",
      "retweets" and "source", one dict per tweet yielded by the cursor.
    """
    if limit is None:
      limit = self.DEFAULT_LIMIT
    api = self.getAccess()
    # tweet_mode="extended" is what makes the `full_text` attribute available
    timeline = tweepy.Cursor(api.user_timeline,
                             screen_name=self.Handle,
                             tweet_mode="extended").items(limit)
    return [
        {"created": t.created_at,
         "text": t.full_text,
         "favorites": t.favorite_count,
         "retweets": t.retweet_count,
         "source": t.source}
        for t in timeline
    ]

  # create dataframe
  def getDataframe(self, limit=None):
    """Return the fetched tweets as a pandas DataFrame.

    Args:
      limit: forwarded to `getTweetAttributes`; `None` keeps the default cap.

    Returns:
      A DataFrame with the columns listed in `COLUMNS`, one row per tweet.
      The columns are present even when no tweet was returned.
    """
    tweet_data = self.getTweetAttributes(limit=limit)
    return pd.DataFrame(tweet_data, columns=list(self.COLUMNS))

# using the class we just defined to create a dataframe
# ("keys" is the credentials list built in the key-loading cell above;
#  the resulting "df" is the frame every later cell transforms)
twitter = Twitter("@hselive",keys)
df = twitter.getDataframe()

# making sure it worked: preview the first rows
df.head()

Unnamed: 0,created,text,favorites,retweets,source
0,2021-12-10 20:33:54,"@mosci68 Hi Monica, a full list of these can b...",1,0,Agorapulse app
1,2021-12-10 19:25:32,"@je61068 Hi there, can you drop us a DM with s...",0,0,Agorapulse app
2,2021-12-10 18:22:30,"@ShaneBarriscale Hi Shane, if any changes are ...",0,0,Agorapulse app
3,2021-12-10 18:04:02,"@LauraBFineArt Hi Laura, certs have not yet be...",1,0,Agorapulse app
4,2021-12-10 17:33:59,"@dalibryn Hi there, yes you will need to wait ...",0,0,Agorapulse app


In [None]:
# The string form of "created" is "<date> <time>", so a single space separates
# the two pieces we want:
#   - "day"  : the date part, cast to datetime
#   - "hour" : the time part, kept as a string for now
# The original "created" series is dropped once both pieces are extracted.
df = (
    df.assign(
        day=lambda frame: pd.to_datetime(
            frame["created"].apply(lambda stamp: str(stamp).split(" ")[0])
        ),
        hour=lambda frame: frame["created"].apply(
            lambda stamp: str(stamp).split(" ")[1]
        ),
    )
    .drop(columns=["created"])
)

In [None]:
# turn each date into its weekday name (ex: "Monday")
df["day_name"] = df["day"].dt.day_name()

# at a later stage we will group by hour, not by minute, so keep only the
# part of the time string that comes before the first ":"
df["hour"] = df["hour"].map(lambda clock: clock.split(":")[0])

# no tokenization / lemmatization step: the tweets are short, so we score the
# raw text directly with TextBlob's polarity
df["sentiment_score"] = df["text"].map(lambda tweet: TextBlob(tweet).sentiment.polarity)

# also creating categorical bins for sentiment
def getSentiment(serie, positive_above=0.15, negative_below=0):
  """Map a single polarity score to a categorical sentiment label.

  Args:
    serie: one numeric polarity score (the function is applied element-wise).
    positive_above: scores strictly greater than this are "Positive".
    negative_below: scores strictly lower than this are "Negative".

  Returns:
    "Positive", "Negative" or "Neutral". Scores equal to either threshold,
    and anything in between, are "Neutral".
  """
  if serie > positive_above:
    return "Positive"
  if serie < negative_below:
    return "Negative"
  return "Neutral"

# attach the categorical sentiment label to every row
df["sentiment_tag"] = df["sentiment_score"].map(getSentiment)

# the JavaScript dashboard needs a count() of the sentiment tags: each row
# receives the number of rows that share its tag
df["sent_count"] = df.groupby("sentiment_tag")["sentiment_tag"].transform("count")

# keep a copy of the dataset that can be exported to a csv file later
df_twitter = df.copy()

# this is what the dataframe looks like now
df.head()

Unnamed: 0,text,favorites,retweets,source,day,hour,day_name,sentiment_score,sentiment_tag,sent_count
0,"@mosci68 Hi Monica, a full list of these can b...",1,0,Agorapulse app,2021-12-10,20,Friday,0.35,Positive,318
1,"@je61068 Hi there, can you drop us a DM with s...",0,0,Agorapulse app,2021-12-10,19,Friday,0.5,Positive,318
2,"@ShaneBarriscale Hi Shane, if any changes are ...",0,0,Agorapulse app,2021-12-10,18,Friday,0.0,Neutral,318
3,"@LauraBFineArt Hi Laura, certs have not yet be...",1,0,Agorapulse app,2021-12-10,18,Friday,0.136364,Neutral,318
4,"@dalibryn Hi there, yes you will need to wait ...",0,0,Agorapulse app,2021-12-10,17,Friday,0.0,Neutral,318
