### Twitter extraction and first round of cleaning
This notebook retrieves tweets from the *Twitter API* using the `tweepy` library, applies a first round of cleaning to them (e.g. *dropping duplicates*, *sorting* by date, applying some *regex*) and stores them in a csv.

**Working on it...**

In [1]:
import pandas as pd
import numpy as np
import tweepy

import os
from tqdm import tqdm
from datetime import datetime
import time

In [2]:
# Credentials are read from environment variables so no secret is stored in the notebook.
# Each name is None when its variable is not set.
consumer_key = os.getenv('CONSUMER_KEY')
consumer_secret = os.getenv('CONSUMER_SECRET')

access_token = os.getenv('ACCESS_TOKEN')
access_token_secret = os.getenv('ACCESS_TOKEN_SECRET')

bearer_token = os.getenv('BEARER_TOKEN')

In [3]:
# Search string sent as `q` by retrieve_tweets: the plain, hashtag (#) and $-prefixed
# spellings of Bitcoin / BTC, joined with OR.
query = 'Bitcoin OR BTC OR #Bitcoin OR #BTC OR $Bitcoin OR $BTC'

In [4]:
# Functions

def connect_to_twitter_OAuth2(consumer_key=consumer_key, consumer_secret=consumer_secret):
    """Build a tweepy API client authenticated with tweepy.AppAuthHandler.

    Parameters
    ----------
    consumer_key : defaults to the notebook-level consumer_key
        (the default is bound when this cell is run)
    consumer_secret : defaults to the notebook-level consumer_secret
        (the default is bound when this cell is run)

    Returns
    -------
    tweepy.API object wrapping the authentication handler
    """
    app_auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    return tweepy.API(app_auth)


def retrieve_tweets(api, since_id=None, max_id=None, search_query=None, count=100):
    """
    It returns a twitter object with up to `count` tweets of a specific api response.
    
    Only English tweets are requested (lang='en'), most recent first
    (result_type='recent'), with tweet_mode='extended' so the text is not cut.
    
    Parameters
    ----------
    api : api connection (required)
    since_id : if given, it returns tweets with an ID greater than that (newer)
    max_id : if given, it returns tweets with an ID less or equal than that (older) (max. 7 days prior)
    search_query : search string to use; if None, the notebook-level `query` is used
    count : number of tweets requested per call (100 by default, the value used so far)
    """
    # Resolve the default lazily so callers passing their own query do not need the global
    if search_query is None:
        search_query = query
    return api.search(q=search_query,
                      lang='en',
                      result_type='recent',
                      count=count,
                      since_id=since_id,
                      max_id=max_id,
                      tweet_mode='extended')


def _full_tweet_text(tweet):
    """Return the text of a tweet, using the original tweet's text for retweets.

    Even in extended mode the text of a retweet comes back cut, so when the tweet
    carries a `retweeted_status` with its own `full_text` and `user`, the text is
    rebuilt as 'RT @<original screen_name>: <original full_text>'.
    Otherwise the tweet's own `full_text` is returned unchanged.
    """
    retweeted = getattr(tweet, 'retweeted_status', None)
    retweeted_text = getattr(retweeted, 'full_text', None)
    retweeted_user = getattr(retweeted, 'user', None)
    if retweeted_text is None or retweeted_user is None:
        # Not a retweet (or the embedded original is incomplete)
        return tweet.full_text
    return f'RT @{retweeted_user.screen_name}: {retweeted_text}'


def extract_tweet_atributes(tweet_object):
    """It returns a Pandas DataFrame with a tweet per row and its attributes per column.

    Parameters
    ----------
    tweet_object : iterable of tweets (e.g. what retrieve_tweets returns)

    Returns
    -------
    DataFrame with columns tweet_id, text, screen_name, followers, retweet_count,
    favorite_count, created_at, source, reply_to_status, reply_to_user.
    An empty input gives an empty DataFrame with those same columns.
    """
    columns = ['tweet_id',
               'text',
               'screen_name',
               'followers',
               'retweet_count',
               'favorite_count',
               'created_at',
               'source',
               'reply_to_status',
               'reply_to_user']

    # One dict per tweet; the DataFrame is built once at the end
    tweets_list = [{'tweet_id': tweet.id,   # Unique tweet identifier
                    'text': _full_tweet_text(tweet),   # String, text of the tweet
                    'screen_name': tweet.user.screen_name,   # String, username
                    'followers': tweet.user.followers_count,   # Number of followers
                    'retweet_count': tweet.retweet_count,   # Number of retweets
                    'favorite_count': tweet.favorite_count,   # Number of favorites
                    'created_at': tweet.created_at,   # UTC time tweet created
                    'source': tweet.source,   # Utility used to post the tweet
                    'reply_to_status': tweet.in_reply_to_status_id,   # If reply: original tweet's ID
                    'reply_to_user': tweet.in_reply_to_screen_name}   # If reply: original tweet's screenname
                   for tweet in tweet_object]

    # Passing the columns explicitly keeps the schema even when there are no tweets
    return pd.DataFrame(tweets_list, columns=columns)

**API rate limits:** Maximum of 450 requests per 15 minutes. Endpoint: Recent Search

In [61]:
# Main

# Set a connection to the api
api = connect_to_twitter_OAuth2()
# Set some required variables
number_of_requests = 450   # Requests sent per lap (one rate-limit window)
count = 0
laps = 2   # The loop below runs while count <= laps, i.e. laps + 1 windows in total
last_id = 1356599704956456960   # Only tweets with an ID greater than this are requested
dfs = []
# First loop: one lap per rate-limit window
while count <= laps:
    # Second loop: one request per iteration
    for i in tqdm(range(number_of_requests)):
        # A falsy last_id (None or 0) means "no starting point yet": since_id is left unset
        crypto_tweets = retrieve_tweets(api, since_id=last_id or None)
        df = extract_tweet_atributes(crypto_tweets)
        if df.empty:
            # Nothing new in this response: keep the previous last_id.
            # .max() of an empty column is NaN, which is truthy and would
            # otherwise be sent as since_id on the next request.
            continue
        # Set a new last_id. Next iteration starts taking tweets from it on
        last_id = int(df['tweet_id'].max())
        dfs.append(df)
            
    print(f'I\'ve got {len(dfs)} dataframes in my list so far.')
    # It releases the counter and break the loop if necessary
    # (breaking here skips the sleep after the last lap)
    count += 1
    if count > laps:
        break
    # Time info
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print(f'I\'m sleeping @ {current_time}...')
    
    # Script getting some sleep til next 450 requests window
    time.sleep(15 * 60)
    
df2 = pd.concat(dfs, ignore_index=True)
print('Done :D\nEnjoy it!')

100%|████████████████████████████████████████████████████████████████████████████████| 450/450 [04:01<00:00,  1.86it/s]


I've got 450 dataframes in my list so far.
I'm sleeping @ 15:32:11...


100%|████████████████████████████████████████████████████████████████████████████████| 450/450 [03:59<00:00,  1.88it/s]


I've got 900 dataframes in my list so far.
I'm sleeping @ 15:52:39...


100%|████████████████████████████████████████████████████████████████████████████████| 450/450 [04:01<00:00,  1.86it/s]


I've got 1350 dataframes in my list so far.
Done :D
Enjoy it!


In [62]:
# Weaknesses:
# 1. Add an if statement (or similar) so the tweet retrieval function can take a since_id parameter:
#     This will allow getting tweets starting from the last time the function was executed.
# 2. Data should be stored in a csv (or several csvs) instead of a pandas df:
#     We can achieve this with "df.to_csv()" or by writing tweets directly to a csv using "with open(.csv, a+)"
# 3. First round of cleaning:
#     The function gathers up to 45,000 tweets per 15 min (lap).
#     We noticed that most of them are duplicates,
#     which means there are periods when there are not 45,000 new bitcoin tweets per 15 mins (not even close),
#     so we end up with a ton of duplicate, useless tweets.
#     Create a function that removes them, sorts them by date ("created_at")
#     and applies some "re" to them to remove links, #, etc. (even before storing them in a csv file)


I'm afraid the `since_id` parameter of the `api.search()` function doesn't work quite as expected :(. It seems to retrieve only tweets that are at most **one hour old**.

Therefore, there will always be a period of time where data is missing (between each time I run the *main* cell) unless the script runs continuously (for 10/14 days or so) :(((.

### Truncated tweets
Texts over 140 characters are truncated. There could be a solution: adding the `tweet_mode='extended'` parameter when calling `api.search()` inside my `retrieve_tweets` function. <br>
Let's see it in action!

AND IT WORKS!!! We got the full text of the tweet! Take that Twitter!
It doesn't work for retweets though.

In [69]:
def first_cleaning(df):
    """Return a de-duplicated copy of `df`, ordered by tweet ID.

    Rows sharing a 'tweet_id' are collapsed to the first one found, the remaining
    rows are sorted by 'tweet_id' and the index is renumbered from 0.
    The input DataFrame is not modified.

    Parameters
    ----------
    df : Pandas DataFrame to clean (must have a 'tweet_id' column)
    """
    return (
        df
        .drop_duplicates(subset=['tweet_id'], ignore_index=True)
        .sort_values(by='tweet_id', ignore_index=True)
    )

In [None]:
# First day test
# df = df.sort_values(by='tweet_id', ignore_index=True).drop_duplicates(subset=['tweet_id'], ignore_index=True)

In [87]:
# Path where the set of tweets will be stored to play with them.
# Set the TWEETS_FILE_PATH environment variable to point somewhere else (e.g. on
# another machine); when it is not set, the original location is used.
file_path = os.environ.get('TWEETS_FILE_PATH', 'C:/Users/Javi/00_raw_data/data_tfm/tweet_set.csv')

In [28]:
# When reading from a csv file, specify date format columns you want.
# first_data = pd.read_csv(first_file_path, parse_dates=['created_at'])
# first_df = first_data.copy()

In [68]:
# First df & Second df were concatenated in *new_df*
# NOTE(review): `new_df` is not created in any cell visible in this notebook, so this
# cell fails on a fresh kernel (Restart & Run All) — TODO add the cell that builds it.
print(new_df.shape)
new_df.head(3)

(4654, 10)


Unnamed: 0,tweet_id,text,screen_name,followers,retweet_count,favorite_count,created_at,source,reply_to_status,reply_to_user
0,1356375993620049925,RT @LesangT: Elon Musk just got asked about Bi...,chocboipeter,503,19057,0,2021-02-01 22:56:51,Twitter for Android,,
1,1356375994932867072,"RT @luizMilfont: ""My husband used to worry abo...",MoreKoolaidPlz,116,6,0,2021-02-01 22:56:52,Twitter for Android,,
2,1356376000129425409,RT @genesimmons: I’m not recommending any of t...,ShitCinc,48,2731,0,2021-02-01 22:56:53,Twitter for iPhone,,


In [71]:
# Third df: every response collected by the main cell, concatenated (not cleaned yet)
print(df2.shape)
df2.head(3)

(33204, 10)


Unnamed: 0,tweet_id,text,screen_name,followers,retweet_count,favorite_count,created_at,source,reply_to_status,reply_to_user
0,1356610350989336581,RT @CryptoGuardian4: GIVEAWAY time!\nTo celebr...,mathgril,1977,7,0,2021-02-02 14:28:06,_math_bot_,,
1,1356610350796443655,RT @latokens: Join UNIS airdrop to get free cr...,Haseebdogar61,4,2990,0,2021-02-02 14:28:06,Twitter for Android,,
2,1356610350322487301,@CardanoRmy As we begin to see limitations of ...,dan_jackson,150,0,0,2021-02-02 14:28:06,Twitter Web App,1.35659e+18,CardanoRmy


In [70]:
# Drop duplicated tweets from the raw collection and sort it by tweet_id
third_df = df2.pipe(first_cleaning)

In [72]:
# Third df cleaned: df2 after first_cleaning (duplicates dropped, sorted by tweet_id)
print(third_df.shape)
third_df.head(3)

(2745, 10)


Unnamed: 0,tweet_id,text,screen_name,followers,retweet_count,favorite_count,created_at,source,reply_to_status,reply_to_user
0,1356610227383246850,RT @latokens: Join UNIS airdrop to get free cr...,MdShadd24001234,2,2979,0,2021-02-02 14:27:37,Twitter Web App,,
1,1356610229597790208,Bitcoin Volatility Declines to 1-Month Low; Wh...,crypto_news_19,6222,1,0,2021-02-02 14:27:37,,,
2,1356610234291265536,RT @crypto_news_19: Bitcoin Volatility Decline...,Bitcoin_RT,1654,1,0,2021-02-02 14:27:39,,,


In [75]:
# Merge the earlier data with the newly cleaned batch, then clean the result again so a
# tweet present in both sets is kept only once and the rows stay sorted by tweet_id
tweets_data = first_cleaning(pd.concat([new_df, third_df], ignore_index=True))

In [80]:
# Timestamp of the newest tweet in the previously collected data (new_df)
new_df['created_at'].max()

Timestamp('2021-02-02 13:45:48')

In [81]:
# It should have started grabing tweets from 13:45, not 14:27.
# At least started taking tweets 1 hour before I ran it (15:28)
# NOTE(review): created_at is stored as UTC (see extract_tweet_atributes) while the run
# time above was read from the local clock. If the local timezone is UTC+1, 14:27 UTC is
# the moment the cell was run, not one hour earlier — TODO confirm.
third_df['created_at'].min()

Timestamp('2021-02-02 14:27:37')

In [88]:
# Write the merged tweets to the csv at file_path (comma-separated, index column left out)
tweets_data.to_csv(path_or_buf=file_path, sep=',', index=False)