In [1]:
import pandas as pd
import numpy as np
import tweepy

import os
import re
from tqdm import tqdm
from datetime import datetime
import time

# My module
import config

In [2]:
# Load the Twitter API credentials from the local `config` module so they are not hard-coded in this notebook
consumer_key = config.CONSUMER_KEY
consumer_secret = config.CONSUMER_SECRET

In [3]:
# Spanish search terms sent to the Twitter search endpoint, combined with the OR operator.
# NOTE(review): the multi-word terms are not quoted, so they may not be matched
# as exact phrases — confirm against the Twitter search operator rules.
query = ' OR '.join((
    'Relación tóxica',
    'Celos',
    'Chantaje',
    'Amenaza',
    'Controlador',
    'Violencia psicologica',
    'Infiel',
    'Gaslighting',
))

In [4]:
# Check access to the API
# The previous check tested `api.verify_credentials` without calling it; a bound
# method is always truthy, so "Access granted" was printed unconditionally.
# NOTE(review): verify_credentials presumably needs user-context auth, so with
# app-only auth a real request (the rate-limit status) is used instead — confirm.
try:
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth)
    api.rate_limit_status()
except tweepy.TweepError as error:
    print("Access denied :(")
    print(error)
else:
    print("Access granted :)")

Access granted :)


In [5]:
# Functions

def connect_to_twitter_OAuth2(consumer_key=consumer_key, consumer_secret=consumer_secret):
    """Build a tweepy API client authenticated with the application's credentials.

    Parameters
    ----------
    consumer_key : API key; defaults to the module-level ``consumer_key``
    consumer_secret : API secret; defaults to the module-level ``consumer_secret``

    Returns
    -------
    A ``tweepy.API`` object wrapping a ``tweepy.AppAuthHandler``.
    """
    return tweepy.API(tweepy.AppAuthHandler(consumer_key, consumer_secret))


def retrieve_tweets(api, since_id=None, max_id=None, count=10, q=None):
    """
    Run a single search request and return whatever ``api.search`` returns.

    The request is restricted to Spanish tweets (``lang='es'``), asks for a mix
    of popular and recent results (``result_type='mixed'``) and uses the
    extended tweet mode so the text is available as ``full_text``.

    Parameters
    ----------
    api : api connection (required)
    since_id : if given, it returns tweets with an ID greater than that (newer)
    max_id : if given, it returns tweets with an ID less or equal than that (older) (max. 7 days prior)
    count : number of tweets requested per call. Defaults to 10, the value this
        function has always used (the previous docstring mentioned 100, which
        did not match the code).
        NOTE(review): the search endpoint presumably caps this at 100 — confirm.
    q : search query string. When omitted, the module-level ``query`` is used.
    """
    # Resolve the query lazily so the global is only needed when no explicit q is passed
    search_query = query if q is None else q
    return api.search(q=search_query,
                      lang='es',
                      result_type='mixed',
                      count=count,
                      since_id=since_id,
                      max_id=max_id,
                      tweet_mode='extended')


def extract_tweet_atributes(tweet_object):
    """Flatten an iterable of tweets into a DataFrame (one row per tweet, one column per attribute).

    Parameters
    ----------
    tweet_object : iterable of tweet objects exposing ``id``, ``full_text``,
        ``retweet_count``, ``favorite_count``, ``created_at``, ``source`` and a
        ``user`` with ``screen_name``, ``followers_count`` and ``location``.

    Returns
    -------
    Pandas DataFrame with a fixed column order; empty (but with the columns)
    when ``tweet_object`` yields nothing.
    """
    column_order = ['tweet_id',
                    'text',
                    'screen_name',
                    'followers',
                    'retweet_count',
                    'favorite_count',
                    'created_at',
                    'source',
                    'location']

    # One record per tweet; the keys mirror column_order
    records = [
        {
            'tweet_id': status.id,                        # unique tweet identifier
            'text': status.full_text,                     # text of the tweet
            'screen_name': status.user.screen_name,       # author's username
            'followers': status.user.followers_count,     # author's follower count
            'retweet_count': status.retweet_count,
            'favorite_count': status.favorite_count,
            'created_at': status.created_at,              # creation time as reported by the API
            'source': status.source,                      # utility used to post the tweet
            'location': status.user.location,             # location taken from the user's profile
        }
        for status in tweet_object
    ]
    return pd.DataFrame(records, columns=column_order)



def first_cleaning(df):
    """Return a copy of ``df`` with one row per ``tweet_id``, ordered by ``tweet_id``.

    Duplicates are dropped first (the first occurrence is kept), then the rows
    are sorted in ascending order; the index is renumbered from 0 in both steps.

    Parameters
    ----------
    df : Pandas DataFrame to clean.
    """
    return (
        df.drop_duplicates(subset=['tweet_id'], ignore_index=True)
          .sort_values(by='tweet_id', ignore_index=True)
    )



# Main functions
def main_retrieval(file_path, last_id=None):
    """
    Main retrieval function.
    It makes 450 search requests, each asking only for tweets newer than the
    newest one seen so far, and appends the de-duplicated result to a csv.

    Returns
    -------
    + Last (greatest) tweet id seen; the ``last_id`` passed in is returned
      unchanged when no tweet was retrieved.
    + Number of rows appended to the csv (0 when nothing was retrieved).

    Parameters
    ----------
    file_path : file where the DataFrame will be stored (append mode, no header row)
    last_id : if given, it retrieves only tweets with a greater ID (newer)
    """
    # Set a connection to the api
    api = connect_to_twitter_OAuth2()
    # NOTE(review): 450 presumably matches the per-window request limit of the
    # search endpoint for app-only auth — confirm before changing.
    number_of_requests = 450
    dfs = []
    # Main loop
    for _ in tqdm(range(number_of_requests)):
        violetta_tweets = retrieve_tweets(api, since_id=last_id)
        df = extract_tweet_atributes(violetta_tweets)
        if df.empty:
            # The max of an empty column is NaN; keep the previous last_id
            # instead of sending NaN as since_id on the next request.
            continue
        # Set a new last_id. Next iteration starts taking tweets from it on
        last_id = int(df['tweet_id'].max())
        dfs.append(df)

    if not dfs:
        # Nothing new was retrieved: leave the csv untouched
        return last_id, 0

    df = first_cleaning(pd.concat(dfs, ignore_index=True))
    last_id = int(df['tweet_id'].max())
    # Saves df to a csv in the file_path, ignoring index, appending it, and not writing column names each time
    df.to_csv(file_path, sep=',', index=False, mode='a', header=False)

    return last_id, len(df)



def long_term_retrieval(file_path, iterations=25, last_id=None):
    """
    Keep retrieving tweets over a long period by calling main_retrieval
    repeatedly, sleeping 18 minutes between calls.

    main_retrieval is called ``iterations + 1`` times (laps 0..iterations);
    there is no sleep after the last call. The total duration therefore
    depends on how long each retrieval takes plus 18 minutes per remaining lap.

    A failing retrieval is reported and the loop goes on with the next lap,
    reusing the last id known so far.

    Parameters
    ----------
    file_path : file where the DataFrame will be stored (append mode).
    iterations : number of additional main_retrieval calls after the first one.
    last_id : if given, it retrieves only tweets with a greater ID (newer).
    """
    lap = 0
    while lap <= iterations:
        try:
            # Set the next last_id and the length of the DataFrame that was just added to the csv
            last_id, length = main_retrieval(file_path=file_path, last_id=last_id)
            print(f'{length} new rows added to the csv.')
        except Exception as error:
            # Best effort: report the cause and keep going. Only Exception is
            # caught so that KeyboardInterrupt / SystemExit can still stop the run.
            print(f'Error! {error!r}')
        # Advance the counter and leave before sleeping if this was the last lap
        lap += 1
        if lap > iterations:
            break
        print(f'{(iterations + 1) - lap} laps to go.')
        # Time info
        current_time = datetime.now().strftime("%H:%M:%S")
        print(f'Getting some sleep @ {current_time}...')
        # Getting some sleep until the next main retrieval
        time.sleep(18 * 60)
        print('*' * 50)
    print('Done :D\nEnjoy it!')

In [6]:
# csv file the retrieval appends to and the analysis below reads from.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = 'C:/Users/Javi/Omdena/violetta/violetta_tweets.csv'

In [None]:
# Start the long-running collection with no starting tweet id; rows are appended to file_path
long_term_retrieval(file_path, iterations=25, last_id=None)

100%|████████████████████████████████████████████████████████████████████████████████| 450/450 [04:02<00:00,  1.86it/s]


10 new rows added to the csv.
25 laps to go.
Getting some sleep @ 17:51:30...
**************************************************


100%|████████████████████████████████████████████████████████████████████████████████| 450/450 [04:19<00:00,  1.74it/s]


10 new rows added to the csv.
24 laps to go.
Getting some sleep @ 18:13:51...


### First look at the data

In [7]:
# Column names, in the order the rows were appended to the csv
columns = [
    'tweet_id', 'text', 'screen_name',
    'followers', 'retweet_count', 'favorite_count',
    'created_at', 'source', 'location',
]

# The csv was written without a header row, so the names are supplied explicitly
tweets = pd.read_csv(file_path, header=None, names=columns)

In [9]:
# Cleaning data functions

def first_cleaning(df):
    """Drop rows with a repeated ``tweet_id`` and return the rest sorted by ``tweet_id``.

    The first occurrence of each ``tweet_id`` is kept and the resulting index
    is renumbered from 0. The input DataFrame is not modified.

    Parameters
    ----------
    df : Pandas DataFrame to clean
    """
    key = 'tweet_id'
    unique_rows = df.drop_duplicates(subset=[key], ignore_index=True)
    return unique_rows.sort_values(by=key, ignore_index=True)


def re_cleaning_round(text):
    """Return ``text`` without '#' symbols, line breaks, @ mentions and urls.

    Only the '#' character is removed; the hashtag word itself is kept.
    Mentions are removed together with any spaces right after them, so a
    prefix such as 'RT @user: ' becomes 'RT : '. Finally, every run of
    whitespace is collapsed to a single space and the ends are stripped.

    Parameters
    ----------
    text : string to clean
    """
    # Raw strings keep regex escapes such as \S and \. from being read as
    # (invalid) string escape sequences.
    text = re.sub(r'#', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'@\w+ *', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # split() + join collapses all whitespace runs and trims both ends
    return ' '.join(text.split())

def re_cleaning(x):
    """Thin wrapper that applies re_cleaning_round to a single string."""
    return re_cleaning_round(x)

In [15]:
# Deduplicate and sort the tweets, then clean the text of every row
tweets = first_cleaning(tweets).assign(
    text=lambda frame: frame['text'].map(re_cleaning_round)
)

In [16]:
# Display the cleaned tweets (deduplicated, sorted by tweet_id, text cleaned)
tweets

Unnamed: 0,tweet_id,text,screen_name,followers,retweet_count,favorite_count,created_at,source,location
0,1374207793406873600,Porque? Infiel? o Relación toxica?,valeriaxmtwins,424,0,0,2021-03-23 03:54:04,Twitter for Android,JeanCarloLeon
1,1374339472406552578,"Celia tb llama cerdas a las mujeres infieles ,...",geminisvip,29,0,0,2021-03-23 12:37:18,Twitter Web App,
2,1374432831930175490,No te puedes perder este reportaje escrito por...,EscritorasU,158,3,8,2021-03-23 18:48:17,Twitter Web App,México
3,1374432985290698754,RT : No te puedes perder este reportaje escrit...,ixxchelabril,115,3,0,2021-03-23 18:48:54,Twitter Web App,Ciudad de México
4,1374436089671262214,RT : No te puedes perder este reportaje escrit...,Turola_fav,132,3,0,2021-03-23 19:01:14,Twitter for Android,
5,1374496391649579011,RT : No te puedes perder este reportaje escrit...,fernandagerald7,125,3,0,2021-03-23 23:00:51,Twitter Web App,
6,1374524668099448841,Las siguientes acciones son maneras de ejercer...,ISDEMU,27403,3,11,2021-03-24 00:53:12,Twitter for iPhone,El Salvador
7,1374526067956969472,RT : Las siguientes acciones son maneras de ej...,Yanira97530611,0,3,0,2021-03-24 00:58:46,Twitter for Android,
8,1374535194447802370,( la rata ): — vomita tajin. — conoce lo que e...,valenteineuu,1374,0,5,2021-03-24 01:35:02,Twitter for iPhone,𝑙𝑖𝑏𝑟𝑎 - 𝑖𝑛𝑓𝑝 - 2𝑤1
9,1374563967297806337,RT : Las siguientes acciones son maneras de ej...,anaelenabadilla,367,3,0,2021-03-24 03:29:22,Twitter for Android,"San Salvador, El Salvador"
