# Concatenate tweets

In this notebook, we will consolidate all the tweet files into a single compressed pickle file for further analysis. We have three main sets of data that we need to store: 

1. Data from January 2021.
2. Data from October 2021.
3. Data from April 28 to June 30.

Each of these samples corresponds to a specific moment relevant for our analysis. The October data is used for analyzing our community during election periods, specifically the regional elections in Colombia that took place in October 2019. The data from January 2021 represents the period three months before the "Paro Nacional," allowing us to track our community before the social outbreak. Finally, we have the data from the time of the "Paro Nacional," which will be the focal point of our analysis.

In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import concurrent.futures
from time import perf_counter
import warnings

In [2]:
path = r"/mnt/disk2/Data"
pd.set_option("display.max_columns", None)

In [3]:
def process_file(file:str):
    # Dtypes for IDs
    tipos = {
        'Author ID': 'float64',
        'Referenced Tweet Author ID': 'float64',
        'ID': 'float64',
        'Referenced Tweet':'float64'
    }
    try:
        df = pd.read_csv(file, dtype=tipos)
        if df.empty:
            return None, file
        return df, None
    except Exception as e:
        print(f"Error processing file {file}: {str(e)}")
        return None, file

In [11]:
def main(files:list[str],savefile:str):
    tweets_aux = []
    empties = []

    # Use ProcessPoolExecutor for I/O bound operations
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Submit all file processing tasks
        futures = [executor.submit(process_file, file) for file in files]

        # Process results as they complete
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(files)):
            df, empty_file = future.result()
            if df is not None:
                tweets_aux.append(df)
            if empty_file:
                empties.append(empty_file)

    tic = perf_counter()
    # Concatenate all dataframes
    tweets = pd.concat(tweets_aux, ignore_index=True)
    del tweets_aux
    toc = perf_counter()
    
    print(f"Finished concatenating in {(toc-tic)//60:,.0f} minutes with {(toc-tic)%60:,.2f} seconds")

    # Sort and reset index
    tic = perf_counter()
    tweets = tweets.sort_values('ID').reset_index(drop=True)
    toc = perf_counter()
    
    print(f"Finished sorting in {(toc-tic)//60:,.0f} minutes with {(toc-tic)%60:,.2f} seconds")
    
    # Store results
    tic = perf_counter()
    tweets.to_pickle(os.path.join(path, f"Tweets_DataFrames/{savefile}.gzip"), compression="gzip")
    toc = perf_counter()
    
    print(f"Finished saving in {(toc-tic)//60:,.0f} minutes with {(toc-tic)%60:,.2f} seconds")
    print("")
    print(f"Processed {len(files):,} files.")
    print(f"Found {len(empties):,} empty files.")
    print(f"Total tweets: {len(tweets):,}")
    
    return tweets

if __name__ == "__main__":
    tic_total = perf_counter()
    tic = perf_counter()
    print("Regional elections: October 2019")
    files_oct = glob(os.path.join(path, 'RawData', 'users_oct_19/*.csv'))
    tweets_oct19 = main(files_oct,"tweets_oct19")
    toc = perf_counter()
    time = toc - tic
    
    print(f"Finish whole cell in {time//60} minutes and {time%60:,.0f} secs.")
    
    print("\n"+"*"*100+"\n")
    
    tic = perf_counter()
    print("Before Paro Nacional: January 2021")
    files_jan = glob(os.path.join(path, "RawData", "users_jan/*.csv"))
    tweets_jan21 = main(files_jan,"tweets_jan21")
    toc = perf_counter()
    time = toc - tic
    
    print(f"Finish whole cell in {time//60} minutes and {time%60:,.0f} secs.")
    
    print("\n"+"*"*100+"\n")
    
    tic = perf_counter()
    pd.concat([tweets_jan21,tweets_oct19],ignore_index=True).to_pickle(os.path.join(path,"Tweets_DataFrames","tweets_presample.gzip"), compression = "gzip")
    toc=perf_counter()
    
    time = toc-tic
    print(f"Finish saving presample in {time//60} minutes and {time%60:,.0f} secs.")
    time = toc-tic_total
    print(f"Everything took {time//60} minutes and {time%60:,.0f} secs.")

Regional elections: October 2019


100%|██████████| 25125/25125 [00:25<00:00, 983.58it/s] 


Finished concatenating in 0 minutes with 15.13 seconds
Finished sorting in 0 minutes with 16.62 seconds
Finished saving in 8 minutes with 49.07 seconds

Processed 25,125 files.
Found 11 empty files.
Total tweets: 5,424,132
Finish whole cell in 9.0 minutes and 59 secs.

****************************************************************************************************

Before Paro Nacional: January 2021


100%|██████████| 34048/34048 [00:36<00:00, 942.21it/s] 


Finished concatenating in 0 minutes with 22.30 seconds
Finished sorting in 0 minutes with 20.74 seconds
Finished saving in 9 minutes with 37.54 seconds

Processed 34,048 files.
Found 5 empty files.
Total tweets: 5,893,802
Finish whole cell in 11.0 minutes and 10 secs.

****************************************************************************************************

Finish presample in 18.0 minutes and 34 secs.
Everything took 39.0 minutes and 42 secs.


## Paro Nacional: April 28 - June 30 2021

In [1]:
def unique_to_string(x):
    unique_values = x.unique()
    return ', '.join(map(str, unique_values))

def process_file(file):
    tipos = {
        'Author ID': 'float64',
        'Referenced Tweet Author ID': 'float64',
        'ID': 'float64',
        'Referenced Tweet':'float64'
    }
    try:
        with warnings.catch_warnings():
            # Ignore nanmean error
            warnings.simplefilter("ignore", category=RuntimeWarning)
            
            # Ignore Dtype errors, check in any case
            #warnings.simplefilter("ignore", category=pd.errors.DtypeWarning)
            
            df = pd.read_csv(file,low_memory=False, dtype=tipos)
            if df.empty:
                return None, file, None
            
            # Fix some datatypes
            df[['Author Followers', 'Author Following', 'Author Tweets']] = df[['Author Followers', 'Author Following', 'Author Tweets']].map(lambda x: pd.to_numeric(x, errors='coerce'))
            
            # Save user information
            user_information = df.groupby(['Author ID', 'Author Name']).agg({
                'Author Location': unique_to_string,
                'Author Description': unique_to_string,
                'Author Followers': lambda x: np.nanmean(x),
                'Author Following': lambda x: np.nanmean(x),
                'Author Tweets': lambda x: np.nanmax(x),
                'Author Verified': unique_to_string})
            
            return df, None, user_information
    except (ValueError, KeyError) as e:
        return None, None, file


In [5]:
def main(files):
    df_list = []
    users_information = []
    empties = []
    problems = []

    count = 0  # Amount of Tweets
    n = 0  # Number of Checkpoint
    tic = perf_counter()

    with concurrent.futures.ProcessPoolExecutor() as executor:
        future_to_file = {executor.submit(process_file, file):file for file in files}
        
        for future in tqdm(concurrent.futures.as_completed(future_to_file), total=len(files)):
            
            # Process the future task
            file = future_to_file[future]
            df, empty, user_info = future.result()
            
            # SAve info of empty, corrputed and correct files
            if df is not None:
                df_list.append(df)
                count += len(df)
                users_information.append(user_info)
            elif empty:
                empties.append(empty)
            else:
                problems.append(file)
            
            # If we reach or exceed 7 million rows, save the file and reset
            if count >= 7_000_000:
                n += 1
                concat_df = pd.concat(df_list)
                output_filename = f"tweets_paro_{n}.gzip"
                concat_df.to_pickle(os.path.join(path, f"Tweets_DataFrames/{output_filename}"), compression='gzip')
                toc = perf_counter()
                tqdm.write(f"Finished chunk {n} in {(toc-tic)/60:,.0f} minutes with {(toc-tic)%60} seconds")
                # Reset counter,list and tic for performance
                count = 0
                df_list = []
                tic = perf_counter()

    # Process any remaining data
    if len(df_list)>0:
        print(f"Processing remaining data")
        n += 1
        concat_df = pd.concat(df_list)
        output_filename = f"tweets_paro_{n}.gzip"
        concat_df.to_pickle(os.path.join(path, f"Tweets_DataFrames/{output_filename}"), compression='gzip')
        toc = perf_counter()
        print(f"Finished chunk {n} in {(toc-tic)/60:,.0f} minutes with {(toc-tic)%60} seconds")
        
    del df_list
    print("Finished processing Tweets. Now process users info")
    # Combine all user information
    tic = perf_counter()
    all_users_info = pd.concat(users_information)
    
    with warnings.catch_warnings():
        # Ignore nanmean error
        warnings.simplefilter("ignore", category=RuntimeWarning)
        all_users_info = (
            all_users_info.groupby(['Author ID', 'Author Name'])
            .agg({'Author Location': unique_to_string,
                'Author Description': unique_to_string,
                'Author Followers': lambda x: np.nanmean(x),
                'Author Following': lambda x: np.nanmean(x),
                'Author Tweets': lambda x: np.nanmax(x),
                'Author Verified': unique_to_string})
            .to_pickle(os.path.join(path, "Tweets_DataFrames/users_information.gzip"), 
                                    compression = 'gzip')
        )
    toc = perf_counter()
    time = toc-tic
    print(f"Finished saving users_information in {time//60:,.0f} minutes with {round(time%60,2):,.2f} seconds")
    print("")
    print(f"Processed {len(files)} files.")
    print(f"Found {len(empties)} empty files.")
    print(f"Encountered problems with {len(problems)} files.")
    print(f"Created {n} tweet files.")
    print(f"Total users processed: {len(users_information)}")

if __name__ == "__main__":
    tic = perf_counter()
    files_v1 = glob(os.path.join(path, 'RawData/Usuarios_V1/*.csv'))
    main(files_v1)
    toc = perf_counter()
    time = toc - tic
    
    print(f"Finish whole cell in {time//60} minutes and {round(time%60,2):,.0f} secs.")

 16%|█▌        | 5792/37324 [07:14<38:18:14,  4.37s/it]

Finished chunk 1 in 7 minutes with 15.519305424764752 seconds


 31%|███       | 11509/37324 [13:41<50:30,  8.52it/s]  

Finished chunk 2 in 6 minutes with 27.193555446341634 seconds


 47%|████▋     | 17392/37324 [20:06<28:47, 11.54it/s]

Finished chunk 3 in 6 minutes with 25.350479869171977 seconds


 62%|██████▏   | 23019/37324 [26:33<18:45, 12.71it/s]

Finished chunk 4 in 6 minutes with 27.075258273631334 seconds


 77%|███████▋  | 28816/37324 [32:57<10:27, 13.56it/s]

Finished chunk 5 in 6 minutes with 23.962212254293263 seconds


100%|██████████| 37324/37324 [39:27<00:00, 15.76it/s]


Finished chunk 6 in 6 minutes with 29.920094214379787 seconds
Processing remaining data
Finished chunk 7 in 3 minutes with 5.160846663638949 seconds
Finished processing Tweets. Now process users info
Finished saving users_information in 21 minutes with 20.4131678044796 seconds

Processed 37324 files.
Found 17 empty files.
Encountered problems with 0 files.
Created 7 tweet files.
Total users processed: 37307


# CHECKPOINT: PRESAMPLE

In [10]:
tweets = pd.read_pickle(os.path.join(path,"Tweets_DataFrames","tweets_presample.gzip"), compression = "gzip")

# Start counter
tic = perf_counter()

# Get just the columns that we need for the Graph construction
cols = [
    'ID',
    'Author ID',
    'Author Name',
    'Referenced Tweet Author ID',
    'Date',
    'Reference Type',
    'Referenced Tweet'
]

tweets_lite = tweets[cols].reset_index(drop = True)
tweets_lite.rename(columns={'ID': 'Tweet ID'}, inplace=True)

# Store results
# run sudo chmod 777 Data/Tweets_DataFrames in bash if it is needed
tweets_lite.to_pickle(os.path.join(path, "Tweets_DataFrames","tweets_lite_pre.gzip"), compression = "gzip")
del tweets_lite
toc = perf_counter()
time = toc-tic
print(f"Finished saving tweets_lite_pre of presample in {time//60:,.0f} minutes with {round(time%60,2):,.2f} seconds")

# Start counter
tic = perf_counter()

def get_reference_author_name(x):
    try:
        return x.split(': ')[0].split('@')[1]
    except:
        return np.nan

# Extraer Retweets
retweets = tweets[tweets['Reference Type'] == 'retweeted'].drop(columns='Reference Type')

# Extraer nombre del Usuario Retwiteado
retweets['Referenced Tweet Author Name'] = retweets['Text'].apply(get_reference_author_name)

# Get just the columns that we need for the Graph construction
cols = [
    'ID',
    'Author ID',
    'Author Name',
    'Referenced Tweet Author ID',
    'Referenced Tweet Author Name',
    'Referenced Tweet',
    'Date'
]

retweets = retweets[cols].reset_index(drop = True)
retweets.rename(columns={'ID': 'Tweet ID', 'Referenced Tweet':'Referenced Tweet ID'}, inplace=True)

# Store results
# run sudo chmod 777 Data/Tweets_DataFrames in bash if it is needed
retweets.to_pickle(os.path.join(path, "Tweets_DataFrames","retweets_pre.gzip"), compression = "gzip")
del retweets
toc = perf_counter()
time = toc - tic

print(f"Finished saving retweets_pre in {time//60:,.0f} minutes with {round(time%60,2):,.2f} seconds")

Finished saving tweets_lite_pre of presample in 5 minutes with 42.43 seconds
Finished saving retweets_pre in 3 minutes with 18.94 seconds


# CHECKPOINT: CARGAR TWEETS COMPLETOS

In [4]:
tic = perf_counter()
tweets_paro = glob('/mnt/disk2/Data/Tweets_DataFrames/Tweets_Paro_Total/tweets_paro_*')

tweets = pd.DataFrame()
tweets_aux=[]
tipos = {
        'Author ID': 'float64',
        'Referenced Tweet Author ID': 'float64',
        'ID': 'float64',
        'Referenced Tweet':'float64'
    }

for file in tqdm(tweets_paro):
    tweets_df = pd.read_pickle(file, compression = "gzip").astype(tipos)
    #tweets = pd.concat([tweets, tweets_df], axis = 0)
    tweets_aux.append(tweets_df)
    del tweets_df
tweets = pd.concat(tweets_aux,ignore_index=True)
# Fill tweets that doesn't reference anyone as origina
tweets["Reference Type"] = tweets["Reference Type"].fillna("original tweet")

# Drop Values we don't know anything about
tweets.dropna(subset='Author ID', inplace=True)

# Reporting time lapsus
toc = perf_counter()
time = toc-tic
print(f"Finished loading whole tweets in {time//60:,.0f} minutes with {round(time%60,2):,.2f} seconds")

tweets.head(3)

100%|██████████| 7/7 [02:42<00:00, 23.28s/it]


Finished loading whole tweets in 3 minutes with 15.39 seconds


Unnamed: 0,ID,Permalink,Author ID,Author Name,Author Location,Author Description,Author Followers,Author Following,Author Tweets,Author Profile Image,Author Verified,Date,Text,Replies,Retweets,Favorites,Quotes,is Retweet?,Reply To User Name,Mentions,Referenced Tweet,Reference Type,Referenced Tweet Author ID,Media URLs,Media Keys
0,1.409619e+18,/hmauriciojg/status/1409618955283668996,138377765.0,hmauriciojg,"Bucaramanga, Colombia",,22.0,558.0,873.0,https://pbs.twimg.com/profile_images/154468480...,False,2021/06/28 16:05:23,@DanielSamperO A vida hp!!. @IvanDuque fue y s...,0.0,0.0,0.0,0.0,False,DanielSamperO,DanielSamperO IvanDuque petrogustavo,1.409586e+18,replied_to,134855300.0,,
1,1.409575e+18,/hmauriciojg/status/1409574993596452867,138377765.0,hmauriciojg,"Bucaramanga, Colombia",,22.0,558.0,873.0,https://pbs.twimg.com/profile_images/154468480...,False,2021/06/28 13:10:41,@alejarojas_g A bueno de pronto si @petrogusta...,0.0,0.0,0.0,0.0,False,alejarojas_g,alejarojas_g petrogustavo,1.409192e+18,replied_to,1131821000.0,,
2,1.409302e+18,/hmauriciojg/status/1409302180847292417,138377765.0,hmauriciojg,"Bucaramanga, Colombia",,22.0,558.0,873.0,https://pbs.twimg.com/profile_images/154468480...,False,2021/06/27 19:06:38,@gabodelascasas Ahí la tiene https://t.co/2WJZ...,0.0,0.0,0.0,0.0,False,gabodelascasas,gabodelascasas,1.409298e+18,replied_to,62337500.0,https://pbs.twimg.com/media/E47Y3H4XMAMtHHu.jpg,3_1409302174933397507


In [5]:
tweets[tweets['ID'] == 1391113172245831680]

Unnamed: 0,ID,Permalink,Author ID,Author Name,Author Location,Author Description,Author Followers,Author Following,Author Tweets,Author Profile Image,Author Verified,Date,Text,Replies,Retweets,Favorites,Quotes,is Retweet?,Reply To User Name,Mentions,Referenced Tweet,Reference Type,Referenced Tweet Author ID,Media URLs,Media Keys
44542043,1.391113e+18,/radio1040am/status/1391113172245831691,2434157000.0,radio1040am,Popayán Colombia,Emisora de Red Sonora Radio. Pasión por el Cau...,6054.0,239.0,24269.0,https://pbs.twimg.com/profile_images/821124947...,False,2021/05/08 14:30:00,#Noticias1040 \nEl Fiscal General y el Defenso...,0.0,0.0,0.0,0.0,False,,,,original tweet,,,
45321239,1.391113e+18,/nuevodiaibague/status/1391113172245831692,61925350.0,nuevodiaibague,Ibague - Colombia,El periódico de los tolimenses.\n#Tolima #Ibagué,53192.0,1829.0,234956.0,https://pbs.twimg.com/profile_images/144258337...,False,2021/05/08 14:30:00,👉 El emprendimiento se convirtió en una altern...,0.0,0.0,0.0,0.0,False,,,,original tweet,,,


# Tweets Lite

In [5]:
# Start counter
tic = perf_counter()

# Get just the columns that we need for the Graph construction
cols = [
    'ID',
    'Author ID',
    'Author Name',
    'Referenced Tweet Author ID',
    'Date',
    'Reference Type',
    'Referenced Tweet'
]

tweets_lite = tweets[cols].reset_index(drop = True)
tweets_lite.rename(columns={'ID': 'Tweet ID'}, inplace=True)

# Store results
# run sudo chmod 777 Data/Tweets_DataFrames in bash if it is needed
tweets_lite.to_pickle(os.path.join(path, "Tweets_DataFrames/tweets_lite.gzip"), compression = "gzip")
del tweets_lite
toc = perf_counter()
time = toc-tic
print(f"Finished saving tweets_lite in {time//60:,.0f} minutes with {round(time%60,2):,.2f} seconds")

Finished saving tweets_lite in 10 minutes with 43.02 seconds


# Retweets


In [6]:
# Start counter
tic = perf_counter()

def get_reference_author_name(x):
    try:
        return x.split(': ')[0].split('@')[1]
    except:
        return np.nan

# Extraer Retweets
retweets = tweets[tweets['Reference Type'] == 'retweeted'].drop(columns='Reference Type')

# Extraer nombre del Usuario Retwiteado
retweets['Referenced Tweet Author Name'] = retweets['Text'].apply(get_reference_author_name)

# Get just the columns that we need for the Graph construction
cols = [
    'ID',
    'Author ID',
    'Author Name',
    'Referenced Tweet Author ID',
    'Referenced Tweet Author Name',
    'Referenced Tweet',
    'Date'
]

retweets = retweets[cols].reset_index(drop = True)
retweets.rename(columns={'ID': 'Tweet ID', 'Referenced Tweet':'Referenced Tweet ID'}, inplace=True)

# Store results
# run sudo chmod 777 Data/Tweets_DataFrames in bash if it is needed
retweets.to_pickle(os.path.join(path, "Tweets_DataFrames/retweets.gzip"), compression = "gzip")
del retweets
toc = perf_counter()
time = toc - tic

print(f"Finished saving retweets in {time//60:,.0f} minutes with {round(time%60,2):,.2f} seconds")

Finished saving retweets in 8 minutes with 17.75 seconds


# Original Tweets

In [7]:
# Start counter
tic = perf_counter()

# Extraer tweets originales
original_tweets = tweets[tweets['Reference Type'] == 'original tweet'].drop(columns='Reference Type')

# Get just the columns that we need for the Graph construction
cols = [
    'ID',
    'Author ID',
    'Author Name',
    'Date',
]

original_tweets = original_tweets[cols].reset_index(drop = True)
original_tweets.rename(columns={'ID': 'Tweet ID'}, inplace=True)

# Store results
# run sudo chmod 777 Data/Tweets_DataFrames in bash if it is needed
original_tweets.to_pickle(os.path.join(path, "Tweets_DataFrames/original_tweets.gzip"), compression = "gzip")
del original_tweets
toc = perf_counter()
time = toc - tic

print(f"Finished saving original tweets in {time//60:,.0f} minutes with {round(time%60,2):,.2f} seconds")

Finished saving original tweets in 0 minutes with 48.74 seconds


# CHECKPOINT: Cargar Tweets Originales y Retweets

In [8]:
# Load Retweets
retweets = pd.read_pickle(os.path.join(path, "Tweets_DataFrames/retweets.gzip"), compression='gzip')

retweets = retweets.astype({
    'Author ID': 'float64',
    'Referenced Tweet Author ID': 'float64'
})
retweets["Date"] = pd.to_datetime(retweets["Date"], errors='coerce')

# Load Original tweets
original_tweets = pd.read_pickle(os.path.join(path, "Tweets_DataFrames/original_tweets.gzip"), compression='gzip')

original_tweets = original_tweets.astype({
    'Author ID': 'float64'
})
original_tweets["Date"] = pd.to_datetime(original_tweets["Date"], errors='coerce')

In [12]:
# Vamos a buscar la primera instancia de cada retweet. Esto nos llevará al tweet original que tenemos que encontrar
cols = ['Referenced Tweet Author ID','Referenced Tweet Author Name','Referenced Tweet ID', 'Date']
original_retweets = retweets[cols].sort_values(by=['Referenced Tweet ID', 'Date'])
original_retweets['row_number'] = original_retweets.groupby('Referenced Tweet ID').cumcount()

# Seleccionamos la primera instancia de cada Retweet
original_retweets = original_retweets[original_retweets['row_number'] == 0]

# Nombrar columnas
original_retweets = original_retweets.rename(columns = {
    'Referenced Tweet ID': 'Tweet ID',
    'Referenced Tweet Author ID': 'Author ID',
    'Referenced Tweet Author Name': 'Author Name',
})

# Eliminar fila de rank
original_retweets = original_retweets.drop(columns = 'row_number')
original_retweets['Date'] = original_retweets['Date'].dt.date
original_retweets.head()

Unnamed: 0,Author ID,Author Name,Tweet ID,Date
5941999,11611502.0,KRLS,948650800.0,2021-05-26
19122990,20322929.0,wizkhalifa,1222422000.0,2021-04-30
9047346,18369876.0,manibeto,1307903000.0,2021-05-11
8870085,14497313.0,SonyPictures,1316942000.0,2021-05-08
20694672,31927467.0,pitbull,1756103000.0,2021-05-09


In [13]:
# Revisamos que no hayan tweets duplicados para coger el número exacto de tweets originales que sabemos fueron retweeteados
original_retweets[original_retweets.duplicated()]

Unnamed: 0,Author ID,Author Name,Tweet ID,Date


In [14]:
# Revisamos que hay Tweet IDs duplicados. Ya que encontramos duplicados, procedemos a borrarlos
original_tweets[original_tweets.duplicated(subset = 'Tweet ID')]

Unnamed: 0,Tweet ID,Author ID,Author Name,Date
75855,0.000000e+00,0.0,0,NaT
75862,0.000000e+00,0.0,0,NaT
75875,0.000000e+00,0.0,0,NaT
75886,0.000000e+00,0.0,0,NaT
83589,0.000000e+00,0.0,0,NaT
...,...,...,...,...
4531386,1.389584e+18,370873343.0,aleltbd,2021-05-04 09:14:36
4538223,1.403106e+18,455212894.0,leonacassiani7,2021-06-10 16:44:40
4541332,1.405531e+18,61925350.0,nuevodiaibague,2021-06-17 09:20:00
4541917,1.395057e+18,61925350.0,nuevodiaibague,2021-05-19 11:40:00


In [15]:
original_tweets.drop_duplicates(subset = 'Tweet ID', inplace=True)
original_tweets['Date'] = original_tweets['Date'].dt.date

# Obtenemos lo Tweets originales y los tweets originales Retweiteados
original = pd.concat([original_retweets, original_tweets])

print(f"Total de tweets originales (Retweeteados y no retweeteados) {len(original):,.0f}")
original.head()

Total de tweets originales (Retweeteados y no retweeteados) 10,455,352


Unnamed: 0,Author ID,Author Name,Tweet ID,Date
5941999,11611502.0,KRLS,948650800.0,2021-05-26
19122990,20322929.0,wizkhalifa,1222422000.0,2021-04-30
9047346,18369876.0,manibeto,1307903000.0,2021-05-11
8870085,14497313.0,SonyPictures,1316942000.0,2021-05-08
20694672,31927467.0,pitbull,1756103000.0,2021-05-09


In [16]:
# Revisamos por tweets duplicados
# Esto puede pasar por que en la tabla original_retweets algunos de esos tweets retweeteados estaban en base de datos
# Al cominar las tabla original_tweets con original_retweets, estos saldrán duplicados
original[original.duplicated(subset = 'Tweet ID')]

Unnamed: 0,Author ID,Author Name,Tweet ID,Date
137,7.778842e+07,JavierDMC_,1.393037e+18,2021-05-13
161,7.778842e+07,JavierDMC_,1.388348e+18,2021-04-30
168,1.211572e+08,andresmpn,1.395863e+18,2021-05-21
182,1.211572e+08,andresmpn,1.391517e+18,2021-05-09
192,7.608689e+17,DANILO25031974,1.404770e+18,2021-06-15
...,...,...,...,...
4543182,2.562833e+08,MJVGaray,1.401578e+18,2021-06-06
4543190,2.562833e+08,MJVGaray,1.390106e+18,2021-05-05
4543222,1.395573e+09,PerdomoPilar,1.398728e+18,2021-05-29
4543230,1.395573e+09,PerdomoPilar,1.396171e+18,2021-05-22


In [18]:
# Cuantos Tweets originales y retweets tenemos
retweets_id = retweets['Referenced Tweet ID'].unique()
original_tweets_id = original_tweets['Tweet ID'].unique()
print(f"Tenemos {len(retweets_id):,} Retweets únicos y {len(original_tweets_id):,} Tweets originales únicos.")

# De los retweets, cuantos de esos tenemos en su versión originales
original_tweets_retweeted = set(retweets_id).intersection(set(original_tweets_id))
print(f"De los {len(original_tweets_id):,} Tweets originales que tenemos, {len(original_tweets_retweeted):,} fueron retweeteados y los tenemos en base de datos.")

Tenemos 5,912,692 Retweets únicos y 4,542,660 Tweets originales únicos.
De los 4,542,660 Tweets originales que tenemos, 811,479 fueron retweeteados y los tenemos en base de datos.


In [19]:
# Guardar DataFrames
original.to_pickle(os.path.join(path, "Tweets_DataFrames/original.gzip"), compression = "gzip")
original_retweets.to_pickle(os.path.join(path, "Tweets_DataFrames/original_retweets.gzip"), compression = "gzip")
original_tweets.to_pickle(os.path.join(path, "Tweets_DataFrames/original_tweets.gzip"), compression = "gzip")