In [1]:
# Mathematical and Data Managment
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Graph Managment
import graph_tool.all as gt
from utils.Functions import *

# Miscellaneous
import pickle
import os
from tqdm import tqdm
from glob import glob
from concurrent.futures import ProcessPoolExecutor



In [2]:
# Load Retweets
retweets = pd.read_pickle('../../../Data/Tweets_DataFrames/retweets_lite.gzip', compression='gzip')
retweets[retweets.isna()]
retweets = retweets.astype({
    'Author ID': 'Int64',
    'Referenced Tweet Author ID': 'Int64'
})
retweets["Date"] = pd.to_datetime(retweets["Date"], errors='coerce')

# Load Original tweets
original_tweets = pd.read_pickle('../../../Data/Tweets_DataFrames/original_tweets_lite.gzip', compression='gzip')
original_tweets[original_tweets.isna()]
original_tweets = original_tweets.astype({
    'Author ID': 'Int64'
})
original_tweets["Date"] = pd.to_datetime(original_tweets["Date"], errors='coerce')

In [4]:
retweets.head()

Unnamed: 0,Tweet ID,Author ID,Author Name,Referenced Tweet Author ID,Referenced Tweet Author Name,Date,Referenced Tweet
0,1.409515e+18,788250746,Laura_Milena98,142491250,Jokeraton,2021-06-28 09:11:18,1.408756e+18
1,1.408912e+18,788250746,Laura_Milena98,261704681,majogomez30,2021-06-26 17:15:34,1.408428e+18
2,1.408233e+18,788250746,Laura_Milena98,830539402103382016,ManuelBeltrn14,2021-06-24 20:16:16,1.407308e+18
3,1.408232e+18,788250746,Laura_Milena98,56713266,santorendon,2021-06-24 20:14:47,1.408212e+18
4,1.407883e+18,788250746,Laura_Milena98,1140705274174590976,PATATAdibujo,2021-06-23 21:06:16,1.407883e+18


In [6]:
# Vamos a buscar la primera instancia de cada retweet. Esto nos llevará al tweet original que tenemos que encontrar
cols = ['Referenced Tweet Author ID','Referenced Tweet Author Name','Referenced Tweet', 'Date']
original_retweets = retweets[cols].sort_values(by=['Referenced Tweet', 'Date'])
original_retweets['row_number'] = original_retweets.groupby('Referenced Tweet').cumcount()
original_retweets

Unnamed: 0,Referenced Tweet Author ID,Referenced Tweet Author Name,Referenced Tweet,Date,row_number
5941999,11611502,KRLS,9.486508e+08,2021-05-26 14:46:09,0
12046101,11611502,KRLS,9.486508e+08,2021-05-26 15:29:48,1
24750366,11611502,KRLS,9.486508e+08,2021-05-26 17:56:11,2
23694432,11611502,KRLS,9.486508e+08,2021-05-27 02:22:24,3
19122990,20322929,wizkhalifa,1.222422e+09,2021-04-30 17:19:52,0
...,...,...,...,...,...
7102721,141943866,JUANCAELBROKY,1.410100e+18,2021-06-29 23:59:45,0
8154557,141943866,JUANCAELBROKY,1.410100e+18,2021-06-29 23:59:47,1
3515228,35013719,NoticiasRCN,1.410100e+18,2021-06-29 23:59:44,0
14304641,1235982161938976768,TheCamilaDaily,1.410101e+18,2021-06-29 23:59:47,0


In [7]:
original_retweets = original_retweets[original_retweets['row_number'] == 0]
original_retweets

Unnamed: 0,Referenced Tweet Author ID,Referenced Tweet Author Name,Referenced Tweet,Date,row_number
5941999,11611502,KRLS,9.486508e+08,2021-05-26 14:46:09,0
19122990,20322929,wizkhalifa,1.222422e+09,2021-04-30 17:19:52,0
9047346,18369876,manibeto,1.307903e+09,2021-05-11 10:31:36,0
8870085,14497313,SonyPictures,1.316942e+09,2021-05-08 13:33:52,0
20694672,31927467,pitbull,1.756103e+09,2021-05-09 03:05:36,0
...,...,...,...,...,...
3515229,3075028654,HelenUribista,1.410100e+18,2021-06-29 23:59:35,0
7102721,141943866,JUANCAELBROKY,1.410100e+18,2021-06-29 23:59:45,0
3515228,35013719,NoticiasRCN,1.410100e+18,2021-06-29 23:59:44,0
14304641,1235982161938976768,TheCamilaDaily,1.410101e+18,2021-06-29 23:59:47,0


In [8]:
original_retweets = original_retweets.rename(columns = {
    'Referenced Tweet': 'Tweet ID',
    'Referenced Tweet Author ID': 'Author ID',
    'Referenced Tweet Author Name': 'Author Name',
})

original_retweets = original_retweets.drop(columns = 'row_number')
original_retweets['Date'] = original_retweets['Date'].dt.date
original_retweets.head()

Unnamed: 0,Author ID,Author Name,Tweet ID,Date
5941999,11611502,KRLS,948650800.0,2021-05-26
19122990,20322929,wizkhalifa,1222422000.0,2021-04-30
9047346,18369876,manibeto,1307903000.0,2021-05-11
8870085,14497313,SonyPictures,1316942000.0,2021-05-08
20694672,31927467,pitbull,1756103000.0,2021-05-09


In [9]:
# Revisamos que no hayan tweets duplicados para coger el número exacto de tweets originales que sabemos fueron rewteeteados
original_retweets[original_retweets.duplicated()]

Unnamed: 0,Author ID,Author Name,Tweet ID,Date


In [10]:
original_tweets.head()

Unnamed: 0,Tweet ID,Author ID,Author Name,Date
0,1.397298e+18,138377765,hmauriciojg,2021-05-25 16:06:23
1,1.394702e+18,138377765,hmauriciojg,2021-05-18 12:08:44
2,1.389576e+18,138377765,hmauriciojg,2021-05-04 08:41:29
3,1.389273e+18,138377765,hmauriciojg,2021-05-03 12:35:56
4,1.409909e+18,788250746,Laura_Milena98,2021-06-29 11:16:36


In [11]:
# Revisamos que hay duplicados. Ya que encontramos duplicados, procedemos a borrarlos
original_tweets[original_tweets.duplicated(subset = 'Tweet ID')]

Unnamed: 0,Tweet ID,Author ID,Author Name,Date
75855,0.000000e+00,0,0,NaT
75862,0.000000e+00,0,0,NaT
75875,0.000000e+00,0,0,NaT
75886,0.000000e+00,0,0,NaT
83589,0.000000e+00,0,0,NaT
...,...,...,...,...
4531386,1.389584e+18,370873343,aleltbd,2021-05-04 09:14:36
4538223,1.403106e+18,455212894,leonacassiani7,2021-06-10 16:44:40
4541332,1.405531e+18,61925350,nuevodiaibague,2021-06-17 09:20:00
4541917,1.395057e+18,61925350,nuevodiaibague,2021-05-19 11:40:00


In [12]:
original_tweets.drop_duplicates(subset = 'Tweet ID', inplace=True)
original_tweets['Date'] = original_tweets['Date'].dt.date
original_tweets.head()

Unnamed: 0,Tweet ID,Author ID,Author Name,Date
0,1.397298e+18,138377765,hmauriciojg,2021-05-25
1,1.394702e+18,138377765,hmauriciojg,2021-05-18
2,1.389576e+18,138377765,hmauriciojg,2021-05-04
3,1.389273e+18,138377765,hmauriciojg,2021-05-03
4,1.409909e+18,788250746,Laura_Milena98,2021-06-29


In [13]:
original = pd.concat([original_retweets, original_tweets])
print(original.shape)
original.head()

(10455352, 4)


Unnamed: 0,Author ID,Author Name,Tweet ID,Date
5941999,11611502,KRLS,948650800.0,2021-05-26
19122990,20322929,wizkhalifa,1222422000.0,2021-04-30
9047346,18369876,manibeto,1307903000.0,2021-05-11
8870085,14497313,SonyPictures,1316942000.0,2021-05-08
20694672,31927467,pitbull,1756103000.0,2021-05-09


In [14]:
# Revisamos por tweets duplicados (Estob puede pasar por que en la tabla original_retweets algunos de esos tweets retweeteados estaban en base de datos
# Al cominar las tabla original_tweets con original_retweets, estos saldrán duplicados
original[original.duplicated(subset = 'Tweet ID')]

Unnamed: 0,Author ID,Author Name,Tweet ID,Date
137,77788421,JavierDMC_,1.393037e+18,2021-05-13
161,77788421,JavierDMC_,1.388348e+18,2021-04-30
168,121157181,andresmpn,1.395863e+18,2021-05-21
182,121157181,andresmpn,1.391517e+18,2021-05-09
192,760868895293046784,DANILO25031974,1.404770e+18,2021-06-15
...,...,...,...,...
4543182,256283311,MJVGaray,1.401578e+18,2021-06-06
4543190,256283311,MJVGaray,1.390106e+18,2021-05-05
4543222,1395572754,PerdomoPilar,1.398728e+18,2021-05-29
4543230,1395572754,PerdomoPilar,1.396171e+18,2021-05-22


In [15]:
# Cuantos Tweets originales y retweets tenemos
retweets_id = retweets['Referenced Tweet'].unique()
original_tweets_id = original_tweets['Tweet ID'].unique()
print(f"Tenemos {len(retweets_id):,} Retweets únicos y {len(original_tweets_id):,} Tweets originales únicos.")

# De los retweets, cuantos de esos tenemos en su versión originales
original_tweets_retweeted = set(retweets_id).intersection(set(original_tweets_id))
print(f"De los {len(original_tweets_id):,} Tweets originales que tenemos, {len(original_tweets_retweeted):,} fueron retweeteados y los tenemos en base de datos.")

Tenemos 5,912,692 Retweets únicos y 4,542,660 Tweets originales únicos.
De los 4,542,660 Tweets originales que tenemos, 811,479 fueron retweeteados y los tenemos en base de datos.


In [16]:
original.drop_duplicates(subset = 'Tweet ID', inplace=True)
original

Unnamed: 0,Author ID,Author Name,Tweet ID,Date
5941999,11611502,KRLS,9.486508e+08,2021-05-26
19122990,20322929,wizkhalifa,1.222422e+09,2021-04-30
9047346,18369876,manibeto,1.307903e+09,2021-05-11
8870085,14497313,SonyPictures,1.316942e+09,2021-05-08
20694672,31927467,pitbull,1.756103e+09,2021-05-09
...,...,...,...,...
4543236,1395572754,PerdomoPilar,1.393022e+18,2021-05-13
4543237,1395572754,PerdomoPilar,1.390884e+18,2021-05-07
4543238,1395572754,PerdomoPilar,1.390039e+18,2021-05-05
4543240,1395572754,PerdomoPilar,1.390036e+18,2021-05-05


In [None]:
path = r"/mnt/disk2/Data"
original.to_pickle(os.path.join(path, "Tweets_DataFrames/original.gzip"), compression = "gzip")