In [1]:
import ast
import json
import time
import requests
import pandas as pd
import networkx as nx
from tqdm import tqdm
from collections import Counter
from pyvis.network import Network
from collections import defaultdict
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain

tqdm.pandas()

In [2]:
BEARER_TOKEN = "AAAAAAAAAAAAAAAAAAAAAOoVugEAAAAADL4x5tJZlW27pH%2FEwVzhWpiEcoU%3DwH48AVCg9driugNHvboWrRdJz7613trmWFqwz8R2KnyihgGnqM"

# Get data

In [3]:
def get_tweets_by_batch(query, max_total=100):
    ans = {
        'tweets': [],
        'users': []
    }
    next_token = None

    url = "https://api.twitter.com/2/tweets/search/recent"
    headers = {"Authorization": f"Bearer {BEARER_TOKEN}"}
    params = {
        "query": query,
        "tweet.fields": "created_at,author_id,public_metrics,entities",
        "expansions": "author_id",
        "user.fields": "username",
        "max_results": max_total if max_total < 100 else 100
    }

    while len(ans['tweets']) < max_total:
        if next_token:
            params["next_token"] = next_token

        response = requests.get(url, headers=headers, params=params)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        data = response.json()

        if "data" in data:
            ans['tweets'].extend(data["data"])
            print(f"Descargando: {len(ans['tweets'])} tweets")

        if "includes" in data:
            if "users" in data['includes']:
                ans['users'].extend(data["includes"]["users"])

        meta = data.get("meta", {})
        next_token = meta.get("next_token")
        if not next_token:
            break
        
        time.sleep(2)

    return ans

In [4]:
def get_mentions(user_id, max_total=100):
    ans = {
        'tweets': [],
        'users': []
    }
    next_token = None

    url = f"https://api.x.com/2/users/{user_id}/mentions"
    headers = {"Authorization": f"Bearer {BEARER_TOKEN}"}
    params = {
        "tweet.fields": "created_at,author_id,public_metrics,entities",
        "expansions": "author_id",
        "user.fields": "username",
        "max_results": max_total if max_total < 100 else 100
    }

    while len(ans['tweets']) < max_total:
        if next_token:
            params["pagination_token"] = next_token

        response = requests.get(url, headers=headers, params=params)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        data = response.json()

        if "data" in data:
            ans['tweets'].extend(data["data"])
            print(f"Descargando: {len(ans['tweets'])} tweets")

        if "includes" in data:
            if "users" in data['includes']:
                ans['users'].extend(data["includes"]["users"])

        meta = data.get("meta", {})
        next_token = meta.get("next_token")
        if not next_token:
            break
        
        time.sleep(2)

    return ans

In [5]:
set_search = 'search' # 'mentions' # 'search' or 'mentions'

if set_search == 'search':
    data = get_tweets_by_batch("@JDOviedoAr -is:retweet lang:es", max_total=5000)
elif set_search == 'mentions':
    data = get_mentions(219434063, 1000)

Descargando: 100 tweets
Descargando: 200 tweets
Descargando: 300 tweets
Descargando: 400 tweets
Descargando: 500 tweets
Descargando: 600 tweets
Descargando: 700 tweets
Descargando: 800 tweets
Descargando: 900 tweets
Descargando: 1000 tweets
Descargando: 1100 tweets
Descargando: 1200 tweets
Descargando: 1300 tweets
Descargando: 1400 tweets
Descargando: 1500 tweets
Descargando: 1600 tweets
Descargando: 1700 tweets
Descargando: 1800 tweets
Descargando: 1900 tweets
Descargando: 2000 tweets
Descargando: 2100 tweets
Descargando: 2200 tweets
Descargando: 2300 tweets
Descargando: 2400 tweets
Descargando: 2500 tweets
Descargando: 2600 tweets
Descargando: 2652 tweets


In [6]:
print(f'Descargados {len(data["tweets"])} tweets')

Descargados 2652 tweets


In [7]:
file_path = f"../data/{set_search}/JDOviedoAr.json"

with open(file_path, "w") as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

In [8]:
tweets = data["tweets"]
users = {u["id"]: {'username': u["username"], 'name': u["name"]} for u in data["users"]}

In [9]:
df = pd.DataFrame(tweets)
df

Unnamed: 0,entities,author_id,id,text,created_at,public_metrics,edit_history_tweet_ids
0,"{'annotations': [{'start': 265, 'end': 272, 'p...",177823502,1973030623792296212,El RIDICULO que hicieron estos señores de la u...,2025-09-30T14:22:12.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973030623792296212]
1,"{'mentions': [{'start': 0, 'end': 15, 'usernam...",3145461383,1973029119152787676,@lavozdelojusto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30T14:16:13.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973029119152787676]
2,"{'annotations': [{'start': 76, 'end': 79, 'pro...",3145461383,1973028989192314940,@Palabrejo @JDOviedoAr @ConTodaPorCol @MejorJu...,2025-09-30T14:15:42.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028989192314940]
3,"{'annotations': [{'start': 63, 'end': 71, 'pro...",3145461383,1973028873748238623,@527ElduroJadin @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30T14:15:15.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028873748238623]
4,"{'annotations': [{'start': 60, 'end': 64, 'pro...",3145461383,1973028724628238409,@sircamilopinto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30T14:14:39.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028724628238409]
...,...,...,...,...,...,...,...
2647,"{'mentions': [{'start': 0, 'end': 7, 'username...",1597223287,1970534072402550958,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23T17:01:48.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970534072402550958]
2648,"{'mentions': [{'start': 0, 'end': 7, 'username...",1488698923,1970533639801786425,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23T17:00:05.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533639801786425]
2649,"{'mentions': [{'start': 0, 'end': 7, 'username...",1378620511,1970533637004443968,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23T17:00:04.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533637004443968]
2650,"{'mentions': [{'start': 111, 'end': 126, 'user...",1638556330976747520,1970522817612652670,Carta Solicitud Reconciderar Cupos Escolares E...,2025-09-23T16:17:04.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970522817612652670]


In [10]:
df[["username", "name"]] = df["author_id"].apply(lambda author_id: pd.Series([users[author_id]['username'], users[author_id]['name']]))
df

Unnamed: 0,entities,author_id,id,text,created_at,public_metrics,edit_history_tweet_ids,username,name
0,"{'annotations': [{'start': 265, 'end': 272, 'p...",177823502,1973030623792296212,El RIDICULO que hicieron estos señores de la u...,2025-09-30T14:22:12.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973030623792296212],mikeariza,Miguel Ariza Castro
1,"{'mentions': [{'start': 0, 'end': 15, 'usernam...",3145461383,1973029119152787676,@lavozdelojusto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30T14:16:13.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973029119152787676],08Rosareda,Andrea 🦕 🌌💋🐳🐙
2,"{'annotations': [{'start': 76, 'end': 79, 'pro...",3145461383,1973028989192314940,@Palabrejo @JDOviedoAr @ConTodaPorCol @MejorJu...,2025-09-30T14:15:42.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028989192314940],08Rosareda,Andrea 🦕 🌌💋🐳🐙
3,"{'annotations': [{'start': 63, 'end': 71, 'pro...",3145461383,1973028873748238623,@527ElduroJadin @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30T14:15:15.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028873748238623],08Rosareda,Andrea 🦕 🌌💋🐳🐙
4,"{'annotations': [{'start': 60, 'end': 64, 'pro...",3145461383,1973028724628238409,@sircamilopinto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30T14:14:39.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028724628238409],08Rosareda,Andrea 🦕 🌌💋🐳🐙
...,...,...,...,...,...,...,...,...,...
2647,"{'mentions': [{'start': 0, 'end': 7, 'username...",1597223287,1970534072402550958,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23T17:01:48.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970534072402550958],umabdalaziz3,BENJAMÍN
2648,"{'mentions': [{'start': 0, 'end': 7, 'username...",1488698923,1970533639801786425,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23T17:00:05.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533639801786425],maha_875,lina maría
2649,"{'mentions': [{'start': 0, 'end': 7, 'username...",1378620511,1970533637004443968,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23T17:00:04.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533637004443968],Pepiinno,ricardo
2650,"{'mentions': [{'start': 111, 'end': 126, 'user...",1638556330976747520,1970522817612652670,Carta Solicitud Reconciderar Cupos Escolares E...,2025-09-23T16:17:04.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970522817612652670],JeissonRayoR,Jeisson S. Rayo Rodríguez


In [11]:
df[['text', 'username']].duplicated().sum()

np.int64(1)

In [12]:
df[df[['text', 'username']].duplicated()]

Unnamed: 0,entities,author_id,id,text,created_at,public_metrics,edit_history_tweet_ids,username,name
2310,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",1076986340,1971298152725774541,@JDOviedoAr Aja mí llave cómo cuántos votos cr...,2025-09-25T19:37:59.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1971298152725774541],FERNANDORAVEM,FERNANDO RAVE M.


In [13]:
df["created_at"] = pd.to_datetime(df["created_at"])
df

Unnamed: 0,entities,author_id,id,text,created_at,public_metrics,edit_history_tweet_ids,username,name
0,"{'annotations': [{'start': 265, 'end': 272, 'p...",177823502,1973030623792296212,El RIDICULO que hicieron estos señores de la u...,2025-09-30 14:22:12+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973030623792296212],mikeariza,Miguel Ariza Castro
1,"{'mentions': [{'start': 0, 'end': 15, 'usernam...",3145461383,1973029119152787676,@lavozdelojusto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:16:13+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973029119152787676],08Rosareda,Andrea 🦕 🌌💋🐳🐙
2,"{'annotations': [{'start': 76, 'end': 79, 'pro...",3145461383,1973028989192314940,@Palabrejo @JDOviedoAr @ConTodaPorCol @MejorJu...,2025-09-30 14:15:42+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028989192314940],08Rosareda,Andrea 🦕 🌌💋🐳🐙
3,"{'annotations': [{'start': 63, 'end': 71, 'pro...",3145461383,1973028873748238623,@527ElduroJadin @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:15:15+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028873748238623],08Rosareda,Andrea 🦕 🌌💋🐳🐙
4,"{'annotations': [{'start': 60, 'end': 64, 'pro...",3145461383,1973028724628238409,@sircamilopinto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:14:39+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028724628238409],08Rosareda,Andrea 🦕 🌌💋🐳🐙
...,...,...,...,...,...,...,...,...,...
2647,"{'mentions': [{'start': 0, 'end': 7, 'username...",1597223287,1970534072402550958,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23 17:01:48+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970534072402550958],umabdalaziz3,BENJAMÍN
2648,"{'mentions': [{'start': 0, 'end': 7, 'username...",1488698923,1970533639801786425,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23 17:00:05+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533639801786425],maha_875,lina maría
2649,"{'mentions': [{'start': 0, 'end': 7, 'username...",1378620511,1970533637004443968,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23 17:00:04+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533637004443968],Pepiinno,ricardo
2650,"{'mentions': [{'start': 111, 'end': 126, 'user...",1638556330976747520,1970522817612652670,Carta Solicitud Reconciderar Cupos Escolares E...,2025-09-23 16:17:04+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970522817612652670],JeissonRayoR,Jeisson S. Rayo Rodríguez


In [14]:
df.to_csv(f'../data/{set_search}/JDOviedoAr_raw.csv', index=False)

# Sentiment analysis

In [15]:
df = df.drop_duplicates(subset=['text', 'username'])
df

Unnamed: 0,entities,author_id,id,text,created_at,public_metrics,edit_history_tweet_ids,username,name
0,"{'annotations': [{'start': 265, 'end': 272, 'p...",177823502,1973030623792296212,El RIDICULO que hicieron estos señores de la u...,2025-09-30 14:22:12+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973030623792296212],mikeariza,Miguel Ariza Castro
1,"{'mentions': [{'start': 0, 'end': 15, 'usernam...",3145461383,1973029119152787676,@lavozdelojusto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:16:13+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973029119152787676],08Rosareda,Andrea 🦕 🌌💋🐳🐙
2,"{'annotations': [{'start': 76, 'end': 79, 'pro...",3145461383,1973028989192314940,@Palabrejo @JDOviedoAr @ConTodaPorCol @MejorJu...,2025-09-30 14:15:42+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028989192314940],08Rosareda,Andrea 🦕 🌌💋🐳🐙
3,"{'annotations': [{'start': 63, 'end': 71, 'pro...",3145461383,1973028873748238623,@527ElduroJadin @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:15:15+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028873748238623],08Rosareda,Andrea 🦕 🌌💋🐳🐙
4,"{'annotations': [{'start': 60, 'end': 64, 'pro...",3145461383,1973028724628238409,@sircamilopinto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:14:39+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028724628238409],08Rosareda,Andrea 🦕 🌌💋🐳🐙
...,...,...,...,...,...,...,...,...,...
2647,"{'mentions': [{'start': 0, 'end': 7, 'username...",1597223287,1970534072402550958,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23 17:01:48+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970534072402550958],umabdalaziz3,BENJAMÍN
2648,"{'mentions': [{'start': 0, 'end': 7, 'username...",1488698923,1970533639801786425,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23 17:00:05+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533639801786425],maha_875,lina maría
2649,"{'mentions': [{'start': 0, 'end': 7, 'username...",1378620511,1970533637004443968,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23 17:00:04+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533637004443968],Pepiinno,ricardo
2650,"{'mentions': [{'start': 111, 'end': 126, 'user...",1638556330976747520,1970522817612652670,Carta Solicitud Reconciderar Cupos Escolares E...,2025-09-23 16:17:04+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970522817612652670],JeissonRayoR,Jeisson S. Rayo Rodríguez


In [16]:
llm = ChatOpenAI(
    api_key="EMPTY",
    model="Qwen/Qwen2.5-32B-Instruct-AWQ",
    base_url="http://10.20.21.118:8038/v1",
    temperature=0,
    model_kwargs={"response_format": {"type": "json_object"}}
)

In [17]:
def get_sentiment_llm(text):
    try:
        messages = [
            (
                "system",
                assistant_prompt,
            ),
            (
                "human",
                text
            )
        ]
        
        resp = llm.invoke(messages)
        classification = json.loads(resp.content)
        return classification['sentiment']
    except:
        return 'Error'

In [18]:
assistant_prompt = """
Eres un asistente encargado de analizar tweets.
Tu tarea es identificar el sentimiento expresado específicamente hacia el usuario de interés "JDOviedoAr".
Ten en cuenta que un tweet puede tener un sentimiento negativo frente a una situación u otro usuario, pero ser positiva en lo que corresponde a la cuenta de interés "JDOviedoAr".
Clasifica el comentario como POSITIVO, NEGATIVO o NEUTRO.
Si no es claro si el tweet es positivo o negativo con respecto a lo que menciona acerca de "JDOviedoAr", marca como NEUTRO.
Retorna un JSON con la clasificación correspondiente en la llave "sentiment"
""".strip()

In [19]:
sample_text = df.loc[0, 'text']
sentiment = get_sentiment_llm(sample_text)
print(sample_text, '::', sentiment)
print('-' * 10)
sample_text = df.loc[1, 'text']
sentiment = get_sentiment_llm(sample_text)
print(sample_text, '::', sentiment)

El RIDICULO que hicieron estos señores de la ultraderecha uribista @MauricioCard @pipecordoba @MauricioGomezCO @mclacouture @lunadavid @JDOviedoAr @EnriquePenalosa @mluciaramirez @juanmanuelgalan ARRODILLADOS ante un presidente CONVICTO, ninguno merece un voto de #Colombia. https://t.co/xu0bC6BXFs :: NEGATIVO
----------
@lavozdelojusto @JDOviedoAr @ConTodaPorCol @MejorJuntos_col Usted no sabe que el que es homofóbico algo le pasa..pues no sé eso dicen :: NEUTRO


In [20]:
df["sentiment_llm"] = df["text"].progress_apply(lambda text: get_sentiment_llm(text))
df

100%|███████████████████████████████████████| 2651/2651 [13:21<00:00,  3.31it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentiment_llm"] = df["text"].progress_apply(lambda text: get_sentiment_llm(text))


Unnamed: 0,entities,author_id,id,text,created_at,public_metrics,edit_history_tweet_ids,username,name,sentiment_llm
0,"{'annotations': [{'start': 265, 'end': 272, 'p...",177823502,1973030623792296212,El RIDICULO que hicieron estos señores de la u...,2025-09-30 14:22:12+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973030623792296212],mikeariza,Miguel Ariza Castro,NEGATIVO
1,"{'mentions': [{'start': 0, 'end': 15, 'usernam...",3145461383,1973029119152787676,@lavozdelojusto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:16:13+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973029119152787676],08Rosareda,Andrea 🦕 🌌💋🐳🐙,NEUTRO
2,"{'annotations': [{'start': 76, 'end': 79, 'pro...",3145461383,1973028989192314940,@Palabrejo @JDOviedoAr @ConTodaPorCol @MejorJu...,2025-09-30 14:15:42+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028989192314940],08Rosareda,Andrea 🦕 🌌💋🐳🐙,NEUTRO
3,"{'annotations': [{'start': 63, 'end': 71, 'pro...",3145461383,1973028873748238623,@527ElduroJadin @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:15:15+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028873748238623],08Rosareda,Andrea 🦕 🌌💋🐳🐙,NEUTRO
4,"{'annotations': [{'start': 60, 'end': 64, 'pro...",3145461383,1973028724628238409,@sircamilopinto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:14:39+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028724628238409],08Rosareda,Andrea 🦕 🌌💋🐳🐙,NEUTRO
...,...,...,...,...,...,...,...,...,...,...
2647,"{'mentions': [{'start': 0, 'end': 7, 'username...",1597223287,1970534072402550958,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23 17:01:48+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970534072402550958],umabdalaziz3,BENJAMÍN,NEUTRO
2648,"{'mentions': [{'start': 0, 'end': 7, 'username...",1488698923,1970533639801786425,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23 17:00:05+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533639801786425],maha_875,lina maría,NEUTRO
2649,"{'mentions': [{'start': 0, 'end': 7, 'username...",1378620511,1970533637004443968,@Hora20 @JDOviedoAr @susanamuhamad @PaolaHolgu...,2025-09-23 17:00:04+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970533637004443968],Pepiinno,ricardo,NEUTRO
2650,"{'mentions': [{'start': 111, 'end': 126, 'user...",1638556330976747520,1970522817612652670,Carta Solicitud Reconciderar Cupos Escolares E...,2025-09-23 16:17:04+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1970522817612652670],JeissonRayoR,Jeisson S. Rayo Rodríguez,NEUTRO


In [21]:
df["sentiment_llm"].value_counts(normalize=True).to_frame('Count')

Unnamed: 0_level_0,Count
sentiment_llm,Unnamed: 1_level_1
NEUTRO,0.506978
NEGATIVO,0.431912
POSITIVO,0.061109


In [22]:
df.to_csv(f'../data/{set_search}/JDOviedoAr_sentiment.csv', index=False)

# Graph

In [23]:
G = nx.DiGraph()

for tweet in tweets:
    author = users[tweet["author_id"]]["username"]

    G.add_node(author)

    mentions = tweet.get("entities", {}).get("mentions", [])
    for m in mentions:
        mentioned_user = m["username"]
        G.add_node(mentioned_user)
        G.add_edge(author, mentioned_user, type="mention")

In [24]:
pagerank = nx.pagerank(G)
top_users = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:20]
print("PageRank:")
for user, score in top_users:
    print(f"{user}: {score:.4f}")

PageRank:
JDOviedoAr: 0.1127
MauricioCard: 0.0377
juanmanuelgalan: 0.0345
ConTodaPorCol: 0.0341
lunadavid: 0.0321
EnriquePenalosa: 0.0297
petrogustavo: 0.0280
mluciaramirez: 0.0250
MauricioGomezCO: 0.0249
mclacouture: 0.0169
pipecordoba: 0.0123
UltimaHoraCR: 0.0111
PalomaValenciaL: 0.0074
FenalcoNacional: 0.0061
EfrainCepeda: 0.0050
VickyDavilaH: 0.0046
consejodeestado: 0.0042
Hora20: 0.0031
PaolaHolguin: 0.0021
MejorJuntos_col: 0.0019


In [25]:
pagerank = nx.pagerank(G)
top100_users_pr = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:100]
print("PageRank:")
for user, score in top100_users_pr:
    print(f"{user}: {score:.4f}")

PageRank:
JDOviedoAr: 0.1127
MauricioCard: 0.0377
juanmanuelgalan: 0.0345
ConTodaPorCol: 0.0341
lunadavid: 0.0321
EnriquePenalosa: 0.0297
petrogustavo: 0.0280
mluciaramirez: 0.0250
MauricioGomezCO: 0.0249
mclacouture: 0.0169
pipecordoba: 0.0123
UltimaHoraCR: 0.0111
PalomaValenciaL: 0.0074
FenalcoNacional: 0.0061
EfrainCepeda: 0.0050
VickyDavilaH: 0.0046
consejodeestado: 0.0042
Hora20: 0.0031
PaolaHolguin: 0.0021
MejorJuntos_col: 0.0019
sergio_fajardo: 0.0013
ABDELAESPRIELLA: 0.0013
susanamuhamad: 0.0012
CaracolRadio: 0.0012
DianaCalderonF: 0.0011
cuervoji: 0.0009
paolaholguinm: 0.0009
PaulaMorenoV: 0.0008
PinzonBueno: 0.0006
MariaFdaCabal: 0.0006
German_Vargas: 0.0006
Antonio_Perry_: 0.0006
Borbon62610859: 0.0006
susana: 0.0006
DCoronell: 0.0005
carolinacorcho: 0.0005
IAelectoral: 0.0005
CadavidAnaMaria: 0.0005
sandranaranjo2: 0.0005
CarlosEForeroP: 0.0005
RevistaSemana: 0.0005
Raultheralphy: 0.0005
BluRadioCo: 0.0005
NoticiasCaracol: 0.0005
AngelicaLozanoC: 0.0004
PartidoVerdeCoL: 0.0

In [26]:
centrality = nx.degree_centrality(G)
top100_users_dc = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:100]
print("Top usuarios influyentes:", top100_users_dc)

Top usuarios influyentes: [('JDOviedoAr', 0.8481375358166189), ('juanmanuelgalan', 0.5583299222267704), ('lunadavid', 0.5423659435120753), ('EnriquePenalosa', 0.5251739664347114), ('mluciaramirez', 0.4838313548915268), ('MauricioGomezCO', 0.48260335652885794), ('ConTodaPorCol', 0.4760540319279574), ('petrogustavo', 0.4404420794105608), ('MauricioCard', 0.42570609905853457), ('mclacouture', 0.36348751534997953), ('pipecordoba', 0.17519443307408925), ('UltimaHoraCR', 0.1641424478100696), ('PalomaValenciaL', 0.07613589848546869), ('VickyDavilaH', 0.054441260744985676), ('EfrainCepeda', 0.04666393778141629), ('FenalcoNacional', 0.03643061809250921), ('consejodeestado', 0.02742529676627098), ('Hora20', 0.024559967253376995), ('mauricio_hdez', 0.022103970528039297), ('MejorJuntos_col', 0.020057306590257878), ('sergio_fajardo', 0.019647973802701595), ('PaolaHolguin', 0.01882930822758903), ('ABDELAESPRIELLA', 0.017191977077363897), ('nesandes', 0.014735980352026197), ('BunkerGlo', 0.0143266475

In [27]:
user_sentiments = {}

G = nx.DiGraph()

for _, row in df.iterrows():
    author = row['username']
    sentiment = row['sentiment_llm']
    if author not in user_sentiments:
        user_sentiments[author] = []
    user_sentiments[author].append(sentiment)
    
    G.add_node(author)

    mentions = row['entities'].get("mentions", [])
    for m in mentions:
        mentioned_user = m["username"]
        if mentioned_user not in user_sentiments:
            user_sentiments[mentioned_user] = []
        user_sentiments[mentioned_user].append(sentiment)
        G.add_node(mentioned_user)
        G.add_edge(author, mentioned_user, type="mention")

    if row["text"].startswith("RT @"):
        rt_user = row["text"].split("RT @")[1].split(":")[0]
        G.add_node(rt_user)
        G.add_edge(author, rt_user, type="retweet")

In [28]:
color_map = {"POSITIVO": "green", "NEGATIVO": "red", "NEUTRO": "gray"}

In [29]:
def majority_sentiment(sent_list):
    if not sent_list:
        return "NEUTRO"
    counts = Counter(sent_list)
    return counts.most_common(1)[0][0]

In [30]:
pagerank = nx.pagerank(G, alpha=0.85)
net = Network(height="100vh", width="100vw", bgcolor="#222222", font_color="white")
net.force_atlas_2based()

for node in G.nodes():
    if node == "JDOviedoAr":
        pr = pagerank[node] * 200
        net.add_node(node, label=node, size=pr*1.5, color="blue", shape="square")
    else:
        pr = pagerank[node] * 5000
        color = color_map[majority_sentiment(user_sentiments[node])]
        node_tweets = list(df[df['username'] == node]['text'])
        if len(node_tweets) > 0:
            tooltip = f"**{node}**\n"
            for item in node_tweets:
                tooltip += f"{item}\n\n"
            
            tooltip_html = f"""<div style="max-height:300px; max-width:300px; overflow:auto; text-align:left;"><br>{tooltip}</div>"""
            net.add_node(node, label=node, title=tooltip, size=pr, color=color)
        else:
            net.add_node(node, label=node, size=pr, color=color)

for source, target in G.edges():
    net.add_edge(source, target)

net.write_html(f"../data/{set_search}/JDOviedoAr_pagerank_sentiment_tweets.html")

# User profile

In [31]:
df.head()

Unnamed: 0,entities,author_id,id,text,created_at,public_metrics,edit_history_tweet_ids,username,name,sentiment_llm
0,"{'annotations': [{'start': 265, 'end': 272, 'p...",177823502,1973030623792296212,El RIDICULO que hicieron estos señores de la u...,2025-09-30 14:22:12+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973030623792296212],mikeariza,Miguel Ariza Castro,NEGATIVO
1,"{'mentions': [{'start': 0, 'end': 15, 'usernam...",3145461383,1973029119152787676,@lavozdelojusto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:16:13+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973029119152787676],08Rosareda,Andrea 🦕 🌌💋🐳🐙,NEUTRO
2,"{'annotations': [{'start': 76, 'end': 79, 'pro...",3145461383,1973028989192314940,@Palabrejo @JDOviedoAr @ConTodaPorCol @MejorJu...,2025-09-30 14:15:42+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028989192314940],08Rosareda,Andrea 🦕 🌌💋🐳🐙,NEUTRO
3,"{'annotations': [{'start': 63, 'end': 71, 'pro...",3145461383,1973028873748238623,@527ElduroJadin @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:15:15+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028873748238623],08Rosareda,Andrea 🦕 🌌💋🐳🐙,NEUTRO
4,"{'annotations': [{'start': 60, 'end': 64, 'pro...",3145461383,1973028724628238409,@sircamilopinto @JDOviedoAr @ConTodaPorCol @Me...,2025-09-30 14:14:39+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[1973028724628238409],08Rosareda,Andrea 🦕 🌌💋🐳🐙,NEUTRO


In [32]:
all_users = list(df['username'].unique())
print(f'There are {len(all_users)} users')

There are 2072 users


In [33]:
chunks = [all_users[i:i+100] for i in range(0, len(all_users), 100)]

In [34]:
for idx, c in enumerate(chunks):
    print(f"Slice {idx+1} {len(c)}")

Slice 1 100
Slice 2 100
Slice 3 100
Slice 4 100
Slice 5 100
Slice 6 100
Slice 7 100
Slice 8 100
Slice 9 100
Slice 10 100
Slice 11 100
Slice 12 100
Slice 13 100
Slice 14 100
Slice 15 100
Slice 16 100
Slice 17 100
Slice 18 100
Slice 19 100
Slice 20 100
Slice 21 72


In [35]:
def get_account_info(accounts):
    url = "https://api.x.com/2/users/by"
    headers = {"Authorization": f"Bearer {BEARER_TOKEN}"}
    params = {
        "usernames": ','.join(accounts),
        "user.fields": "created_at,description,location,public_metrics,entities,affiliation,profile_banner_url"
    }
    
    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        return response.json()['data']
    return None

In [36]:
users_desc = {
    'username': [],
    'account_info': []
}

for idx, c in enumerate(chunks):
    account_data = get_account_info(c)
    if account_data:
        for user, accdata in zip(c, account_data):
            users_desc['username'].append(user)
            users_desc['account_info'].append(accdata)

In [37]:
df_accounts = pd.DataFrame(users_desc)
df_accounts

Unnamed: 0,username,account_info
0,mikeariza,"{'id': '177823502', 'profile_banner_url': 'htt..."
1,08Rosareda,"{'id': '3145461383', 'profile_banner_url': 'ht..."
2,cpino527,"{'id': '520652537', 'profile_banner_url': 'htt..."
3,Boyacaindt,"{'id': '2883116211', 'public_metrics': {'follo..."
4,SlumdogUnderdog,"{'id': '1152411349', 'profile_banner_url': 'ht..."
...,...,...
2067,umabdalaziz3,"{'id': '1597223287', 'profile_banner_url': 'ht..."
2068,maha_875,"{'id': '1488698923', 'profile_banner_url': 'ht..."
2069,Pepiinno,"{'id': '1378620511', 'public_metrics': {'follo..."
2070,JeissonRayoR,"{'id': '1638556330976747520', 'location': 'Bog..."


In [38]:
system_prompt = """
Eres un asistente especializado en convertir datos extraidos de una cuenta de usuario proporcionado en formato JSON a lenguaje natural.
El objetivo es poder segmentar el usuario para una campaña política, resaltando también su alcance en redes sociales.
Tener en cuenta que Twitter ahora es X.
Utiliza únicamente la información proporcionada.
Genera el texto en formato HTML.
""".strip()

In [39]:
# llm_prof = ChatOpenAI(
#     api_key="EMPTY",
#     model="Qwen/Qwen2.5-32B-Instruct-AWQ",
#     base_url="http://10.20.21.137:11118/v1",
#     temperature=0
# )


llm_prof = ChatOpenAI(
    api_key="EMPTY",
    model="llama4:scout",
    base_url="http://10.20.21.137:8406/v1",
    temperature=0
)

In [40]:
def get_description(json_info):
    messages = [
        (
            "system",
            system_prompt,
        ),
        ("human", json.dumps(json_info, indent=2))
    ]
    ai_msg = llm_prof.invoke(messages)
    return ai_msg.content

In [41]:
get_description(df_accounts.loc[0, 'account_info'])

'<h1>Perfil de Usuario: Miguel Ariza Castro</h1>\n\n<p><strong>Información Básica:</strong></p>\n<ul>\n  <li><strong>Nombre:</strong> Miguel Ariza Castro</li>\n  <li><strong>Usuario:</strong> @mikeariza</li>\n  <li><strong>Ubicación:</strong> Colombia</li>\n  <li><strong>Fecha de Creación de la Cuenta:</strong> 13 de agosto de 2010</li>\n</ul>\n\n<p><strong>Métricas Públicas:</strong></p>\n<ul>\n  <li><strong>Seguidores:</strong> 453</li>\n  <li><strong>Siguiendo:</strong> 154 cuentas</li>\n  <li><strong>Tweets Publicados:</strong> 59,251</li>\n  <li><strong>Listas en las que está incluido:</strong> 2</li>\n  <li><strong>Me gusta dados:</strong> 75,809</li>\n  <li><strong>Medios publicados:</strong> 11,984</li>\n</ul>\n\n<p><strong>Alcance en Redes Sociales:</strong></p>\n<p>Con 453 seguidores y una actividad considerable en la plataforma (59,251 tweets y 11,984 medios publicados), el usuario @mikeariza tiene un alcance moderado en X (anteriormente Twitter). Su influencia puede verse r

In [42]:
df_accounts.head()['account_info'].progress_apply(lambda json_info: get_description(json_info))

100%|█████████████████████████████████████████████| 5/5 [00:30<00:00,  6.11s/it]


0    <h1>Perfil de Usuario: Miguel Ariza Castro</h1...
1    <h1>Perfil de usuario en X (anteriormente Twit...
2    <h1>Perfil de usuario en X (anteriormente Twit...
3    <h1>Perfil de usuario en X (anteriormente Twit...
4    <h1>Perfil de Usuario: Edwin Delgado Riaño</h1...
Name: account_info, dtype: object

In [43]:
df_accounts['profile'] = df_accounts['account_info'].progress_apply(lambda json_info: get_description(json_info))
df_accounts

100%|█████████████████████████████████████| 2072/2072 [3:25:39<00:00,  5.96s/it]


Unnamed: 0,username,account_info,profile
0,mikeariza,"{'id': '177823502', 'profile_banner_url': 'htt...",<h1>Perfil de Usuario: Miguel Ariza Castro</h1...
1,08Rosareda,"{'id': '3145461383', 'profile_banner_url': 'ht...",<h1>Segmentación de usuario para campaña polít...
2,cpino527,"{'id': '520652537', 'profile_banner_url': 'htt...",<h1>Perfil de usuario en X (anteriormente Twit...
3,Boyacaindt,"{'id': '2883116211', 'public_metrics': {'follo...",<h1>Perfil de usuario en X (anteriormente Twit...
4,SlumdogUnderdog,"{'id': '1152411349', 'profile_banner_url': 'ht...",<h1>Perfil de Usuario: Edwin Delgado Riaño</h1...
...,...,...,...
2067,umabdalaziz3,"{'id': '1597223287', 'profile_banner_url': 'ht...",<h1>Perfil de Usuario en X (anteriormente Twit...
2068,maha_875,"{'id': '1488698923', 'profile_banner_url': 'ht...",<h1>Perfil de Usuario: lina maría</h1>\n\n<p><...
2069,Pepiinno,"{'id': '1378620511', 'public_metrics': {'follo...",<h1>Perfil de Usuario: Pepiinno</h1>\n\n<p><st...
2070,JeissonRayoR,"{'id': '1638556330976747520', 'location': 'Bog...",<h1>Perfil de Usuario en X (Twitter)</h1>\n\n<...


In [44]:
df_accounts.to_csv(f'../data/{set_search}/df_accounts_profile.csv', index=False)

# With user profile

In [45]:
pagerank = nx.pagerank(G, alpha=0.85)
net = Network(height="100vh", width="100vw", bgcolor="#222222", font_color="white")
net.force_atlas_2based()

for node in G.nodes():
    if node == "JDOviedoAr":
        pr = pagerank[node] * 200
        net.add_node(node, label=node, size=pr*1.5, color="blue", shape="square")
    else:
        pr = pagerank[node] * 5000
        color = color_map[majority_sentiment(user_sentiments[node])]
        if len(df_accounts[df_accounts['username'] == node]) > 0:
            tooltip = df_accounts[df_accounts['username'] == node]['profile'].iloc[0]
            tooltip_html = f"""
            <div style="max-height:300px; max-width:500px; overflow:auto; text-align:left;">
            {tooltip}
            </div>
            """
            net.add_node(node, label=node, title=tooltip_html, size=pr, color=color)
        else:
            net.add_node(node, label=node, size=pr, color=color)

for source, target in G.edges():
    net.add_edge(source, target)

net.write_html(f"../data/{set_search}/JDOviedoAr_pagerank_sentiment_profile.html")