In [None]:
# Importacion de librerias:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

In [None]:
# Remove the installed NumPy (2.x according to the original note) and pin
# NumPy 1.24.4, which the original author notes is compatible with fasttext.
!pip uninstall -y numpy
!pip install numpy==1.24.4

# Kill the current kernel process (signal 9) so the environment restarts and
# picks up the reinstalled NumPy. Everything executed before this cell is lost
# with the process and has to be re-run afterwards.
import os
os.kill(os.getpid(), 9)


Found existing installation: numpy 1.24.4
Uninstalling numpy-1.24.4:
  Successfully uninstalled numpy-1.24.4
Collecting numpy==1.24.4
  Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pymc 5.23.0 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.4 which is incompatible.
xarray-einstats 0.9.0 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
treescop

# **Notes Data**

### Importación local

In [None]:
# Mount Google Drive so that files under /content/drive are reachable
from google.colab import drive
drive.mount('/content/drive')

# Path of the TSV inside Drive (adjust if it lives in another subfolder)
file_path = '/content/drive/MyDrive/TFM/notes-00000.tsv'

# Read the TSV with tolerant parser settings
import pandas as pd

notesdata_df = pd.read_csv(
    file_path,
    sep='\t',             # tab-separated values
    engine='python',      # Python parsing engine (more tolerant, per the original note)
    quoting=3,            # 3 == csv.QUOTE_NONE: quote characters are kept as plain text
    encoding='utf-8',     # switch to 'latin1' if decoding fails
    on_bad_lines='skip'   # malformed lines are dropped without raising (pandas >= 1.3)
)

# Confirm the load by reporting the resulting number of rows and columns
print(f"Archivo cargado correctamente: {notesdata_df.shape[0]} filas, {notesdata_df.shape[1]} columnas")


Mounted at /content/drive
Archivo cargado correctamente: 1858582 filas, 23 columnas


In [None]:
# Count the physical lines of the raw TSV. read_csv consumes the first line as
# the header, so this figure can be compared with the loaded row count + 1.
!wc -l /content/drive/MyDrive/TFM/notes-00000.tsv


1858583 /content/drive/MyDrive/TFM/notes-00000.tsv


Nulos:

In [None]:
# (rows, columns) of the loaded notes frame
notesdata_df.shape

(1858582, 23)

In [None]:
# Number of missing values in each column
notesdata_df.isna().sum()

Detección de idioma

In [None]:
# Install fasttext (to be run after the kernel restart, per the original note)
!pip install fasttext

# Download the fastText language-identification model file (lid.176.ftz)
!wget -q https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz

# Load the model; `model` is the global that detect_language_fasttext calls
import fasttext
model = fasttext.load_model("lid.176.ftz")


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313510 sha256=1c4f231a5ae875b61dbefc815577f877214f82ed36af9f6ed726213346507912
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [None]:
def clean_text(text):
    """Normalise whitespace in ``text``.

    Leading and trailing whitespace is removed and every internal run of
    whitespace is collapsed to a single space. Any value that is not a
    ``str`` (e.g. ``None`` or a float NaN) yields the empty string.
    """
    # str.split() with no separator already ignores leading/trailing
    # whitespace and treats any whitespace run as one delimiter.
    return ' '.join(text.split()) if isinstance(text, str) else ''

def truncate(text, max_len=1000):
    """Return the leading part of ``text``, at most ``max_len`` items long.

    Used to cap very long texts before they are handed to the language
    detector. Plain slice semantics apply to ``max_len``.
    """
    head = slice(None, max_len)
    return text[head]

def detect_language_fasttext(text):
    """Return the language label fastText predicts for ``text``.

    The input is normalised with ``clean_text`` and capped with ``truncate``
    before being passed to the module-level ``model``.

    Returns:
        The first predicted label with the ``__label__`` prefix removed,
        ``'unknown'`` when nothing is left after cleaning, or ``'error'``
        when any exception is raised along the way (best-effort: the
        exception itself is not reported).
    """
    try:
        snippet = truncate(clean_text(text))
        if not snippet:
            return 'unknown'
        labels = model.predict(snippet)[0]
        return labels[0].replace("__label__", "")
    except Exception:
        return 'error'


In [None]:
# Sanity check: print the first summary and the language detected for it
print(notesdata_df['summary'].iloc[0])
print(detect_language_fasttext(notesdata_df['summary'].iloc[0]))


The House failed to pass a border protection law that was designed to go along with these other bills.  It was intentionally set with a higher 2/3rds threshold requirement and did not pass even though it had a majority.       https://sourcenm.com/2024/04/22/u-s-house-votes-down-border-bill-favored-by-conservatives/    
en


In [None]:
# Run the detector on every summary and store the label in a new column
notesdata_df['summary_language'] = notesdata_df['summary'].apply(detect_language_fasttext)


In [None]:
# Number of notes per detected language label
notesdata_df['summary_language'].value_counts()

Unnamed: 0_level_0,count
summary_language,Unnamed: 1_level_1
en,1202354
es,174464
ja,145282
pt,96402
fr,89774
...,...
jv,1
pms,1
mt,1
war,1


## DATASET EN ESPAÑOL:

In [None]:
# Keep only the rows whose summary was labelled 'es' (Spanish).
# NOTE(review): this is a boolean-mask selection of notesdata_df; take a
# .copy() if the subset is going to be modified later.
notesdataspanish_df=notesdata_df[notesdata_df['summary_language']=='es']

In [None]:
# (rows, columns) of the Spanish subset
notesdataspanish_df.shape

(174464, 24)

In [None]:
# Summary statistics of the numeric columns of the Spanish subset
notesdataspanish_df.describe()

Unnamed: 0,noteId,createdAtMillis,tweetId,misleadingOther,misleadingFactualError,misleadingManipulatedMedia,misleadingOutdatedInformation,misleadingMissingImportantContext,misleadingUnverifiedClaimAsFact,misleadingSatire,notMisleadingOther,notMisleadingFactuallyCorrect,notMisleadingOutdatedButNotWhenWritten,notMisleadingClearlySatire,notMisleadingPersonalOpinion,trustworthySources,isMediaNote
count,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0,174464.0
mean,1.79982e+18,1717946000000.0,1.798517e+18,0.089084,0.439288,0.084275,0.256872,0.476517,0.329203,0.063595,0.033136,0.118517,0.009922,0.03583,0.068209,0.83502,0.018061
std,6.713553e+16,16006360000.0,7.065833e+16,0.284866,0.496302,0.277801,0.43691,0.49945,0.469925,0.24403,0.178992,0.323221,0.099113,0.185866,0.252105,0.371163,0.133173
min,1.35485e+18,1611856000000.0,6.251483e+16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.745628e+18,1705025000000.0,1.744895e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,1.803785e+18,1718891000000.0,1.803039e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.856241e+18,1731397000000.0,1.855313e+18,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1.916296e+18,1745716000000.0,1.916282e+18,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Display the `summary` column (the note text) of the Spanish subset
notesdataspanish_df.summary

Unnamed: 0,summary
237,Es falso que Begoña Gómez esté procesada. Se i...
241,El vídeo del departamento corresponde al sismo...
318,"El autor pide rectificación, sin embargo están..."
319,"NNN, es un clásico clickbait que se revela al ..."
320,No es un verso. Los fuegos artificiales afecta...
...,...
1858403,El usuario quiere promocionar su curso sin pag...
1858404,Según documentos desclasificados de la CIA (NO...
1858405,La justicia penal entendió que los médicos int...
1858406,La justicia penal entendió que los médicos int...


In [None]:
# Mount Google Drive (same mount point used when loading the raw file)
from google.colab import drive
drive.mount('/content/drive')

# Destination path in Drive for the exported file
output_path = '/content/drive/MyDrive/TFM/notesdataspanish_.csv'

# Write the Spanish subset as CSV, leaving out the index column
notesdataspanish_df.to_csv(output_path, index=False)

print(f"Archivo guardado en: {output_path}")

# **Web Scraping**

> **TODO:** ¿Al final sirvió de algo? Borrar esta sección o mencionarla en el informe como un intento que no ha funcionado.

In [None]:
# Collect the tweetId of every row of the Spanish subset as a Python list
tweet_ids = notesdataspanish_df['tweetId'].tolist()
print(f"Total de tweets a analizar: {len(tweet_ids)}")

In [None]:
# Print the first 10 tweet IDs
print(tweet_ids[:10])

In [None]:

# Build one status URL per tweet ID
start_urls = [f"https://twitter.com/i/web/status/{tweet_id}" for tweet_id in tweet_ids]

# Save the list as JSON for Apify, under the "startUrls" key
import json
with open("apify_input.json", "w") as file:
    json.dump({"startUrls": start_urls}, file)


In [None]:
# Print the first 10 generated URLs
print(start_urls[:10])

In [None]:
# Cada tweetId en tu dataset representa un tweet específico en Twitter/X. Para ver el tweet en el navegador, usa una URL con la forma https://x.com/i/web/status/<tweetId>

In [None]:
print(f"Número de URLs en la lista: {len(start_urls)}") # len() gives the number of elements in the list

In [None]:
# Build and print the browser URL for one example tweet
tweet_id = notesdata_df['tweetId'].iloc[0]  # first tweetId of the full (unfiltered) frame
tweet_url = f"https://x.com/i/web/status/{tweet_id}"
print("Accede al tweet aquí:", tweet_url)
print("Tweet ID:", tweet_id)


In [None]:
# Disable SSL certificate verification (the original note says this is needed in Colab).
# NOTE(review): this replaces the process-wide default HTTPS context factory with
# the unverified one, so it stays in effect for the rest of the session — confirm
# it is really required before keeping it.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# Import snscrape.
# NOTE(review): the `!pip install snscrape` line lives in a later cell, so on a
# fresh kernel this import runs before that install (unless the package is
# already present).
import snscrape.modules.twitter as sntwitter

# ID of the tweet to scrape
tweet_id = '1783159712986382830'

# Fetch the first item yielded by the scraper and print its content;
# any exception is reported on stdout instead of stopping the cell.
try:
    tweet = next(sntwitter.TwitterTweetScraper(tweet_id).get_items())
    print(tweet.content)
except Exception as e:
    print("❌ Error al obtener el tweet:", e)


In [None]:
!pip install snscrape

import snscrape.modules.twitter as sntwitter
import pandas as pd

# Pega aquí tu lista de URLs
start_urls
# Extraer tweet ID desde URL
def extract_id(url):
    return url.strip().split("/")[-1]

# Guardar textos
tweets_data = []
for url in start_urls:
    tweet_id = extract_id(url)
    try:
        tweet = next(sntwitter.TwitterTweetScraper(tweet_id).get_items())
        tweets_data.append({
            "url": url,
            "text": tweet.content
        })
    except Exception as e:
        tweets_data.append({
            "url": url,
            "text": f"[ERROR] {str(e)}"
        })

# Exportar a JSON y CSV
df = pd.DataFrame(tweets_data)
df.to_json("tweets_output.json", orient="records", force_ascii=False)
df.to_csv("tweets_output.csv", index=False, encoding="utf-8")

print("✅ Tweets exportados a 'tweets_output.json' y 'tweets_output.csv'")
