In [63]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# Download the NLTK "stopwords" corpus, which extract_keywords reads via stopwords.words("english").
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joaop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [64]:
# Funções

def get_arxiv_data(query, max_results=250, sort_by="relevance"):
    """
    Fetch articles from the arXiv API that match a search query.

    Results are requested in pages of up to 1000 entries, pausing between
    consecutive page requests. Paging stops early, returning whatever has been
    collected so far, when a request fails, the server answers with a non-200
    status, or the response body is not well-formed XML.

    Args:
        query (str): Keyword or search expression.
        max_results (int): Maximum number of results to return.
        sort_by (str): Sort criterion ('relevance', 'submittedDate', 'lastUpdatedDate').

    Returns:
        DataFrame: One row per article with the columns Title, Summary,
        Authors (comma-separated names), Link, Published and Updated. The
        columns are present even when no article was collected.
    """
    import time  # local import: only needed to pause between page requests

    base_url = "http://export.arxiv.org/api/query"
    atom_ns = "{http://www.w3.org/2005/Atom}"
    columns = ["Title", "Summary", "Authors", "Link", "Published", "Updated"]
    batch_size = 1000              # entries requested per page
    request_timeout = 30           # seconds; prevents a stalled connection from hanging the cell
    delay_between_requests = 3     # seconds; arXiv asks clients to space out successive calls
    results = []

    def _field(element, tag):
        # Text of the Atom child `tag`, or "" when the child is missing or empty.
        node = element.find(atom_ns + tag)
        if node is None or node.text is None:
            return ""
        return node.text.strip()

    for start in range(0, max_results, batch_size):
        if start > 0:
            time.sleep(delay_between_requests)

        params = {
            "search_query": query,
            "start": start,
            "max_results": min(max_results - start, batch_size),
            "sortBy": sort_by,
        }

        # Network failures are reported the same way as HTTP errors: print and stop paging.
        try:
            response = requests.get(base_url, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Erro na API: {exc}")
            break

        if response.status_code != 200:
            print(f"Erro na API: {response.status_code}")
            break

        # Parse the raw bytes so the XML parser applies the encoding declared in
        # the document instead of whatever encoding requests guessed for .text.
        try:
            root = ET.fromstring(response.content)
        except ET.ParseError as exc:
            print(f"Erro na API: {exc}")
            break

        for entry in root.findall(atom_ns + "entry"):
            author_names = [
                _field(author, "name")
                for author in entry.findall(atom_ns + "author")
            ]
            results.append({
                "Title": _field(entry, "title"),
                "Summary": _field(entry, "summary"),
                "Authors": ", ".join(name for name in author_names if name),
                "Link": _field(entry, "id"),
                "Published": _field(entry, "published"),
                "Updated": _field(entry, "updated"),
            })

    # Passing the column list keeps the schema stable when nothing was collected.
    return pd.DataFrame(results, columns=columns)


def extract_keywords(df, top_n=10):
    """
    Build three DataFrames with the most frequent unigrams, bigrams and trigrams.

    Every title and every summary is vectorized as its own document and the
    counts are summed afterwards, so an n-gram is never formed from the end of
    one text and the beginning of the next. English stop words (NLTK list) are
    removed by the vectorizer.

    Args:
        df (DataFrame): Articles, with "Title" and "Summary" columns.
        top_n (int): Number of most frequent keywords to return for each n-gram size.

    Returns:
        tuple: (unigrams_df, bigrams_df, trigrams_df), each with the columns
        "Keyword" and "Count", sorted by descending count. A frame is empty
        when no n-gram of that size can be extracted.
    """
    import numpy as np  # local import: used only to flatten the summed count matrix

    def _empty_frame():
        # Result used when there is nothing to count.
        return pd.DataFrame(columns=["Keyword", "Count"])

    # One document per title and per summary; missing or blank texts are skipped.
    documents = [
        text
        for column in ("Title", "Summary")
        for text in df[column].dropna().astype(str)
        if text.strip()
    ]
    if not documents:
        return _empty_frame(), _empty_frame(), _empty_frame()

    stop_words = list(stopwords.words("english"))

    def _top_ngrams(n):
        # Count n-grams of exactly size n across all documents and keep the top_n.
        vectorizer = CountVectorizer(ngram_range=(n, n), stop_words=stop_words)
        try:
            counts = vectorizer.fit_transform(documents)
        except ValueError:
            # CountVectorizer raises ValueError when the vocabulary ends up empty
            # (e.g. only stop words, or texts shorter than n tokens).
            return _empty_frame()
        # Sum per-document counts column-wise without densifying the whole matrix.
        totals = np.asarray(counts.sum(axis=0)).ravel()
        table = pd.DataFrame({
            "Keyword": vectorizer.get_feature_names_out(),
            "Count": totals,
        })
        return table.sort_values(by="Count", ascending=False).head(top_n)

    return _top_ngrams(1), _top_ngrams(2), _top_ngrams(3)


def filter_articles_by_keywords(df, keywords):
    """
    Keep only the articles whose title or summary contains at least one keyword.

    Matching is a case-insensitive substring test. The input DataFrame is not
    modified; the returned frame is an independent copy.

    Args:
        df (DataFrame): Articles, with "Title" and "Summary" columns.
        keywords (list): Relevant keywords (any casing). A single string is
            treated as one keyword.

    Returns:
        DataFrame: The relevant articles only, with an extra "Is_Relevant"
        column (always True) kept for compatibility with earlier behaviour.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = [keywords]
    lowered_keywords = [str(keyword).lower() for keyword in keywords]

    # Missing titles/summaries are treated as empty text instead of raising.
    searchable_text = (
        df["Title"].fillna("").astype(str) + " " + df["Summary"].fillna("").astype(str)
    ).str.lower()

    # astype(bool) keeps the mask boolean even when the frame has no rows.
    relevant_mask = searchable_text.apply(
        lambda text: any(keyword in text for keyword in lowered_keywords)
    ).astype(bool)

    relevant_articles = df.loc[relevant_mask].copy()
    relevant_articles["Is_Relevant"] = True
    return relevant_articles


def analyze_publication_trends(df):
    """
    Count publications per year.

    The input DataFrame is not modified. Dates that cannot be parsed are
    ignored rather than raising.

    Args:
        df (DataFrame): Articles, with a "Published" column of date strings.

    Returns:
        DataFrame: Columns "Published_Year" and "Publication_Count", one row
        per year, sorted by ascending year.
    """
    # utc=True gives a single timezone-aware dtype even if offsets are mixed;
    # errors="coerce" turns unparseable values into NaT so they can be dropped.
    published = pd.to_datetime(df["Published"], errors="coerce", utc=True)
    years = published.dt.year.dropna().astype(int)

    trends = (
        years.value_counts()
        .sort_index()
        .rename_axis("Published_Year")
        .reset_index(name="Publication_Count")
    )
    return trends


def top_authors(df, top_n=10):
    """
    Rank authors by how many articles they appear on.

    Args:
        df (DataFrame): Articles, with an "Authors" column holding names joined by ", ".
        top_n (int): How many authors to list.

    Returns:
        DataFrame: Columns "Author" and "Publication_Count", most frequent first.
    """
    # Split each author string into a list, then give every name its own row.
    names_per_article = df["Authors"].str.split(", ")
    tally = names_per_article.explode().value_counts()

    # value_counts is already sorted by descending frequency.
    leaders = tally.iloc[:top_n].reset_index()
    leaders.columns = ["Author", "Publication_Count"]
    return leaders


def cluster_articles(df, n_clusters=5):
    """
    Assign each article to a thematic cluster based on its summary.

    Summaries are turned into TF-IDF vectors (English stop words removed) and
    grouped with K-Means using a fixed random seed. The labels are written to
    a "Cluster" column on the DataFrame that was passed in.

    Args:
        df (DataFrame): Articles, with a "Summary" column.
        n_clusters (int): Number of clusters.

    Returns:
        DataFrame: The same DataFrame, now carrying the "Cluster" column.
    """
    summary_features = TfidfVectorizer(stop_words="english").fit_transform(df["Summary"])
    model = KMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(summary_features)
    df["Cluster"] = labels
    return df


def find_similar_articles_with_url(df, article_url, top_n=5):
    """
    Find the articles most similar to a given article, identified by its URL.

    Similarity is the cosine similarity between TF-IDF vectors of the
    summaries (English stop words removed). The query article itself is
    always excluded from the result.

    Args:
        df (DataFrame): Articles, with "Title", "Summary" and "Link" columns.
            Any index is accepted (e.g. a filtered frame).
        article_url (str): URL of the article to find neighbours for.
        top_n (int): Number of similar articles to return.

    Returns:
        DataFrame: Columns "Title", "Similarity_Score" and "Link", most
        similar first.

    Raises:
        ValueError: If article_url is not present in the "Link" column.
    """
    # Locate the article by row POSITION: the TF-IDF matrix is positional, so
    # an index label would select the wrong row on a non-default index.
    matching_positions = (df["Link"] == article_url).to_numpy().nonzero()[0]
    if len(matching_positions) == 0:
        raise ValueError("A URL fornecida não está no conjunto de dados.")
    article_position = int(matching_positions[0])

    # TF-IDF vectorization of the summaries (missing summaries count as empty text).
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(df["Summary"].fillna(""))

    # Similarity of the selected article against every article.
    similarity_scores = cosine_similarity(
        tfidf_matrix[article_position], tfidf_matrix
    ).flatten()

    # Rank by descending score and drop the query article explicitly, instead
    # of assuming it is the single highest-scoring row (duplicate summaries tie with it).
    ranked_positions = similarity_scores.argsort(kind="stable")[::-1]
    ranked_positions = ranked_positions[ranked_positions != article_position]
    similar_positions = ranked_positions[:max(top_n, 0)]

    similar_articles = df.iloc[similar_positions].copy()
    similar_articles["Similarity_Score"] = similarity_scores[similar_positions]
    return similar_articles[["Title", "Similarity_Score", "Link"]]


In [70]:
# Search configuration passed to get_arxiv_data below.
query = "reinforcement learning"  # Search keyword
max_results = 5000  # Maximum number of articles

# Fetch the articles from the arXiv API (network call), then compute the
# 50 most frequent unigrams, bigrams and trigrams over titles and summaries.
articles = get_arxiv_data(query, max_results, sort_by='relevance')
unigrams, bigrams, trigrams = extract_keywords(articles, top_n=50)

In [72]:
# Preview the first rows of the fetched articles.
articles.head()

Unnamed: 0,Title,Summary,Authors,Link,Published,Updated
0,Some Insights into Lifelong Reinforcement Lear...,A lifelong reinforcement learning system is a ...,Changjian Li,http://arxiv.org/abs/2001.09608v1,2020-01-27T07:26:12Z,2020-01-27T07:26:12Z
1,Counterexample-Guided Repair of Reinforcement ...,Naively trained Deep Reinforcement Learning ag...,"David Boetius, Stefan Leue",http://arxiv.org/abs/2405.15430v1,2024-05-24T10:56:51Z,2024-05-24T10:56:51Z
2,Deep Reinforcement Learning in Computer Vision...,Deep reinforcement learning augments the reinf...,"Ngan Le, Vidhiwar Singh Rathour, Kashu Yamazak...",http://arxiv.org/abs/2108.11510v1,2021-08-25T23:01:48Z,2021-08-25T23:01:48Z
3,Causal Reinforcement Learning: A Survey,Reinforcement learning is an essential paradig...,"Zhihong Deng, Jing Jiang, Guodong Long, Chengq...",http://arxiv.org/abs/2307.01452v2,2023-07-04T03:00:43Z,2023-11-21T03:43:15Z
4,Distributed Deep Reinforcement Learning: A Sur...,"With the breakthrough of AlphaGo, deep reinfor...","Qiyue Yin, Tongtong Yu, Shengqi Shen, Jun Yang...",http://arxiv.org/abs/2212.00253v1,2022-12-01T03:39:24Z,2022-12-01T03:39:24Z


In [74]:
# Look up the 10 articles whose summaries are most similar to the chosen article
# (the URL must be present in the "Link" column of `articles`, otherwise ValueError is raised).
article_url = "http://arxiv.org/abs/2405.15430v1" 
similar_articles_with_urls = find_similar_articles_with_url(articles, article_url, top_n=10)
similar_articles_with_urls

Unnamed: 0,Title,Similarity_Score,Link
1283,Deep Reinforcement Learning with Enhanced Safe...,0.221889,http://arxiv.org/abs/1910.12905v2
935,Context-Aware Safe Reinforcement Learning for ...,0.2166,http://arxiv.org/abs/2101.00531v1
2541,A Safety Modulator Actor-Critic Method in Mode...,0.216551,http://arxiv.org/abs/2410.06847v1
2880,SCPO: Safe Reinforcement Learning with Safety ...,0.214891,http://arxiv.org/abs/2311.00880v1
534,Safe Reinforcement Learning by Imagining the N...,0.197567,http://arxiv.org/abs/2202.07789v1
592,Learning-based Model Predictive Control for Sa...,0.196762,http://arxiv.org/abs/1906.12189v1
1168,Improving Safety in Deep Reinforcement Learnin...,0.192122,http://arxiv.org/abs/2109.14325v1
1250,Weakly Supervised Reinforcement Learning for A...,0.187187,http://arxiv.org/abs/2103.09726v1
1501,Verifiably Safe Off-Model Reinforcement Learning,0.180749,http://arxiv.org/abs/1902.05632v1
3555,MAMPS: Safe Multi-Agent Reinforcement Learning...,0.177638,http://arxiv.org/abs/1910.12639v2
