In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from langchain_ollama import OllamaLLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joaop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Funções

def get_arxiv_data(query, max_results=250, sort_by="relevance"):
    """
    Obtém artigos da arXiv API com base numa query.
    
    Args:
        query (str): Palavra-chave ou expressão de pesquisa.
        max_results (int): Número máximo de resultados a retornar
        sort_by (str): Critério de ordenação ('relevance', 'submittedDate', 'lastUpdatedDate').
    
    Returns:
        DataFrame: Dados dos artigos recolhidos.
    """
    base_url = "http://export.arxiv.org/api/query"
    results = []

    # Dividir as pesquisas em lotes de 1000 (limitação da arXiv API)
    for start in range(0, max_results, 1000):
        params = {
            "search_query": query,
            "start": start,
            "max_results": min(max_results - start, 1000),
            "sortBy": sort_by,  # Critério de ordenação
        }

        # Requisição à arXiv API
        response = requests.get(base_url, params=params)

        if response.status_code == 200:
            # Parsear o XML retornado
            root = ET.fromstring(response.text)
            for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
                # Extrair campos importantes do artigo
                title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
                summary = entry.find("{http://www.w3.org/2005/Atom}summary").text.strip()
                link = entry.find("{http://www.w3.org/2005/Atom}id").text.strip()
                published = entry.find("{http://www.w3.org/2005/Atom}published").text.strip()
                updated = entry.find("{http://www.w3.org/2005/Atom}updated").text.strip()
                authors = [
                    author.find("{http://www.w3.org/2005/Atom}name").text.strip()
                    for author in entry.findall("{http://www.w3.org/2005/Atom}author")
                ]

                primary_category = entry.find("{http://arxiv.org/schemas/atom}primary_category")
                journal_ref = entry.find("{http://arxiv.org/schemas/atom}journal_ref")
                comment = entry.find("{http://arxiv.org/schemas/atom}comment")

                # Adicionar aos resultados
                results.append({
                    "Title": title,
                    "Summary": summary,
                    "Authors": ", ".join(authors),
                    "Link": link,
                    "Published": published,
                    "Updated": updated,
                    "Primary_Category": primary_category.attrib["term"] if primary_category is not None else None,
                    "Journal_Reference": journal_ref.text.strip() if journal_ref is not None else None,
                    "Comment": comment.text.strip() if comment is not None else None,
                })
        else:
            print(f"Erro na API: {response.status_code}")
            break

    # Converter para DataFrame
    return pd.DataFrame(results)


def extract_keywords(df, top_n=10):
    """
    Gera três DataFrames separados para palavras individuais, bigramas e trigramas mais frequentes.
    
    Args:
        df (DataFrame): DataFrame com os artigos.
        top_n (int): Número de palavras-chave mais frequentes a retornar para cada n-grama.
    
    Returns:
        tuple: DataFrames para unigramas, bigramas e trigramas.
    """
    # Concatenar títulos e resumos em um único texto
    combined_text = " ".join(df["Title"]) + " " + " ".join(df["Summary"])
    
    # Remover stopwords
    stop_words = list(stopwords.words("english"))

    # Criar vetorizadores para unigramas, bigramas e trigramas
    vectorizer_uni = CountVectorizer(ngram_range=(1, 1), stop_words=stop_words)
    vectorizer_bi = CountVectorizer(ngram_range=(2, 2), stop_words=stop_words)
    vectorizer_tri = CountVectorizer(ngram_range=(3, 3), stop_words=stop_words)
    
    # Criar vetores de palavras
    word_counts_uni = vectorizer_uni.fit_transform([combined_text])
    word_counts_bi = vectorizer_bi.fit_transform([combined_text])
    word_counts_tri = vectorizer_tri.fit_transform([combined_text])
    
    # Criar DataFrames para unigramas
    word_list_uni = vectorizer_uni.get_feature_names_out()
    count_list_uni = word_counts_uni.toarray().flatten()
    unigrams_df = pd.DataFrame({"Keyword": word_list_uni, "Count": count_list_uni})
    unigrams_df = unigrams_df.sort_values(by="Count", ascending=False).head(top_n)
    
    # Criar DataFrames para bigramas
    word_list_bi = vectorizer_bi.get_feature_names_out()
    count_list_bi = word_counts_bi.toarray().flatten()
    bigrams_df = pd.DataFrame({"Keyword": word_list_bi, "Count": count_list_bi})
    bigrams_df = bigrams_df.sort_values(by="Count", ascending=False).head(top_n)
    
    # Criar DataFrames para trigramas
    word_list_tri = vectorizer_tri.get_feature_names_out()
    count_list_tri = word_counts_tri.toarray().flatten()
    trigrams_df = pd.DataFrame({"Keyword": word_list_tri, "Count": count_list_tri})
    trigrams_df = trigrams_df.sort_values(by="Count", ascending=False).head(top_n)
    
    return unigrams_df, bigrams_df, trigrams_df


def filter_articles_by_keywords(df, keywords):
    """
    Filtra artigos que contêm palavras-chave relevantes no título ou resumo. 

    Args:
        df (DataFrame): DataFrame com os artigos.
        keywords (list): Lista de palavras-chave relevantes (tem de estar tudo em lower-case).

    Returns:
        DataFrame: Apenas os artigos relevantes.
    """
    def is_relevant(row):
        text = (row["Title"] + " " + row["Summary"]).lower()
        return any(keyword in text for keyword in keywords)
    
    df["Is_Relevant"] = df.apply(is_relevant, axis=1)
    return df[df["Is_Relevant"]]


def analyze_publication_trends(df):
    """
    Analisa a frequência de publicações ao longo do tempo.

    Args:
        df (DataFrame): DataFrame com os artigos.

    Returns:
        DataFrame: Frequência de publicações por ano.
    """
    df["Published_Year"] = pd.to_datetime(df["Published"]).dt.year
    trends = df.groupby("Published_Year").size().reset_index(name="Publication_Count")
    return trends


def top_authors(df, top_n=10):
    """
    Identifica os autores com mais publicações.

    Args:
        df (DataFrame): DataFrame com os artigos.
        top_n (int): Número de autores a listar.

    Returns:
        DataFrame: Autores mais frequentes e o número de publicações.
    """
    authors_series = df["Authors"].str.split(", ").explode()
    author_counts = authors_series.value_counts().head(top_n).reset_index()
    author_counts.columns = ["Author", "Publication_Count"]
    return author_counts


def cluster_articles(df, n_clusters=5):
    """
    Agrupa os artigos em clusters temáticos.

    Args:
        df (DataFrame): DataFrame com os artigos.
        n_clusters (int): Número de clusters.

    Returns:
        DataFrame: DataFrame com uma nova coluna "Cluster".
    """
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(df["Summary"])
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df["Cluster"] = kmeans.fit_predict(tfidf_matrix)
    return df


def find_similar_articles_with_url(df, article_url, top_n=5):
    """
    Identifica os artigos mais similares a um artigo específico com base na URL e mostra as URLs nos resultados.

    Args:
        df (DataFrame): DataFrame com os artigos.
        article_url (str): URL do artigo para o qual encontrar similares.
        top_n (int): Número de artigos similares a retornar.

    Returns:
        DataFrame: DataFrame com os artigos similares, suas pontuações de similaridade e URLs.
    """
    # Verificar se a URL existe no DataFrame
    if article_url not in df["Link"].values:
        raise ValueError("A URL fornecida não está no conjunto de dados.")
    
    # Obter o índice do artigo de interesse
    article_index = df[df["Link"] == article_url].index[0]

    # Vetorização TF-IDF
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(df["Summary"])
    
    # Similaridade do artigo selecionado com todos os outros
    similarity_scores = cosine_similarity(tfidf_matrix[article_index], tfidf_matrix).flatten()
    
    # Ordenar pelos artigos mais similares (excluindo o próprio)
    similar_indices = similarity_scores.argsort()[-top_n-1:-1][::-1]
    similar_scores = similarity_scores[similar_indices]
    
    # Criar DataFrame com resultados
    similar_articles = df.iloc[similar_indices].copy()
    similar_articles["Similarity_Score"] = similar_scores
    return similar_articles[["Title", "Similarity_Score", "Link"]]


def generate_summary_ollama(df, n=3):
    """
    Generates a concise summary from the first N summaries using Ollama (Llama3).

    Args:
        df (DataFrame): DataFrame containing articles (from get_arxiv_data).
        n (int): Number of summaries to include in the generated summary.

    Returns:
        str: Generated summary.
    """
    # Get the first N summaries
    summaries = df["Summary"].head(n).tolist()

    # Construct the summaries in a formatted list
    formatted_summaries = "\n\n".join([f"{i+1}. {summary}" for i, summary in enumerate(summaries)])

    # Create the prompt separately
    prompt = (
        f"Below are {n} scientific article summaries extracted from arXiv:\n\n"
        f"{formatted_summaries}\n\n"
        "Please generate a concise and coherent summary that combines the key ideas from these summaries."
    )

    # Initialize the Ollama LLM model
    model = OllamaLLM(model="llama3")

    # Invoke the model with the prompt
    result = model.invoke(prompt)
    
    return result

In [5]:
query = "text mining"  # Palavra-chave para pesquisas
max_results = 250  # Quantidade máxima de artigos

articles = get_arxiv_data(query, max_results, sort_by='relevance')
unigrams, bigrams, trigrams = extract_keywords(articles, top_n=50)

In [6]:
articles

Unnamed: 0,Title,Summary,Authors,Link,Published,Updated,Primary_Category,Journal_Reference,Comment
0,"Data, text and web mining for business intelli...",The Information and Communication Technologies...,Abdul-Aziz Rashid Al-Azmi,http://arxiv.org/abs/1304.3563v1,2013-04-12T08:04:31Z,2013-04-12T08:04:31Z,cs.IR,International Journal of Data Mining & Knowled...,"21 page, journal paper"
1,Text Data Mining from the Author's Perspective...,"Given the many technical, social, and policy s...",Christine L. Borgman,http://arxiv.org/abs/1803.04552v1,2018-03-12T22:11:40Z,2018-03-12T22:11:40Z,cs.DL,,Forum Statement: Data Mining with Limited Acce...
2,Very Large Language Model as a Unified Methodo...,Text data mining is the process of deriving es...,Meng Jiang,http://arxiv.org/abs/2212.09271v2,2022-12-19T06:52:13Z,2022-12-20T17:03:30Z,cs.DB,,"4 pages, 3 figures"
3,Pbm: A new dataset for blog mining,Text mining is becoming vital as Web 2.0 offer...,"Mehwish Aziz, Muhammad Rafi",http://arxiv.org/abs/1201.2073v1,2012-01-10T15:18:38Z,2012-01-10T15:18:38Z,cs.AI,,6; Internet and Web Engineering from: Internat...
4,Typesafe Modeling in Text Mining,Based on the concept of annotation-based agent...,Fabian Steeg,http://arxiv.org/abs/1108.0363v1,2011-07-28T17:46:20Z,2011-07-28T17:46:20Z,cs.PL,,"63 pages, in German"
...,...,...,...,...,...,...,...,...,...
245,Text data mining and data quality management f...,In the implementation and use of research info...,"Otmane Azeroual, Gunter Saake, Mohammad Abuosb...",http://arxiv.org/abs/1812.04298v1,2018-12-11T09:39:55Z,2018-12-11T09:39:55Z,cs.DL,ICOA 2018 3e colloque international sur le lib...,
246,A Semi-Supervised Deep Clustering Pipeline for...,Mining the latent intentions from large volume...,"Xinyu Chen, Ian Beaver",http://arxiv.org/abs/2202.00802v1,2022-02-01T23:01:05Z,2022-02-01T23:01:05Z,cs.CL,,Submitted to The Thirty-Fourth Annual Conferen...
247,A Case Study in Text Mining: Interpreting Twit...,Cluster analysis is a field of data analysis t...,"Daniel Godfrey, Caley Johns, Carl Meyer, Shain...",http://arxiv.org/abs/1408.5427v1,2014-08-21T17:58:33Z,2014-08-21T17:58:33Z,stat.ML,,
248,Analyses of Multi-collection Corpora via Compo...,As electronically stored data grow in daily li...,"Clint P. George, Wei Xia, George Michailidis",http://arxiv.org/abs/1907.01636v1,2019-06-17T06:59:25Z,2019-06-17T06:59:25Z,cs.IR,,


In [8]:
final_summary = generate_summary_ollama(articles, n=5)

In [9]:
final_summary

"Here's a concise and coherent summary that combines the key ideas from the five article summaries:\n\nThe rapid growth of digital data has driven the development of text data mining, a process of deriving essential information from language text. Various techniques, including data mining, text mining, and web mining, are used to uncover hidden knowledge in large databases or the Internet. Text mining involves tasks such as text categorization, clustering, topic modeling, information extraction, and summarization.\n\nAs the Web 2.0 era has brought collaborative content creation and sharing, researchers have increasingly focused on text mining methods for discovering knowledge. A typical text mining application involves preprocessing, stemming and lemmatization, tagging and annotation, deriving knowledge patterns, evaluating, and interpreting results. Standard datasets are crucial for evaluating these tasks, but there is a growing need to standardize the evaluation of text mining tasks.

In [25]:
trigrams

Unnamed: 0,Keyword,Count
15337,natural language processing,39
2937,biomedical text mining,20
23590,text mining techniques,20
23533,text mining methods,16
23587,text mining tasks,14
23294,text data mining,12
17605,pre trained language,12
2171,association rule mining,11
10308,https github com,9
23990,time series data,9


In [27]:
article_url = "http://arxiv.org/abs/1201.2073v1" 
similar_articles_with_urls = find_similar_articles_with_url(articles, article_url, top_n=10)
similar_articles_with_urls

Unnamed: 0,Title,Similarity_Score,Link
2,Very Large Language Model as a Unified Methodo...,0.218881,http://arxiv.org/abs/2212.09271v2
38,Multi-Task Learning Improves Performance In De...,0.169973,http://arxiv.org/abs/2307.01401v1
174,Semantic Web Requirements through Web Mining T...,0.160314,http://arxiv.org/abs/1208.0690v1
70,Probabilistic Semantic Web Mining Using Artifi...,0.155123,http://arxiv.org/abs/1004.1794v1
0,"Data, text and web mining for business intelli...",0.152596,http://arxiv.org/abs/1304.3563v1
42,Overview of Web Content Mining Tools,0.14912,http://arxiv.org/abs/1307.1024v1
46,Advancing Chinese biomedical text mining with ...,0.14619,http://arxiv.org/abs/2403.04261v2
146,Opinion Mining In Hindi Language: A Survey,0.135362,http://arxiv.org/abs/1404.4935v1
14,Sentiment Analysis: A Survey,0.134661,http://arxiv.org/abs/1405.2584v1
92,Meta-learning of textual representations,0.132407,http://arxiv.org/abs/1906.08934v2
