## Libraries

In [62]:
import requests
from pathlib import Path
from functools import lru_cache
import logging

import numpy as np
import pandas as pd
import spacy
import spacy.lang.pt
from spacy.tokens import Doc

from gensim.models import Word2Vec, KeyedVectors

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from joblib import dump, load

## Helper Functions

In [39]:
@lru_cache(maxsize=None)
def download_csv(url, data_dir='./../data') -> pd.DataFrame:
    """Return the CSV at ``url`` as a DataFrame, caching the file on disk.

    The file is stored in ``data_dir`` under the last path segment of ``url``.
    If that file already exists it is read directly and no request is made.
    Otherwise the body is streamed to a temporary ``.part`` file that is
    renamed to its final name only after the download finished, so an
    interrupted transfer never leaves a truncated file that would later be
    mistaken for a valid cache entry.

    Because of ``lru_cache``, repeated calls with the same arguments return
    the very same DataFrame object; copy it before mutating.

    Args:
        url: Address of the CSV file.
        data_dir: Directory used as the on-disk cache (created if missing).

    Returns:
        The parsed CSV.

    Raises:
        ValueError: If no file name can be derived from ``url``.
        requests.exceptions.RequestException: On network or HTTP errors
            (reported on stdout, then re-raised).
    """
    # Create cache directory if not exists
    cache_dir = Path(data_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Generate the local filename from the last URL path segment
    name = url.split('/')[-1]
    if not name:
        # A trailing slash would otherwise resolve to the cache directory itself.
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")
    filename = cache_dir / name

    # Check if file already exists locally
    if filename.exists():
        return pd.read_csv(filename)

    # Download next to the final location so the rename stays on one filesystem.
    partial = filename.with_name(filename.name + '.part')

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 Academic Data Retrieval',
            'Accept': 'text/csv',
        }
        # The context manager releases the streamed connection on every exit path.
        with requests.get(
            url,
            headers=headers,
            timeout=300,  # 300 seconds timeout
            stream=True  # Memory efficient for large files
        ) as response:
            # Raise an exception for bad status codes
            response.raise_for_status()

            with open(partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Publish the finished download under its cache name.
        partial.replace(filename)

    except requests.exceptions.RequestException as e:
        print(f"Network error: {e}.")
        raise

    finally:
        # Remove leftovers of a failed download; no-op after a successful rename.
        partial.unlink(missing_ok=True)

    return pd.read_csv(filename)

In [40]:
def get_text_from_valid_tokens(doc: Doc) -> str | None:
    """Rebuild a document's text from its alphabetic, non-stop-word tokens.

    Args:
        doc: Iterable of tokens exposing ``is_alpha``, ``is_stop`` and ``text``.

    Returns:
        The kept token texts joined by single spaces, or ``None`` when two or
        fewer tokens survive the filter.
    """
    kept = [tok.text for tok in doc if tok.is_alpha and not tok.is_stop]
    return " ".join(kept) if len(kept) > 2 else None

In [41]:
def tokenizer(text: str, nlp: spacy.lang.pt.Portuguese) -> list[str]:
    """Split ``text`` into lower-cased, alphabetic, non-stop-word tokens.

    Args:
        text: Raw text to process.
        nlp: Callable pipeline that turns ``text`` into an iterable of tokens
            exposing ``is_alpha``, ``is_stop`` and ``text``.

    Returns:
        The lower-cased text of every kept token, in document order.
    """
    return [
        tok.text.lower()
        for tok in nlp(text)
        if tok.is_alpha and not tok.is_stop
    ]

In [42]:
def sum_tokens_vector(tokens: list[str], model: KeyedVectors) -> np.ndarray:
    """Sum the embedding vectors of all tokens known to ``model``.

    The vector width is taken from ``model.vector_size`` (falling back to 300
    when the attribute is absent), so the function works with embeddings of
    any dimensionality instead of only 300-dimensional ones.

    Args:
        tokens: Tokens to look up.
        model: Embedding lookup exposing ``get_vector(token)``, which raises
            ``KeyError`` for unknown tokens.

    Returns:
        Array of shape ``(1, vector_size)`` holding the element-wise sum of
        the vectors found; all zeros when no token is in the vocabulary.
    """
    dim = getattr(model, "vector_size", 300)
    vector = np.zeros((1, dim))

    for token in tokens:
        try:
            vector += model.get_vector(token)
        except KeyError:
            continue  # If the token doesn't exist on the vocab, jump to the next token

    return vector

In [43]:
def get_vectors(texts: pd.Series, model: KeyedVectors, nlp: spacy.lang.pt.Portuguese) -> np.ndarray:
    """Build one summed-embedding feature row per text.

    Each text is tokenized with ``tokenizer`` and reduced to a single vector
    with ``sum_tokens_vector``. The row width follows ``model.vector_size``
    (falling back to 300 when the attribute is absent) rather than being
    fixed at 300.

    Args:
        texts: Texts to vectorize; must support ``len`` and iteration.
        model: Embedding lookup passed through to ``sum_tokens_vector``.
        nlp: Pipeline passed through to ``tokenizer``.

    Returns:
        Array of shape ``(len(texts), vector_size)``; row ``i`` belongs to the
        ``i``-th text in iteration order.
    """
    dim = getattr(model, "vector_size", 300)
    vectors = np.zeros((len(texts), dim))

    for i, text in enumerate(texts):
        tokens = tokenizer(text, nlp)
        # The (1, dim) result broadcasts into the i-th row.
        vectors[i] = sum_tokens_vector(tokens, model)

    return vectors

In [52]:
def lr_classifier(x_train: pd.Series | np.ndarray,
                  y_train: pd.Series | np.ndarray,
                  x_test: pd.Series | np.ndarray,
                  y_test: pd.Series | np.ndarray) -> LogisticRegression:
    """Fit a logistic regression and print its report on the test split.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features of the evaluation split.
        y_test: True labels of the evaluation split.

    Returns:
        The fitted classifier (trained with ``max_iter=800``).
    """
    classifier = LogisticRegression(max_iter=800).fit(x_train, y_train)
    print(classification_report(y_test, classifier.predict(x_test)))
    return classifier

## Loading data

In [44]:
# Fetch the train and test splits; download_csv reads the copy already present
# in its data_dir when there is one, and only hits the network otherwise.
df_train = download_csv("https://cdn3.gnarususercontent.com.br/1638-word-embedding/treino.csv")
df_test = download_csv("https://cdn3.gnarususercontent.com.br/1638-word-embedding/teste.csv")

In [5]:
# Peek at five random rows (no random_state, so the rows differ on every run).
df_train.sample(5)

Unnamed: 0,title,text,date,category,subcategory,link
29095,Animação 'Vida de Abobrinha' é afetuosa sem se...,"Icare –ou Abobrinha, como prefere ser chamado–...",2017-02-19,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2017/02...
22458,"Em cinco minutos, croata vence jogo interrompi...",O croata Borna Coric precisou de cinco minutos...,2015-09-19,esporte,,http://www1.folha.uol.com.br/esporte/2015/09/1...
45655,Polícia filipina usa hospitais para ocultar mo...,Os residentes do bairro Old Balara se esconder...,2017-06-30,mundo,,http://www1.folha.uol.com.br/mundo/2017/06/189...
89662,"Em raro pronunciamento, imperador japonês indi...","O imperador do Japão, Akihito, expressou preoc...",2016-08-08,mundo,,http://www1.folha.uol.com.br/mundo/2016/08/180...
13516,Casa Folha trará conversas gratuitas com autor...,"Durante a Flip, a Casa Folha manterá uma progr...",2017-07-26,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2017/07...


## Processing Data

In [6]:
# Only the headline is used as training text; it is lower-cased up front.
train_texts = df_train["title"].str.lower()

In [14]:
# Portuguese pipeline used to flag alphabetic tokens and stop words.
nlp = spacy.load("pt_core_news_sm")

# Stream the titles through spaCy in batches of 1000 and keep, per title, the
# filtered text (or None when too few tokens remain).
cleaned_texts = [
    get_text_from_valid_tokens(doc)
    for doc in nlp.pipe(train_texts, batch_size=1000, n_process=-1)
]

In [15]:
# One row per original title; titles rejected by get_text_from_valid_tokens
# are None here and show up as missing values.
df_train_cleaned = pd.DataFrame({"Title": cleaned_texts})
df_train_cleaned.sample(5)

Unnamed: 0,Title
7630,escritor joão gilberto noll morre anos porto a...
5295,exportação games brasileiros aumentar ano
83436,robinho ressalta atuação ceni fala ansiedade h...
80114,terry crews defende comédia feminismo brooklyn
81537,


In [16]:
# The non-null count shows how many titles survived the token filter.
df_train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   84680 non-null  object
dtypes: object(1)
memory usage: 703.3+ KB


In [17]:
# Discard the titles that were filtered out (missing values) and exact repeats,
# then confirm the remaining row count.
df_train_cleaned = df_train_cleaned.dropna().drop_duplicates()
df_train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84466 entries, 0 to 89999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   84466 non-null  object
dtypes: object(1)
memory usage: 1.3+ MB


In [22]:
# The cleaned titles were joined with single spaces, so splitting on " "
# recovers one list of tokens per title — the corpus fed to Word2Vec below.
tokens = df_train_cleaned["Title"].str.split(" ")

## Building the Model

### CBOW Model

In [10]:
# CBOW variant (sg=0): 300-dimensional vectors, context window of 2, words seen
# fewer than 5 times are ignored; the learning rate goes from alpha to min_alpha.
model = Word2Vec(sg=0, window=2, vector_size=300, min_count=5, alpha=0.03, min_alpha=0.007)

In [19]:
# Surface gensim's INFO-level progress messages (vocabulary scan, epochs) in the cell output.
logging.basicConfig(format="%(asctime)s : - %(message)s", level=logging.INFO)

In [None]:
# Scan the tokenized titles to build the vocabulary, logging progress every 5000 sentences.
model.build_vocab(tokens, progress_per=5000)

2024-12-02 15:12:18,964 : - collecting all words and their counts
2024-12-02 15:12:18,964 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-12-02 15:12:18,969 : - PROGRESS: at sentence #5000, processed 31930 words, keeping 10193 word types
2024-12-02 15:12:18,973 : - PROGRESS: at sentence #10000, processed 63848 words, keeping 14989 word types
2024-12-02 15:12:18,976 : - PROGRESS: at sentence #15000, processed 95753 words, keeping 18279 word types
2024-12-02 15:12:18,982 : - PROGRESS: at sentence #20000, processed 127689 words, keeping 21033 word types
2024-12-02 15:12:18,988 : - PROGRESS: at sentence #25000, processed 159589 words, keeping 23491 word types
2024-12-02 15:12:18,992 : - PROGRESS: at sentence #30000, processed 191554 words, keeping 25494 word types
2024-12-02 15:12:18,997 : - PROGRESS: at sentence #35000, processed 223412 words, keeping 27330 word types
2024-12-02 15:12:19,001 : - PROGRESS: at sentence #40000, processed 255282 words, keeping 29053

In [23]:
# Train for 30 epochs; total_examples reuses the sentence count recorded on the model.
model.train(tokens, total_examples=model.corpus_count, epochs=30)

2024-12-02 15:32:09,352 : - Word2Vec lifecycle event {'msg': 'training model with 3 workers on 12924 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2 shrink_windows=True', 'datetime': '2024-12-02T15:32:09.352773', 'gensim': '4.3.3', 'python': '3.12.7 (main, Oct  1 2024, 02:05:46) [Clang 15.0.0 (clang-1500.3.9.4)]', 'platform': 'macOS-14.6.1-arm64-arm-64bit', 'event': 'train'}
2024-12-02 15:32:09,789 : - EPOCH 0: training on 540242 raw words (485990 effective words) took 0.4s, 1184025 effective words/s
2024-12-02 15:32:10,192 : - EPOCH 1: training on 540242 raw words (486141 effective words) took 0.4s, 1234346 effective words/s
2024-12-02 15:32:10,576 : - EPOCH 2: training on 540242 raw words (486209 effective words) took 0.4s, 1282820 effective words/s
2024-12-02 15:32:10,966 : - EPOCH 3: training on 540242 raw words (486093 effective words) took 0.4s, 1262589 effective words/s
2024-12-02 15:32:11,337 : - EPOCH 4: training on 540242 raw words (485969 effect

(14583855, 16207260)

### Skip Gram Model

In [25]:
# Skip-gram variant (sg=1): same settings as the CBOW model except for the
# wider context window of 5.
model_sg = Word2Vec(sg=1,
                    window=5,
                    vector_size=300,
                    min_count=5,
                    alpha=0.03,
                    min_alpha=0.007)

# Build the vocabulary from the same tokenized titles.
model_sg.build_vocab(tokens, progress_per=5000)

# Train for 30 epochs, as done for the CBOW model.
model_sg.train(tokens,
               total_examples=model_sg.corpus_count,
               epochs=30)

2024-12-02 16:00:20,778 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2024-12-02T16:00:20.778046', 'gensim': '4.3.3', 'python': '3.12.7 (main, Oct  1 2024, 02:05:46) [Clang 15.0.0 (clang-1500.3.9.4)]', 'platform': 'macOS-14.6.1-arm64-arm-64bit', 'event': 'created'}
2024-12-02 16:00:20,779 : - collecting all words and their counts
2024-12-02 16:00:20,781 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-12-02 16:00:20,793 : - PROGRESS: at sentence #5000, processed 31930 words, keeping 10193 word types
2024-12-02 16:00:20,839 : - PROGRESS: at sentence #10000, processed 63848 words, keeping 14989 word types
2024-12-02 16:00:20,851 : - PROGRESS: at sentence #15000, processed 95753 words, keeping 18279 word types
2024-12-02 16:00:20,857 : - PROGRESS: at sentence #20000, processed 127689 words, keeping 21033 word types
2024-12-02 16:00:20,862 : - PROGRESS: at sentence #25000, processed 159589 words, keeping 234

(14584300, 16207260)

## Saving the embedding models

In [56]:
# Create the output directory first so the export also works on a fresh
# checkout where ./../models does not exist yet.
Path("./../models").mkdir(parents=True, exist_ok=True)

# Export only the word vectors, in the plain-text word2vec format.
model.wv.save_word2vec_format("./../models/model_cbow.txt", binary=False)
model_sg.wv.save_word2vec_format("./../models/model_skipgram.txt", binary=False)

2024-12-03 11:19:37,679 : - storing 12924x300 projection weights into ./../models/model_cbow.txt
2024-12-03 11:19:39,009 : - storing 12924x300 projection weights into ./../models/model_skipgram.txt


## Vectorizing the data

In [57]:
# Reload the exported vectors from their text files as lookup-only KeyedVectors.
cbow_model = KeyedVectors.load_word2vec_format("./../models/model_cbow.txt")
skipgram_model = KeyedVectors.load_word2vec_format("./../models/model_skipgram.txt")

2024-12-03 11:19:43,773 : - loading projection weights from ./../models/model_cbow.txt
2024-12-03 11:19:44,809 : - KeyedVectors lifecycle event {'msg': 'loaded (12924, 300) matrix of type float32 from ./../models/model_cbow.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2024-12-03T11:19:44.809474', 'gensim': '4.3.3', 'python': '3.12.7 (main, Oct  1 2024, 02:05:46) [Clang 15.0.0 (clang-1500.3.9.4)]', 'platform': 'macOS-14.6.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}
2024-12-03 11:19:44,811 : - loading projection weights from ./../models/model_skipgram.txt
2024-12-03 11:19:45,843 : - KeyedVectors lifecycle event {'msg': 'loaded (12924, 300) matrix of type float32 from ./../models/model_skipgram.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2024-12-03T11:19:45.843738', 'gensim': '4.3.3', 'python': '3.12.7 (main, Oct  1 2024, 02:05:46) [Clang 15.0.0 (clang-1500.3.9.4)]', 'platform': 'macOS-14.6.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [45]:
# Vectorizing only needs the token-level is_alpha / is_stop flags, so the
# heavier pipeline components are switched off to speed up nlp(text).
nlp = spacy.load("pt_core_news_sm", disable=["parser", "ner", "tagger", "textcat"])

In [47]:
# Turn every raw train/test title into one summed CBOW embedding row.
cbow_vectors_train = get_vectors(df_train["title"], cbow_model, nlp)
cbow_vectors_test = get_vectors(df_test["title"], cbow_model, nlp)

# Sanity check: one row per title, one column per embedding dimension.
print(cbow_vectors_train.shape)
print(cbow_vectors_test.shape)

(90000, 300)
(20513, 300)


In [58]:
# Same featurization as for CBOW, using the skip-gram embeddings.
skipgram_vectors_train = get_vectors(df_train["title"], skipgram_model, nlp)
skipgram_vectors_test = get_vectors(df_test["title"], skipgram_model, nlp)

# Sanity check: one row per title, one column per embedding dimension.
print(skipgram_vectors_train.shape)
print(skipgram_vectors_test.shape)

(90000, 300)
(20513, 300)


## Training a Logistic Regression model

In [53]:
# Predict the article category from the CBOW title vectors; lr_classifier prints the test report.
cbow_lr_model = lr_classifier(cbow_vectors_train, df_train["category"], cbow_vectors_test, df_test["category"])

              precision    recall  f1-score   support

     colunas       0.80      0.71      0.75      6103
   cotidiano       0.64      0.80      0.71      1698
     esporte       0.93      0.87      0.90      4663
   ilustrada       0.13      0.86      0.23       131
     mercado       0.84      0.78      0.81      5867
       mundo       0.74      0.83      0.79      2051

    accuracy                           0.79     20513
   macro avg       0.68      0.81      0.70     20513
weighted avg       0.82      0.79      0.80     20513



In [59]:
# Same classifier setup on the skip-gram title vectors, for comparison with CBOW.
skipgram_lr_model = lr_classifier(skipgram_vectors_train, df_train["category"], skipgram_vectors_test, df_test["category"])

              precision    recall  f1-score   support

     colunas       0.81      0.72      0.76      6103
   cotidiano       0.64      0.81      0.71      1698
     esporte       0.93      0.88      0.90      4663
   ilustrada       0.14      0.87      0.24       131
     mercado       0.84      0.79      0.81      5867
       mundo       0.76      0.84      0.80      2051

    accuracy                           0.79     20513
   macro avg       0.69      0.82      0.70     20513
weighted avg       0.82      0.79      0.80     20513



## Saving the classifiers

In [63]:
# Persist both fitted classifiers with joblib; dump returns the list of written
# file names, which is why the last call's path is echoed below the cell.
dump(cbow_lr_model, "./../models/cbow_lr_model.joblib")
dump(skipgram_lr_model, "./../models/skipgram_lr_model.joblib")

['./../models/skipgram_lr_model.joblib']