In [1]:
categories = ['sci.med', 'talk.politics.guns', 'rec.sport.baseball']

In [2]:
from sklearn.datasets import fetch_20newsgroups

newsgroups = fetch_20newsgroups(subset='all', categories=categories)

In [7]:
import torch
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def preprocess_text(text):
    tokens = word_tokenize(text)
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    tokens = [re.sub(r"[^\w\s]", "", token) for token in tokens]
    tokens = [token.lower() for token in tokens]
    
    preprocessed_text = " ".join(tokens)
    
    return preprocessed_text

preprocessed_text = [preprocess_text(text) for text in newsgroups.data]

Предобработанный текст:


In [14]:
tokenized_texts = [tokenizer(text, padding=True, truncation=True, return_tensors="pt") for text in preprocessed_text]

In [17]:
with torch.no_grad():
    outputs = [model(**i).last_hidden_state[:, 0, :] for i in tokenized_texts]

In [27]:
outputs = [output.squeeze() for output in outputs]

In [28]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(outputs, newsgroups.target, test_size=0.2, random_state=42)

In [30]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [32]:
clf_rand = RandomForestClassifier()
clf_grad = GradientBoostingClassifier()

clf_rand.fit(X_train, y_train)
y_pred = clf_rand.predict(X_test)
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted')}")


clf_grad.fit(X_train, y_train)
y_pred = clf_grad.predict(X_test)
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted')}")

F1 Score: 0.9325317493364089
F1 Score: 0.9360931371907868


In [34]:
from transformers import RobertaTokenizer, RobertaModel


model_name = 'roberta-base'
tokenizerRob = RobertaTokenizer.from_pretrained(model_name)
modelRob = RobertaModel.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
tokenized_texts_R = [tokenizerRob(text, padding=True, truncation=True, return_tensors="pt") for text in preprocessed_text]

In [36]:
with torch.no_grad():
    outputsR = [modelRob(**i).last_hidden_state[:, 0, :] for i in tokenized_texts_R]

In [37]:
outputsR = [output.squeeze() for output in outputsR]

In [38]:
X_trainR, X_testR, y_trainR, y_testR = train_test_split(outputsR, newsgroups.target, test_size=0.2, random_state=42)

In [40]:
clf_rand_Rob = RandomForestClassifier()
clf_grad_Rob = GradientBoostingClassifier()

clf_rand_Rob.fit(X_trainR, y_trainR)
y_pred = clf_rand_Rob.predict(X_testR)
print(f"F1 Score: {f1_score(y_testR, y_pred, average='weighted')}")


clf_grad_Rob.fit(X_trainR, y_trainR)
y_pred = clf_grad_Rob.predict(X_testR)
print(f"F1 Score: {f1_score(y_testR, y_pred, average='weighted')}")

F1 Score: 0.9394880819378268
F1 Score: 0.9394855372330663
