### 1. Загрузка данных

In [22]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [23]:
df = pd.read_csv(r"D:\НЕТОЛОГИЯ\NLP\файлы\Constraint_Train.csv")
print(df.shape)
df.head()

(6420, 3)


Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


### 2. Предобработка текста

In [25]:
# Приведение к нижнему регистру. Оставлем только буквы
import re

en_words = re.compile('[a-zA-z]+')
df['tweet'] = df['tweet'].apply(lambda x: ' '.join(en_words.findall(x.lower())))

In [26]:
#Удаления стоп-слов, токенизация.
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from collections import Counter

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def pre_process(array):
    new_array = []
    for text in tqdm(array):
        text_update = ' '.join([word for word in word_tokenize(text) if word not in stopwords_dict])
        new_array.append(text_update)
    return new_array

df['tweet'] = pre_process(df['tweet'])

  0%|          | 0/6420 [00:00<?, ?it/s]

## 3. Векторизация текста

In [28]:
# Обучающая и тестовая выбоки
from sklearn.model_selection import train_test_split

y = df['label']
x = df['tweet']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [29]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(ngram_range = (1, 1))
x_train_count = vec.fit_transform(x_train)
x_test_count = vec.transform(x_test)

## 4. Библиотека sklearn

*4.1. Логистическая регрессия*

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

lr_model = LogisticRegression(random_state = 42)
lr_model.fit(x_train_count, y_train)
lr_pred = lr_model.predict(x_test_count)
f1_lr = f1_score(y_test, lr_pred, average ='binary', pos_label = 'fake')
print(f'Логистическая регрессия. F1-score: {f1_lr:.4f}')

Логистическая регрессия. F1-score: 0.9123


*4.2. Случайный лес*

In [34]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state = 42)
rf_model.fit(x_train_count, y_train)
rf_pred = rf_model.predict(x_test_count)
f1_rf = f1_score(y_test, rf_pred, average ='binary', pos_label = 'fake')
print(f'Случайный лес. F1-score: {f1_rf:.4f}')

Случайный лес. F1-score: 0.9130


*4.3. Бэггинг*

In [36]:
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier(LogisticRegression(random_state = 42), 
                            n_estimators = 20,
                           max_samples = 0.8,
                           max_features = 0.7,
                           random_state = 42)

bagging.fit(x_train_count, y_train)
bag_pred = bagging.predict(x_test_count)
f1_bagging = f1_score(y_test, bag_pred, average ='binary', pos_label = 'fake')
print(f'Бэггинг. F1-score: {f1_bagging:.4f}')

Бэггинг. F1-score: 0.9106


*4.4. Градиентный бустинг*

In [38]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators = 800, max_depth = 8)
gb_model.fit(x_train_count, y_train)
gb_pred = gb_model.predict(x_test_count)
f1_rf = f1_score(y_test, gb_pred, average ='binary', pos_label = 'fake')
print(f'Градиентный бустинг. F1-score: {f1_rf:.4f}')

Градиентный бустинг. F1-score: 0.9159


## 5. PyTorch

*5.1. Подготовка данных*

In [41]:
# Кодирование текстовых значений классов в числовые
labels = (df['label'] == 'real').astype(int).to_list()

# Токенизация текстов
token_lists = [word_tokenize(text) for text in df['tweet']]
sentences = [word_tokenize(text.lower()) for text in tqdm(df.tweet)]

  0%|          | 0/6420 [00:00<?, ?it/s]

In [42]:
# Оценка максимального размера текстов (по количеству слов)
max_len = len(max(token_lists, key=len))
print('Макс. размер:', max_len)

Макс. размер: 871


In [43]:
# Распредление текстов по длине
df['len_words'] = [len(tokens) for tokens in token_lists]
print('Медиана кол-ва слов:', df['len_words'].median())

len_texts_distr = df['len_words'].value_counts(normalize = True).reset_index()
len_texts_distr = len_texts_distr.sort_values('index').rename(columns = {'index': 'длина текста', 'len_words': 'частота в %'})
len_texts_distr['частота в %'] = round(len_texts_distr['частота в %']*100, 2)

print()
print('Распределение текство по длине')
len_texts_distr

Медиана кол-ва слов: 17.0

Распределение текство по длине


Unnamed: 0,длина текста,частота в %
52,1,0.02
40,2,0.05
34,3,0.31
29,4,1.09
26,5,1.73
22,6,2.63
19,7,2.99
10,8,3.41
5,9,4.02
8,10,3.85


In [44]:
# Получение эмбеддингов слов через Word2vec
from gensim.models.word2vec import Word2Vec

model_tweets = Word2Vec(token_lists, workers = 4, vector_size = 300, min_count = 3, window = 5)

def get_word_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    return result

# Судя по распределению оптимальным розмером вектора слова будет 50
features_w = [get_word_embedding(text, 50) for text in tqdm(token_lists)]

  0%|          | 0/6420 [00:00<?, ?it/s]

In [48]:
# Получение эмбеддингов текстов через Word2vec
def get_text_embedding(text):
    result = []
    for word in word_tokenize(text):
        if word in model_tweets.wv:
            result.append(model_tweets.wv[word])

    if len(result):
        result = np.average(result, axis=0)
    else:
        result = np.zeros(300)
    return result

features_t = [get_text_embedding(text) for text in tqdm(df['tweet'])]

  0%|          | 0/6420 [00:00<?, ?it/s]

*5.2. Создание нейросети. Однослойная LSTM*

In [50]:
class lstm1(nn.Module):

    def __init__(self):
        super(lstm1, self).__init__()
        self.lstm = nn.LSTM(300, 100)
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction
lstm1 = lstm1()
print(lstm1)

lstm1(
  (lstm): LSTM(300, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [52]:
# Разбиение данных, перевод в тензоры
x_train1, x_test1, y_train1, y_test1 = train_test_split(features_w, labels, test_size = 0.2)

in_data1 = torch.tensor(x_train1).float()
targets1 = torch.tensor(y_train1).float()

In [53]:
# Задаем оптимизатор и фунцию потеть
optimizer1 = optim.SGD(lstm1.parameters(), lr = 0.01)
criterion = nn.BCELoss()

In [54]:
# Функиция для обучении одной эпохи НС
def train_one_epoch(net, optimizer, criterion, in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad() 
        output = net(batch_x) 
        loss = criterion(output.reshape(-1), batch_y) 
        loss.backward() 
        optimizer.step() 

In [58]:
# Функция для подсчета точности и f1-score
from torcheval.metrics import BinaryAccuracy
from torcheval.metrics.functional import binary_f1_score

def net_acc_f1(net, x_test, y_test):
    in_data_test = torch.tensor(x_test).float()
    targets_test = torch.tensor(y_test).float()
    
    with torch.no_grad():
        output = net(in_data_test).reshape(-1)
        
    metric = BinaryAccuracy()
    metric.update(output, targets_test)
    acc = metric.compute()
    f1_score = binary_f1_score(output, targets_test)
    print(f'Accuracy:, {acc:.4f}, F1-score: {f1_score:.4f}')

In [60]:
# Обучим 1 эпоху
train_one_epoch(lstm1, optimizer1, criterion, in_data1, targets1, batch_size=16)

net_acc_f1(lstm1, x_test1, y_test1)

  0%|          | 0/321 [00:00<?, ?it/s]

Accuracy:, 0.5288, F1-score: 0.6912


In [62]:
# Обучим 30 эпох
for i in range(30):
    train_one_epoch(lstm1, optimizer1, criterion, in_data1, targets1, batch_size=16)

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

In [63]:
net_acc_f1(lstm1, x_test1, y_test1)

Accuracy:, 0.5304, F1-score: 0.6919


*5.3. Создание нейросети. Многослойный перцептрон*

In [87]:
class mlp(nn.Module):

    def __init__(self):
        super(mlp, self).__init__()
        self.out = nn.Linear(300, 1)
        self.out = nn.Linear(300, 1)
        self.out = nn.Linear(300, 1)
        self.out = nn.Linear(300, 1)
        self.out = nn.Linear(300, 1)

    def forward(self, x):
        return torch.sigmoid(self.out(x))

mlp = mlp()
print(mlp)

mlp(
  (out): Linear(in_features=300, out_features=1, bias=True)
)


In [69]:
# Разбиение данных, перевод в тензоры
x_train2, x_test2, y_train2, y_test2 = train_test_split(features_t, labels, test_size = 0.2)

in_data2 = torch.tensor(x_train2).float()
targets2 = torch.tensor(y_train2).float()

In [95]:
# Задаем оптимизатор и фунцию потеть
optimizer2 = optim.Adam(mlp.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

In [97]:
# Обучим 30 эпох
for i in range(30):
    train_one_epoch(mlp, optimizer2, criterion, in_data2, targets2, batch_size=16)

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

In [98]:
net_acc_f1(mlp, x_test2, y_test2)

Accuracy:, 0.7368, F1-score: 0.7081
