In [None]:
from google.colab import files

uploaded = files.upload()

In [None]:
import pandas as pd
df = pd.read_csv('train.csv', index_col='id')

In [None]:
import re

pattern = r'@[^ ]+'
df['tweet'] = df['tweet'].str.replace(pattern, '', regex=True)

def clean_text(text):
    # Удаление пунктуации, цифр, хэштегов и скобок
    text = re.sub(r'[^\w\s]|[\d]|[(].*?[)]|#', '', text)
    # Удаление всех символов, не относящихся к ASCII
    text = re.sub(r'[^x00-x7F]+', ' ', text)

    return text

df['tweet'] = df['tweet'].apply(lambda x: clean_text(x))
df.head(10)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,when a father is d sfunctional and is so self...
2,0,thanks for l ft credit i cant use cause the d...
3,0,bihda our majest
4,0,model i love u take with u all the time in ur
5,0,factsguide societ now motivation
6,0,huge fan fare and big talking before the leav...
7,0,camping tomorrow dann
8,0,the next school ear is the ear for exams cant ...
9,0,we won love the land allin cavs champions clev...
10,0,welcome here im its so gr


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Преобразование текста в числовые векторы
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(df['tweet'])
sequences = tokenizer.texts_to_sequences(df['tweet'])
X = pad_sequences(sequences)

# Разделение на тренировочный и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

In [None]:
import keras
from keras.layers import Embedding, LSTM, Dense, AdditiveAttention, Input
from keras.models import Model
from sklearn.model_selection import train_test_split


# Задаем архитектуру модели
input_layer = Input(shape=(X_train.shape[1],))
embedding_layer = Embedding(input_dim=1000, output_dim=64)(input_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)
attention_output = AdditiveAttention()([lstm_layer, lstm_layer])
output_layer = Dense(1, activation='sigmoid')(attention_output)

model = Model(inputs=input_layer, outputs=output_layer)

# Компилируем модель
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучение модели
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Оценка модели
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.18879234790802002, Accuracy: 0.9436571002006531


In [None]:
pip install keras-self-attention


Collecting keras-self-attention
  Downloading keras-self-attention-0.51.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25l[?25hdone
  Created wheel for keras-self-attention: filename=keras_self_attention-0.51.0-py3-none-any.whl size=18894 sha256=70acac9258988a5a95a641e3682021c9252333cf828e1aeb1f1fa69b6a539a3b
  Stored in directory: /root/.cache/pip/wheels/b8/f7/24/607b483144fb9c47b4ba2c5fba6b68e54aeee2d5bf6c05302e
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.51.0


In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Dense
from keras_self_attention import SeqSelfAttention

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['tweet'])
sequences = tokenizer.texts_to_sequences(df['tweet'])
max_len = max([len(seq) for seq in sequences])
X = pad_sequences(sequences, maxlen=max_len)

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_len))
model.add(SeqSelfAttention(attention_activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X, df['label'], batch_size=32, epochs=10, validation_split=0.2)

loss, accuracy = model.evaluate(X, df['label'])
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.033407583832740784
Accuracy: 0.9913060069084167


In [None]:
from transformers import BartForSequenceClassification, BartTokenizer
import torch

# Загрузка заранее обученной модели BART
model = BartForSequenceClassification.from_pretrained('facebook/bart-base')

# Загрузка токенизатора для BART
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Указание использования процессора (CPU)
device = torch.device("cpu")

# Перемещение модели и токенизатора на CPU
model.to(device)

# Перебор данных по батчам и инференс каждого батча
max_length = df['tweet'].apply(lambda x: len(tokenizer.encode(x))).max()
for i in range(1, len(df) + 1, max_length):
    batch_texts = df.loc[i:i+max_length-1, 'tweet'].tolist()
    inputs = tokenizer(batch_texts, max_length=64, padding="max_length", truncation=True, return_tensors="pt").to(device)
    outputs = model(**inputs)
    predictions = torch.sigmoid(outputs.logits).cpu().detach().numpy()

    # Обновление столбца 'Predicted Probability' на основе индексов 'id'
    df.loc[df.index.isin(range(i, i+max_length)), 'Predicted Probability'] = [prediction[0] for prediction in predictions]

    # Вывод предсказанных результатов для каждого текста в текущем батче
    for prediction, tweet in zip(predictions, batch_texts):
        print(f"Tweet: {tweet}")
        print(f"Predicted Probability: {prediction[0]}")

df.to_csv('predicted_data.csv')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Tweet:  because ever ones life is a constant persuit for happiness but no one have been able to find a wa to remai 
Predicted Probability: 0.2783881723880768
Tweet: this diagram explains trumps response to orlando rip theresistance resistance resist maga
Predicted Probability: 0.41966331005096436
Tweet: cloudlovers attack bull game d do ou reall think that his head was empt around the cit each side i 
Predicted Probability: 0.3494111895561218
Tweet: babies evenflo lux travel s stem with litemax infant car seat deep lake bouncingbab 
Predicted Probability: 0.3172985911369324
Tweet: when ou order a margarita and its pink drink it before an one see ou drinking it 
Predicted Probability: 0.3415636420249939
Tweet:  saturda alwa s sta blessed and positive 
Predicted Probability: 0.2965677082538605
Tweet: i agree with bernice the items are precious possessions and should not be sold onl handed down to next gen 


In [None]:

model.save_pretrained("/Users/margo/Desktop/archive")


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
df_1 = pd.read_csv('predicted_data.csv', index_col='id')

In [None]:
df_1

Unnamed: 0_level_0,label,tweet,Predicted Probability
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,when a father is d sfunctional and is so self...,0.469131
2,0,thanks for l ft credit i cant use cause the d...,0.357465
3,0,bihda our majest,0.319570
4,0,model i love u take with u all the time in ur,0.300749
5,0,factsguide societ now motivation,0.512455
...,...,...,...
31958,0,ate is that ouuu,0.316073
31959,0,to see nina turner on the airwaves tr ing to ...,0.400379
31960,0,listening to sad songs on a monda morning otw ...,0.287180
31961,1,sikh temple vandalised in in calgar wso conde...,0.303345


In [None]:
import numpy as np

df_1['Predicted Probability'] = np.where(df_1['Predicted Probability'] >= 0.5, 1, 0)
df_1

Unnamed: 0_level_0,label,tweet,Predicted Probability
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,when a father is d sfunctional and is so self...,0
2,0,thanks for l ft credit i cant use cause the d...,0
3,0,bihda our majest,0
4,0,model i love u take with u all the time in ur,0
5,0,factsguide societ now motivation,1
...,...,...,...
31958,0,ate is that ouuu,0
31959,0,to see nina turner on the airwaves tr ing to ...,0
31960,0,listening to sad songs on a monda morning otw ...,0
31961,1,sikh temple vandalised in in calgar wso conde...,0


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

true_labels = df_1['label']
predicted_labels = df_1['Predicted Probability']

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9271322195106689
Precision: 0.12173913043478261
Recall: 0.006244424620874219
F1 Score: 0.011879507848960542
