In [None]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset (one JSON record per line) directly from the zipped file URL.
url = "https://storage.yandexcloud.net/academy.ai/Sarcasm_Headlines_Dataset_v2.json.zip"
df = pd.read_json(url, lines=True)

# Features are the headline texts; the target is the `is_sarcastic` column.
X, y = df['headline'], df['is_sarcastic']

# Fetch the NLTK resources needed for tokenization, stop-word removal
# and lemmatization.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared lemmatizer instance and the English stop-word list as a set
# (set membership checks are cheap).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one headline into a space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric, or that appear in ``stop_words``, are
    dropped; every remaining token is lemmatized with ``lemmatizer``.

    Returns the surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for word in word_tokenize(text.lower()):
        # Skip punctuation / mixed tokens and stop words.
        if not word.isalnum() or word in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(kept_lemmas)

# Clean every headline (tokenize, drop stop words / non-alphanumeric tokens,
# lemmatize). The raw `X` series is left untouched.
X_clean = X.apply(preprocess_text)

# Split BEFORE vectorizing: fitting TF-IDF on the full corpus would let the
# test headlines influence the vocabulary and IDF weights (data leakage) and
# make the reported accuracy optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_clean, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary/weights from the training text only; the
# held-out text is transformed with the already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train the classifier.
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Точность модели: {accuracy:.2f}")

# Try the trained model on a few hand-written headlines.
new_headlines = [
    "Scientists discover new species of unicorn",
    "Government announces plan to solve world hunger by 2025",
    "Man survives 30 days on nothing but pizza and soda",
    "Study finds that exercise is bad for your health",
    "New study shows that coffee is the secret to long life"
]

# Apply the same cleaning function and the already-fitted vectorizer.
new_headlines_processed = list(map(preprocess_text, new_headlines))
new_headlines_vectorized = vectorizer.transform(new_headlines_processed)

predictions = model.predict(new_headlines_vectorized)

# Print one line per headline with the predicted class.
for idx, headline in enumerate(new_headlines):
    verdict = 'Да' if predictions[idx] else 'Нет'
    print(f"Заголовок: {headline} | Сарказм: {verdict}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Точность модели: 0.79
Заголовок: Scientists discover new species of unicorn | Сарказм: Да
Заголовок: Government announces plan to solve world hunger by 2025 | Сарказм: Да
Заголовок: Man survives 30 days on nothing but pizza and soda | Сарказм: Да
Заголовок: Study finds that exercise is bad for your health | Сарказм: Да
Заголовок: New study shows that coffee is the secret to long life | Сарказм: Да


In [None]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from kerastuner.tuners import RandomSearch

# Load the dataset (one JSON record per line) directly from the zipped file URL.
url = "https://storage.yandexcloud.net/academy.ai/Sarcasm_Headlines_Dataset_v2.json.zip"
df = pd.read_json(url, lines=True)

# Plain Python lists: headline texts as features, `is_sarcastic` as target.
X, y = (df[column].tolist() for column in ('headline', 'is_sarcastic'))

# Fetch the NLTK resources needed for tokenization, stop-word removal
# and lemmatization.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared lemmatizer instance and the English stop-word list as a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one headline into a space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric, or that appear in ``stop_words``, are
    dropped; every remaining token is lemmatized with ``lemmatizer``.

    Returns the surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for word in word_tokenize(text.lower()):
        # Skip punctuation / mixed tokens and stop words.
        if not word.isalnum() or word in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(kept_lemmas)

# Clean every headline with the shared preprocessing function.
X = list(map(preprocess_text, X))

# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The labels come back as Python lists; convert them to NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

# Build the word index from the training texts only, with num_words=10000.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to a fixed length of 128.
X_train_pad = pad_sequences(X_train_seq, maxlen=128)
X_test_pad = pad_sequences(X_test_seq, maxlen=128)

# Использование KerasTuner для настройки гиперпараметров
def build_model(hp):
    """Build and compile an Embedding -> LSTM -> Dropout -> Dense(sigmoid) model.

    ``hp`` is the hyperparameter object supplied by the tuner. Four values are
    drawn from it:

    * ``embedding_dim``  - embedding size, 32..128 in steps of 32
    * ``lstm_units``     - LSTM width, 32..128 in steps of 32
    * ``dropout_rate``   - dropout rate, 0.0..0.5 in steps of 0.1
    * ``learning_rate``  - Adam learning rate, one of 1e-2 / 1e-3 / 1e-4

    Returns the compiled ``tf.keras.Sequential`` model (binary cross-entropy
    loss, accuracy metric).
    """
    layers = tf.keras.layers

    # Draw the hyperparameters up front, in the same order as the layers use them.
    embedding_dim = hp.Int('embedding_dim', min_value=32, max_value=128, step=32)
    lstm_units = hp.Int('lstm_units', min_value=32, max_value=128, step=32)
    dropout_rate = hp.Float('dropout_rate', min_value=0.0, max_value=0.5, step=0.1)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # input_dim / input_length mirror the tokenizer's num_words and the padding
    # length used elsewhere in this cell.
    model = tf.keras.Sequential([
        layers.Embedding(input_dim=10000, output_dim=embedding_dim, input_length=128),
        layers.LSTM(units=lstm_units),
        layers.Dropout(rate=dropout_rate),
        layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Random search over the hyperparameters declared in build_model.
# NOTE: KerasTuner reuses an existing project found under
# my_dir/sarcasm_detection (the recorded run printed "Reloading Tuner from
# ..."). Delete that directory or pass overwrite=True to force a fresh search,
# e.g. after changing the data or the validation setup.
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=6,
    executions_per_trial=3,
    directory='my_dir',
    project_name='sarcasm_detection')

# Select hyperparameters on a validation slice carved out of the TRAINING data.
# Validating on the test set would tune the model on the same samples that the
# final accuracy is reported on and make that number optimistic.
tuner.search(X_train_pad, y_train, epochs=8, validation_split=0.2)

# Retrieve the best model found by the search.
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate on the untouched test set: threshold the sigmoid outputs at 0.5.
y_pred = (best_model.predict(X_test_pad).ravel() > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
print(f"Точность модели (KerasTuner): {accuracy:.2f}")

# Try the tuned model on a few hand-written headlines.
new_headlines = [
    "Scientists discover new species of unicorn",
    "Government announces plan to solve world hunger by 2025",
    "Man survives 30 days on nothing but pizza and soda",
    "Study finds that exercise is bad for your health",
    "New study shows that coffee is the secret to long life"
]

# Same pipeline as for the training data: clean -> id sequences -> pad to 128.
new_headlines_processed = list(map(preprocess_text, new_headlines))
new_headlines_seq = tokenizer.texts_to_sequences(new_headlines_processed)
new_headlines_pad = pad_sequences(new_headlines_seq, maxlen=128)

# Sigmoid outputs -> hard 0/1 labels with a 0.5 threshold.
predictions = best_model.predict(new_headlines_pad)
predictions = (predictions > 0.5).astype(int).ravel().tolist()

# Print one line per headline with the predicted class.
for idx, headline in enumerate(new_headlines):
    verdict = 'Да' if predictions[idx] else 'Нет'
    print(f"Заголовок: {headline} | Сарказм: {verdict}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Reloading Tuner from my_dir/sarcasm_detection/tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))


[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Точность модели (KerasTuner): 0.81
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Заголовок: Scientists discover new species of unicorn | Сарказм: Да
Заголовок: Government announces plan to solve world hunger by 2025 | Сарказм: Да
Заголовок: Man survives 30 days on nothing but pizza and soda | Сарказм: Да
Заголовок: Study finds that exercise is bad for your health | Сарказм: Да
Заголовок: New study shows that coffee is the secret to long life | Сарказм: Да


In [None]:
!pip install transformers
!pip install torch
import pandas as pd
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import accuracy_score

# Load the dataset (one JSON record per line) directly from the zipped file URL.
url = "https://storage.yandexcloud.net/academy.ai/Sarcasm_Headlines_Dataset_v2.json.zip"
df = pd.read_json(url, lines=True)

# Plain Python lists: headline texts as features, `is_sarcastic` as target.
X, y = (df[column].tolist() for column in ('headline', 'is_sarcastic'))

# Pretrained BERT tokenizer plus a sequence-classification model with 2 labels.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize all headlines in one call, with padding enabled and truncation
# at 128 tokens; tensors are returned in PyTorch format.
encoded_inputs = tokenizer(
    X,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt',
)
input_ids, attention_masks = encoded_inputs['input_ids'], encoded_inputs['attention_mask']
labels = torch.tensor(y)

# Wrap the tensors in a dataset and split it 80/20 into train and test parts.
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split explicitly: without a generator, random_split draws from the
# global RNG, so every run evaluates on a different test subset and the
# reported accuracy is not reproducible.
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# Shuffle only the training batches; keep evaluation order fixed.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Pick the device and move the model there before training.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Use PyTorch's own AdamW: the `AdamW` class exported by `transformers` is
# deprecated in favour of `torch.optim.AdamW`.
# weight_decay=0.0 is set explicitly to stay close to the previous optimizer's
# behaviour (torch's default is 0.01) - NOTE(review): confirm this is desired.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# Fine-tune for a fixed number of epochs. The loss is computed by the model
# itself because `labels` are passed to the forward call.
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        # Batch-local names, so the full-dataset tensors `input_ids`,
        # `attention_masks` and `labels` are not overwritten by a mini-batch.
        batch_input_ids, batch_attention_masks, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Mean loss over the batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Average training loss: {avg_train_loss:.2f}")

# Evaluate on the held-out split with the model in eval mode and gradients off.
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for ids, mask, targets in test_dataloader:
        ids, mask, targets = ids.to(device), mask.to(device), targets.to(device)
        batch_logits = model(ids, attention_mask=mask).logits
        # Predicted class = index of the largest logit for each sample.
        batch_predictions = batch_logits.argmax(dim=1)
        y_true.extend(targets.cpu().numpy())
        y_pred.extend(batch_predictions.cpu().numpy())

accuracy = accuracy_score(y_true, y_pred)
print(f"Точность модели: {accuracy:.2f}")

# Try the fine-tuned model on a few hand-written headlines.
new_headlines = [
    "Scientists discover new species of unicorn",
    "Government announces plan to solve world hunger by 2025",
    "Man survives 30 days on nothing but pizza and soda",
    "Study finds that exercise is bad for your health",
    "New study shows that coffee is the secret to long life"
]

# Tokenize with the same settings as the training data and move to the device.
new_headlines_encoded = tokenizer(
    new_headlines,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt',
)
new_input_ids = new_headlines_encoded['input_ids'].to(device)
new_attention_masks = new_headlines_encoded['attention_mask'].to(device)

# Forward pass without gradients; the predicted class is the arg-max logit.
model.eval()
with torch.no_grad():
    logits = model(new_input_ids, attention_mask=new_attention_masks).logits
    predictions = logits.argmax(dim=1)

# Print one line per headline with the predicted class.
for headline, label in zip(new_headlines, predictions.tolist()):
    print(f"Заголовок: {headline} | Сарказм: {'Да' if label else 'Нет'}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average training loss: 0.28
Epoch 2/3, Average training loss: 0.12
Epoch 3/3, Average training loss: 0.05
Точность модели: 0.93
Заголовок: Scientists discover new species of unicorn | Сарказм: Да
Заголовок: Government announces plan to solve world hunger by 2025 | Сарказм: Да
Заголовок: Man survives 30 days on nothing but pizza and soda | Сарказм: Да
Заголовок: Study finds that exercise is bad for your health | Сарказм: Да
Заголовок: New study shows that coffee is the secret to long life | Сарказм: Нет


In [None]:
#
#
#Выводы:
#
#Используя логистическую регрессию, получилось довести точность до 79 процентов. LSTM-модель с подбором гиперпараметров через KerasTuner дала результат в 81%. После этого взяли BERT и получили точность 93 процента.

#Время обучения: логистическая регрессия и модель с KerasTuner обучаются заметно быстрее, чем BERT, однако BERT сразу, без подбора гиперпараметров, даёт разительно более высокую точность.

#Гиперпараметры: Можно попробовать настроить гиперпараметры модели (например, количество эпох, размер батча, скорость обучения) для дальнейшего улучшения точности в случае с Keras.

