In [1]:
!pip install datasets -q

In [2]:
from datasets import load_dataset
import pandas as pd

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Show which TensorFlow version the kernel imported.
print(tf.__version__)

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Load the COVID fake-news dataset by its repository id and display the result.
dataset = load_dataset(path='nanyy1025/covid_fake_news')
dataset

In [None]:
# Pull the tweet text and the label column out of each published split.
train_texts, train_labels = dataset['train']['tweet'], dataset['train']['label']
test_texts, test_labels = dataset['test']['tweet'], dataset['test']['label']
validation_texts, validation_labels = dataset['validation']['tweet'], dataset['validation']['label']

# Number of examples per split, in the order train / test / validation.
print(len(train_texts), len(test_texts), len(validation_texts), sep='\n')

In [None]:
# Merge the three published splits (train, test, validation - in that order)
# into a single frame with one row per tweet.
#
# The columns are read straight from `dataset` rather than from the
# train_*/test_* variables: a later cell rebinds train_texts/test_texts to
# pandas Series, so re-running this cell must not depend on them. Reading both
# columns through the same loop also guarantees text and label stay aligned
# (the label column previously ended with the validation *texts*).
split_names = ('train', 'test', 'validation')
df = pd.DataFrame({
    'text': [tweet for name in split_names for tweet in dataset[name]['tweet']],
    'label': [label for name in split_names for label in dataset[name]['label']],
})
df


-----------------------------------------

In [None]:
def encoding(label):
  """Map a label to its numeric class: 1 for 'real', 0 for anything else."""
  if label == 'real':
    return 1
  return 0

# Numeric target column: encoding() is applied to every label.
df['y'] = df['label'].apply(encoding)

In [None]:
# Normalise the tweet text.
# - regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
#   the pattern as a literal string by default, which would leave the text
#   untouched for these patterns.
# - Patterns are raw strings so that \w, \d and \s reach the regex engine
#   instead of being parsed as (invalid) Python string escapes.
# - '@' and '#' are neither word characters nor whitespace, so the punctuation
#   pattern already removes them; no separate replacements are needed.
df['text'] = (
    df['text']
    .str.replace(r'[^\w\s]', '', regex=True)  # drop punctuation (incl. '@' and '#')
    .str.replace(r'\d+', '', regex=True)      # drop runs of digits
    .str.replace(r'\s+', ' ', regex=True)     # collapse whitespace runs to one space
    .str.lower()                              # lowercase
)

df.head()

In [None]:
# Hold out 20% of the merged frame for testing; the fixed random_state keeps
# the shuffle reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['y'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the vocabulary on the training texts only, then convert both splits to
# integer sequences with that same tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_seq, test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)


In [None]:
# Spot-check the token counts of a few training sequences.
for sample_idx in (1, 5, 20):
  print(len(train_seq[sample_idx]))

In [None]:
# Longest tokenised sequence across BOTH splits. The original expression took
# the maximum over train_seq twice, so test_seq was never considered.
max_seq_length = max(
    len(seq) for split in (train_seq, test_seq) for seq in split
)
print(max_seq_length)

In [None]:
# Bring every sequence in both splits to the common length max_seq_length.
train_data, test_data = (
    pad_sequences(sequences, maxlen=max_seq_length)
    for sequences in (train_seq, test_seq)
)

In [None]:
# List the GPU devices TensorFlow can see (an empty list means none).
gpu_devices = tf.config.list_physical_devices('GPU')
print('GPU available: ', gpu_devices)

In [None]:
# Binary text classifier: token ids -> 100-d embeddings -> LSTM(128) -> sigmoid.
model = Sequential()

# Declare the input shape with an explicit Input instead of Embedding's
# `input_length` argument, which is deprecated (and ignored) in Keras 3.
# This also builds the model immediately, so model.input_shape[1] - used by
# classify_text for padding - is defined before training.
model.add(tf.keras.Input(shape=(max_seq_length,)))
# Vocabulary size is len(word_index) + 1 so that index 0 is also a valid row
# of the embedding table.
model.add(Embedding(len(tokenizer.word_index) + 1, 100))
model.add(LSTM(128))
# Single sigmoid unit: the output is a score in [0, 1] for the class encoded as 1.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 6 epochs in mini-batches of 64 and keep the per-epoch metrics.
# NOTE(review): the held-out test split doubles as validation data here, so the
# evaluate() call on the same split afterwards is not an independent estimate.
history = model.fit(
    train_data,
    train_labels,
    validation_data=(test_data, test_labels),
    epochs=6,
    batch_size=64,
)

In [None]:
# Score the trained model on the test split and print loss, then accuracy,
# one value per line.
loss, acc = model.evaluate(test_data, test_labels)
print(loss, acc, sep='\n')

In [None]:
import matplotlib.pyplot as plt

def plot(history, string):
  """Draw one recorded training metric against the epoch index.

  Args:
    history: Object whose ``history`` attribute maps metric names to
      per-epoch value lists (as returned by ``model.fit``).
    string: Name of the metric to plot; also used as the y-axis label.
  """
  values = history.history[string]
  axes = plt.gca()
  axes.plot(values)
  axes.set_xlabel("Epoch")
  axes.set_ylabel(string)
  plt.show()

In [None]:
# Accuracy recorded by model.fit, per epoch.
plot(history, string='accuracy')

In [None]:
# Loss recorded by model.fit, per epoch.
plot(history, string='loss')

In [None]:
import numpy as np

def classify_text(model, tokenizer, text):
  """Classify a single piece of text as real or fake news.

  Args:
    model: Trained model; ``model.input_shape[1]`` is used as the padded
      sequence length and ``model.predict`` must return one score per input
      row (the notebook's model emits a single sigmoid output).
    tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
    text: The text to classify.

  Returns:
    "real news" if the rounded score is 1 (the encoding used for 'real'),
    otherwise "fake news".
  """
  # NOTE(review): `text` is tokenised as given; it does not go through the
  # regex cleaning applied to the training frame - confirm that is acceptable.
  seq = tokenizer.texts_to_sequences([text])
  seq = pad_sequences(seq, maxlen=model.input_shape[1])

  pred = model.predict(seq)
  # One input row and one output unit: index (not call) the prediction to get
  # the scalar score.
  score = float(pred[0][0])
  # Round first, then convert to int. Casting before rounding truncates every
  # score below 1.0 to 0. np.round rounds half to even, so exactly 0.5 -> 0.
  pred_label = int(np.round(score))

  if pred_label == 0:
    return "fake news"
  else:
    return "real news"