In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

def is_fake_news_gpt2(news_text):
    prompt = f"Is the following news article real or fake? Please provide a brief explanation as well.\n\nNews article:\n{news_text}\n\nAnswer: "
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, num_return_sequences=1)

    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    answer = answer.replace(prompt, "").strip()

    return answer

news_text = """Scientists have discovered a new species of spider in the Amazon rainforest. The spider is unique because it can fly using wings that extend from its abdomen. Researchers believe this adaptation allows the spider to escape predators and search for food more efficiently."""

result = is_fake_news_gpt2(news_text)
print(result)



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Is the following news article real or fake? Please provide a brief explanation as well.

News article:
Scientists have discovered a new species of spider in the Amazon rainforest. The spider is unique because it can fly using wings that extend from its abdomen. Researchers believe this adaptation allows the spider to escape predators and search for food more efficiently.

Answer:????

The spider is a member of the genus Apis mellifera. It is a member of the genus Ap


In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
import string

# Load the dataset
def load_data(file_path):
    data = pd.read_csv(file_path)
    return data

# Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Convert to lowercase
    tokens = [token.lower() for token in tokens]

    # Remove punctuation
    tokens = [''.join(c for c in token if c not in string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming/Lemmatization
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens

# Load and preprocess data
data = load_data("C:\\Users\\Alex\\Downloads\\WELFake_Dataset.csv")


In [11]:
data['text'] = data['text'].astype(str)

In [12]:
data['tokens'] = data['text'].apply(preprocess_text)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Create document vectors using the mean of word vectors
def document_vector(tokens, model):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

data['document_vector'] = data['tokens'].apply(lambda x: document_vector(x, word2vec_model))

# Extract features (document vectors) and labels
X = np.stack(data['document_vector'].values)
y = data['label'].values


In [18]:
print(X)
print(y)

[[ 0.48924097  0.12713179 -0.27337244 ... -0.26956478 -0.13909675
   1.03828049]
 [ 2.66600227  0.05235812 -0.88148677 ... -0.13322905 -0.52463275
   0.99650037]
 [ 0.70893848 -0.34301457  0.21790981 ... -0.39881271  0.02146312
  -0.21664295]
 ...
 [-0.27936357 -0.64543819  0.28852254 ... -0.36916664 -0.30971491
   0.52842015]
 [ 0.05290093 -0.29853812 -0.04654996 ... -0.12153742  0.16454835
   0.71198934]
 [ 1.42836714 -0.61658293  0.15999073 ... -0.16453208  0.36877975
   0.72680187]]
[1 1 1 ... 0 0 1]


In [16]:
# Save X and y to .npy files
np.save('X.npy', X)
np.save('y.npy', y)

In [17]:
# Load X and y from .npy files
X = np.load('X.npy')
y = np.load('y.npy')

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import TFDistilBertForSequenceClassification

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Gaussian Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

# Define a simple RNN model (use with padded sequences)
def create_rnn_model(input_length):
    model = Sequential()
    model.add(Embedding(input_dim=10000, output_dim=128, input_length=input_length))
    model.add(SimpleRNN(128))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

models["RNN"] = create_rnn_model(input_length=your_padded_sequence_length)

# Define a simple LSTM model (use with padded sequences)
def create_lstm_model(input_length):
    model = Sequential()
    model.add(Embedding(input_dim=10000, output_dim=128, input_length=input_length))
    model.add(LSTM(128))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

models["LSTM"] = create_lstm_model(input_length=your_padded_sequence_length)

# Define a simple GRU model (use with padded sequences)
def create_gru_model(input_length):
    model = Sequential()
    model.add(Embedding(input_dim=10000, output_dim=128, input_length=input_length))
    model.add(GRU(128))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

models["GRU"] = create_gru_model(input_length=your_padded_sequence_length)

# Define a DistilBERT model (use with encoded input from the Hugging Face tokenizer)
def create_distilbert_model():
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
    model.compile(optimizer='adam', loss=model.compute_loss, metrics=['accuracy'])
    return model

models["DistilBERT"] = create_distilbert_model()

# You can also include other transformer-based models like BERT, GPT, RoBERTa, etc.


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(text):
    inputs = tokenizer(text, return_tensors='tf', padding='max_length', truncation=True, max_length=512)
    return inputs

# Assuming 'data' is a pandas DataFrame with columns 'text' (article text) and 'label' (binary label for genuine/fake news)
data['inputs'] = data['text'].apply(tokenize)

def create_bert_model():
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    optimizer = Adam(learning_rate=2e-5, epsilon=1e-08)
    model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
    return model

models["BERT"] = create_bert_model()

X = np.stack(data['inputs'].values)
y = data['label'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the model
models["BERT"].fit(X_train, y_train, batch_size=16, epochs=3, validation_split=0.1)


In [None]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, GPT2Config
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
import tensorflow as tf

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def tokenize(text):
    inputs = tokenizer(text, return_tensors='tf', padding='max_length', truncation=True, max_length=512)
    return inputs

# Assuming 'data' is a pandas DataFrame with columns 'text' (article text) and 'label' (binary label for genuine/fake news)
data['inputs'] = data['text'].apply(tokenize)

def create_gpt2_model():
    gpt2_config = GPT2Config.from_pretrained('gpt2', output_hidden_states=True)
    gpt2_model = TFGPT2LMHeadModel.from_pretrained('gpt2', config=gpt2_config)

    # Define the classification head
    input_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(512,), dtype=tf.int32)
    outputs = gpt2_model(input_ids, attention_mask=attention_mask)
    hidden_states = outputs.hidden_states[-1][:, 0]
    classification_output = Dense(2, activation='softmax')(hidden_states)

    # Combine GPT-2 and the classification head
    model = Model(inputs=[input_ids, attention_mask], outputs=classification_output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

models["GPT-2"] = create_gpt2_model()
X = np.stack(data['inputs'].values)
y = data['label'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the model
models["GPT-2"].fit(X_train, y_train, batch_size=8, epochs=3, validation_split=0.1)
