<a href="https://colab.research.google.com/github/hemialisaaas/Fake-News-Detection/blob/main/Fake_News_Detection_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install textstat
!pip install datasets

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.0 textstat-0.7.4
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from dat

# **Library**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Read the dataset CSV from the Drive path below.
df = pd.read_csv('/content/drive/MyDrive/Fake News Detection/Shuffle_50k_acak.csv')
print("Dataset loaded successfully.")

# Encode the string labels as integers: "Fake" -> 0, "Real" -> 1.
# Any label not present in the mapping becomes NaN.
label_to_int = dict(Fake=0, Real=1)
df = df.assign(Label=df['Label'].map(label_to_int))
print("Labels mapped successfully.")

# Features are the `stemmed_content` column; the target is the encoded label.
X, y = df['stemmed_content'], df['Label']

# Hold out 20% of the rows for testing; `stratify=y` keeps the class ratio
# the same in both parts and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Build the vocabulary from the training texts only (num_words=5000 caps the
# word indices that are emitted; unknown words map to the '<OOV>' token).
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Turn both splits into integer sequences with the same fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad or truncate at the END of every sequence so each row has `max_length` ids.
max_length = 100
pad_options = dict(maxlen=max_length, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Random oversampling (no SMOTE): put the padded sequences and their labels in
# one frame so rows and labels are resampled together.
train_data = pd.DataFrame(X_train_pad).assign(label=y_train.values)

# Label 1 is treated as the majority class and label 0 as the minority class.
majority = train_data.loc[train_data['label'].eq(1)]
minority = train_data.loc[train_data['label'].eq(0)]

# Draw minority rows with replacement until both classes have the same size.
minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=majority.shape[0],
    random_state=42,
)
balanced_data = pd.concat([majority, minority_upsampled])

# Split the balanced frame back into a feature matrix and a label vector.
y_train_resampled = balanced_data['label'].to_numpy()
X_train_resampled = balanced_data.drop(columns='label').to_numpy()

# Show how many rows each class has after oversampling.
print("Label distribution after oversampling:")
print(pd.Series(y_train_resampled).value_counts())

def create_model():
    """Build and compile the single-layer LSTM binary classifier.

    Returns:
        A compiled Keras ``Sequential`` model made of a trainable
        Embedding (5000 ids -> 32 dims), SpatialDropout1D(0.6), an 8-unit
        LSTM with L2(0.01) kernel regularisation, Dropout(0.6) and a
        single sigmoid output unit with L2(0.01). It is compiled with
        binary cross-entropy, the Adam optimizer and the accuracy metric.
    """
    layer_stack = [
        Embedding(input_dim=5000, output_dim=32, trainable=True),
        SpatialDropout1D(0.6),
        LSTM(units=8, kernel_regularizer=l2(0.01)),
        Dropout(0.6),
        Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Callbacks
# Stop once val_loss has not improved for one epoch (restoring the best
# weights) and halve the learning rate on a one-epoch plateau, down to 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-6)

# 5-fold cross-validation on the oversampled training data.
#
# The minority class was oversampled WITH replacement before this point, so
# `balanced_data` holds exact copies of the same training row. Splitting the
# resampled rows directly places copies of one row on both sides of a fold,
# which leaks training data into validation and inflates val_loss/val_accuracy.
# `balanced_data` keeps the index label of the training row each resampled row
# was drawn from, so the folds are cut over the UNIQUE source rows and every
# copy of a row follows it to the same side of the split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
source_ids = balanced_data.index
unique_source_ids = source_ids.unique().to_numpy()

# (best val_loss, val_accuracy at that epoch) for every fold.
fold_val_scores = []

for fold, (_, val_pos) in enumerate(kf.split(unique_source_ids), start=1):
    print(f"\nTraining Fold {fold}...")

    # Boolean mask over the resampled rows: True for rows whose source row
    # belongs to this fold's validation part.
    in_val = source_ids.isin(unique_source_ids[val_pos])
    X_train_fold, X_val_fold = X_train_resampled[~in_val], X_train_resampled[in_val]
    y_train_fold, y_val_fold = y_train_resampled[~in_val], y_train_resampled[in_val]

    # A fresh model per fold so no weights carry over between folds.
    model = create_model()
    history = model.fit(
        X_train_fold, y_train_fold,
        validation_data=(X_val_fold, y_val_fold),
        epochs=10,
        batch_size=128,
        callbacks=[early_stopping, lr_scheduler],
        verbose=1
    )

    # Keep the validation score of the best epoch instead of discarding it.
    val_losses = history.history['val_loss']
    best_epoch = val_losses.index(min(val_losses))
    fold_val_scores.append((val_losses[best_epoch], history.history['val_accuracy'][best_epoch]))

# Summarise the cross-validation run. Note that `model` is the model of the
# LAST fold only; that is the one used by the evaluation code that follows.
mean_val_loss = sum(loss for loss, _ in fold_val_scores) / len(fold_val_scores)
mean_val_accuracy = sum(acc for _, acc in fold_val_scores) / len(fold_val_scores)
print(f"\nCross-validation mean val_loss: {mean_val_loss:.4f}, mean val_accuracy: {mean_val_accuracy:.4f}")

# Score the held-out test set with the current `model` and turn the sigmoid
# outputs into hard 0/1 predictions at a 0.5 threshold.
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

# Compute the four scalar metrics in one pass, then the confusion matrix.
precision, recall, f1, accuracy = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score, accuracy_score)
)
cm = confusion_matrix(y_test, y_pred)

# Report the results.
print("\nFinal Metrics:")
for metric_name, metric_value in (
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
    ("Accuracy", accuracy),
):
    print(f"{metric_name}: {metric_value}")
print("Confusion Matrix:")
print(cm)


# Save predictions
# `y_test.index` already holds the row labels of `df` for the test rows.
# The previous `df.iloc[y_test.index].index` used those LABELS as POSITIONS,
# which only gives the right ids while `df` has its default 0..n-1 index.
prediction_df = pd.DataFrame({
    'id_berita': y_test.index,
    'actual_label': y_test.values,
    'predicted_label': y_pred.flatten(),
    'predicted_prob': y_pred_prob.flatten()
})

# Turn the integer classes back into their string labels for the report.
int_to_label = {0: 'Fake', 1: 'Real'}
prediction_df['predicted_label'] = prediction_df['predicted_label'].map(int_to_label)
prediction_df['actual_label'] = prediction_df['actual_label'].map(int_to_label)

# Sort by predicted probability and display top 10 predictions
# (highest sigmoid output first, i.e. the rows scored closest to class 1).
top_predictions = prediction_df.sort_values(by='predicted_prob', ascending=False).head(10)
print("\nTop 10 Predictions:")
print(top_predictions)

# Save all predictions and top 10 predictions
prediction_df.to_csv('prediksi_berita_lstm_oversample.csv', index=False)
top_predictions.to_csv('top_10_predictions.csv', index=False)

print("Predictions saved to 'prediksi_berita_lstm_oversample.csv'.")
print("Top 10 predictions saved to 'top_10_predictions.csv'.")




Dataset loaded successfully.
Labels mapped successfully.
Label distribution after oversampling:
1    26480
0    26480
Name: count, dtype: int64

Training Fold 1...
Epoch 1/10
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.7711 - loss: 0.7453 - val_accuracy: 0.9944 - val_loss: 0.1512 - learning_rate: 0.0010
Epoch 2/10
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9794 - loss: 0.1834 - val_accuracy: 0.9945 - val_loss: 0.0941 - learning_rate: 0.0010
Epoch 3/10
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step - accuracy: 0.9743 - loss: 0.1845 - val_accuracy: 0.9951 - val_loss: 0.0874 - learning_rate: 0.0010
Epoch 4/10
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step - accuracy: 0.9635 - loss: 0.2158 - val_accuracy: 0.9950 - val_loss: 0.1026 - learning_rate: 0.0010

Training Fold 2...
Epoch 1/10
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Read the dataset CSV from the Drive path below.
df = pd.read_csv('/content/drive/MyDrive/Fake News Detection/Shuffle_50k_acak.csv')
print("Dataset loaded successfully.")

# Encode the string labels as integers ("Fake" -> 0, "Real" -> 1).
label_to_int = {"Fake": 0, "Real": 1}
encoded_labels = df['Label'].map(label_to_int)
df['Label'] = encoded_labels
print("Labels mapped successfully.")

# Target layout: 65% train, 15% validation, 20% test.
X, y = df['stemmed_content'], df['Label']

# First cut: 65% for training, the remaining 35% is split again below.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.35,
    stratify=y,
    random_state=42,
)

# Second cut: 0.5714 of the 35% remainder (= 20% / 35%) becomes the test set,
# the rest (15% of all rows) becomes the validation set.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5714,
    stratify=y_temp,
    random_state=42,
)

print(f"Train size: {len(X_train)}, Validation size: {len(X_valid)}, Test size: {len(X_test)}")

# Fit the tokenizer on the training texts only, so the validation and test
# splits do not influence the vocabulary.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Integer-encode the three splits with the same tokenizer.
X_train_seq, X_valid_seq, X_test_seq = map(
    tokenizer.texts_to_sequences, (X_train, X_valid, X_test)
)

# Pad/truncate at the end of each sequence to a fixed length of `max_length`.
max_length = 100
X_train_pad, X_valid_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    for sequences in (X_train_seq, X_valid_seq, X_test_seq)
)

# Random oversampling of the training split only (no SMOTE). Sequences and
# labels share one frame so they stay aligned while rows are resampled.
train_data = pd.DataFrame(X_train_pad)
train_data['label'] = y_train.values

# Label 1 is treated as the majority class, label 0 as the minority class.
labels = train_data['label']
majority, minority = train_data[labels == 1], train_data[labels == 0]

# Sample minority rows with replacement up to the majority class size.
target_size = len(majority)
minority_upsampled = resample(minority, replace=True, n_samples=target_size, random_state=42)
balanced_data = pd.concat([majority, minority_upsampled])

# Separate the balanced frame into the model inputs and targets.
feature_frame = balanced_data.drop(columns=['label'])
X_train_resampled = feature_frame.values
y_train_resampled = balanced_data['label'].values

# Show the class counts after oversampling.
print("Label distribution after oversampling:")
print(pd.Series(y_train_resampled).value_counts())

def create_model():
    """Build and compile the two-layer (stacked) LSTM binary classifier.

    Returns:
        A compiled Keras ``Sequential`` model: trainable Embedding
        (5000 ids -> 64 dims), SpatialDropout1D(0.4), a 32-unit LSTM that
        returns the full sequence, Dropout(0.4), a 16-unit LSTM and a single
        sigmoid output unit. The LSTM and Dense kernels use L2(0.01). The model
        is compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    layer_stack = [
        Embedding(input_dim=5000, output_dim=64, trainable=True),
        # Dropout on the embedded sequence to reduce overfitting.
        SpatialDropout1D(0.4),
        # First LSTM emits every time step so a second LSTM can be stacked on it.
        LSTM(units=32, kernel_regularizer=l2(0.01), return_sequences=True),
        # Additional dropout after the first LSTM.
        Dropout(0.4),
        # Second LSTM for a deeper representation; returns only the last state.
        LSTM(units=16, kernel_regularizer=l2(0.01)),
        Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Callbacks: stop after 3 epochs without val_loss improvement (restoring the
# best weights) and halve the learning rate after 2 such epochs, down to 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)

# Train on the oversampled training data; the validation split is used as-is.
model = create_model()
history = model.fit(
    x=X_train_resampled,
    y=y_train_resampled,
    validation_data=(X_valid_pad, y_valid),
    epochs=20,
    batch_size=128,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1,
)

# Predict on the test set and threshold the sigmoid outputs at 0.5.
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

# Test-set metrics.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print("\nFinal Metrics:")
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    f"Accuracy: {accuracy}",
    "Confusion Matrix:",
    sep="\n",
)
print(cm)

# Save predictions
# `y_test.index` already holds the row labels of `df` for the test rows.
# The previous `df.iloc[y_test.index].index` used those LABELS as POSITIONS,
# which only gives the right ids while `df` has its default 0..n-1 index.
prediction_df = pd.DataFrame({
    'id_berita': y_test.index,
    'actual_label': y_test.values,
    'predicted_label': y_pred.flatten(),
    'predicted_prob': y_pred_prob.flatten()
})

# Turn the integer classes back into their string labels for the report.
int_to_label = {0: 'Fake', 1: 'Real'}
prediction_df['predicted_label'] = prediction_df['predicted_label'].map(int_to_label)
prediction_df['actual_label'] = prediction_df['actual_label'].map(int_to_label)

# Sort by predicted probability and display top 10 predictions
top_predictions = prediction_df.sort_values(by='predicted_prob', ascending=False).head(10)
print("\nTop 10 Predictions:")
print(top_predictions)

# Save all predictions and top 10 predictions
# NOTE(review): these are the same two file names the K-fold LSTM cell writes,
# so running both cells leaves only this cell's results on disk. Consider
# giving this experiment its own file names.
prediction_df.to_csv('prediksi_berita_lstm_oversample.csv', index=False)
top_predictions.to_csv('top_10_predictions.csv', index=False)

print("Predictions saved to 'prediksi_berita_lstm_oversample.csv'.")
print("Top 10 predictions saved to 'top_10_predictions.csv'.")


Dataset loaded successfully.
Labels mapped successfully.
Train size: 32730, Validation size: 7554, Test size: 10071
Label distribution after oversampling:
1    21515
0    21515
Name: count, dtype: int64
Epoch 1/20
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 86ms/step - accuracy: 0.8876 - loss: 0.8990 - val_accuracy: 0.9011 - val_loss: 0.3691 - learning_rate: 0.0010
Epoch 2/20
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 75ms/step - accuracy: 0.9262 - loss: 0.2805 - val_accuracy: 0.9697 - val_loss: 0.1675 - learning_rate: 0.0010
Epoch 3/20
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 72ms/step - accuracy: 0.9736 - loss: 0.1622 - val_accuracy: 0.9673 - val_loss: 0.1721 - learning_rate: 0.0010
Epoch 4/20
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 72ms/step - accuracy: 0.9710 - loss: 0.1656 - val_accuracy: 0.9611 - val_loss: 0.1813 - learning_rate: 0.0010
Epoch 5/20
[1m337/337[0m [32m━━━━━━━━━━━━━

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np

# Read the dataset CSV from the Drive path below.
df = pd.read_csv('/content/drive/MyDrive/Fake News Detection/Shuffle_50k_acak.csv')
print("Dataset loaded successfully.")

# Encode the string labels as integers ("Fake" -> 0, "Real" -> 1).
label_to_int = dict(Fake=0, Real=1)
df['Label'] = df['Label'].map(label_to_int)
print("Labels mapped successfully.")

# Target layout: 65% train, 15% validation, 20% test.
X = df['stemmed_content']
y = df['Label']

# Both splits are stratified and use the same seed.
seed = 42

# First cut: 65% train, 35% held back for the second cut.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.35, stratify=y, random_state=seed
)

# Second cut: 0.5714 of the held-back 35% (= 20% / 35%) is the test set and
# the remainder is the validation set.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5714, stratify=y_temp, random_state=seed
)

print(f"Train size: {len(X_train)}, Validation size: {len(X_valid)}, Test size: {len(X_test)}")

# Preprocess text for Word2Vec
def preprocess_text(text):
    """Lower-case a document and split it into whitespace-separated tokens.

    Args:
        text: The raw document. Anything that is not a string (for example
            ``None``, or the float NaN pandas uses for an empty CSV cell) is
            treated as an empty document instead of raising
            ``AttributeError`` on ``.lower()``.

    Returns:
        list[str]: The lower-cased tokens, or ``[]`` for empty/missing text.
    """
    if not isinstance(text, str):
        return []
    # str.split() with no argument splits on any run of whitespace and drops
    # leading/trailing blanks, so no empty tokens are produced.
    return text.lower().split()

# Tokenise every split with the same preprocessing function.
X_train_preprocessed, X_valid_preprocessed, X_test_preprocessed = (
    split.apply(preprocess_text) for split in (X_train, X_valid, X_test)
)

# Learn 100-dimensional word vectors from the TRAINING tokens only
# (context window of 5, every word kept because min_count=1, 4 worker threads).
print("Training Word2Vec model...")
word2vec_options = dict(vector_size=100, window=5, min_count=1, workers=4)
word2vec_model = Word2Vec(sentences=X_train_preprocessed, **word2vec_options)
print("Word2Vec model trained.")

# Build embedding matrix
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_preprocessed)

word_index = tokenizer.word_index

# `word_index` lists EVERY word seen during fitting, but with `num_words` set
# the tokenizer only emits indices below `num_words` (rarer words are replaced
# by the OOV index). Sizing the matrix from the full `word_index` therefore
# allocated rows that can never be looked up, so cap it at the reachable range.
vocab_size = min(tokenizer.num_words, len(word_index) + 1)

embedding_dim = 100  # Same as Word2Vec vector size
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the Word2Vec vector of every reachable word into its row. Row 0
# (padding) and words without a vector (e.g. the OOV token) stay all-zero.
for word, i in word_index.items():
    if i < vocab_size and word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

print(f"Embedding matrix shape: {embedding_matrix.shape}")

# Integer-encode the pre-tokenised splits with the fitted tokenizer.
X_train_seq, X_valid_seq, X_test_seq = (
    tokenizer.texts_to_sequences(tokens)
    for tokens in (X_train_preprocessed, X_valid_preprocessed, X_test_preprocessed)
)

# Pad/truncate at the end of each sequence to a fixed length of `max_length`.
max_length = 100
pad_options = {'maxlen': max_length, 'padding': 'post', 'truncating': 'post'}
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_valid_pad = pad_sequences(X_valid_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Define LSTM model using Word2Vec embedding
def create_model():
    """Build and compile the stacked LSTM that uses the Word2Vec embeddings.

    Reads the module-level ``vocab_size``, ``embedding_dim`` and
    ``embedding_matrix`` to initialise the embedding layer.

    Returns:
        A compiled Keras ``Sequential`` model: frozen Embedding initialised
        from ``embedding_matrix``, SpatialDropout1D(0.3), a 32-unit LSTM that
        returns the full sequence, Dropout(0.3), a 16-unit LSTM and a single
        sigmoid output unit. It is compiled with binary cross-entropy, Adam
        and the accuracy metric.
    """
    model = Sequential()
    # `input_length` is no longer passed: it is deprecated in current Keras
    # (it only triggers a warning) and the sequence length is inferred from
    # the padded input at fit time.
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        trainable=False))  # Freeze the embedding layer
    model.add(SpatialDropout1D(0.3))
    # return_sequences=True so the second LSTM receives every time step.
    model.add(LSTM(units=32, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(units=16))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Callbacks: stop after 2 epochs without val_loss improvement (restoring the
# best weights) and halve the learning rate after 2 such epochs, down to 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)
training_callbacks = [early_stopping, lr_scheduler]

# Train on the (not oversampled) training split, validating on the validation split.
model = create_model()
history = model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_valid_pad, y_valid),
    epochs=10,
    batch_size=64,
    callbacks=training_callbacks,
    verbose=1,
)

# Predict on the test set and threshold the sigmoid outputs at 0.5.
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

# Test-set metrics.
precision, recall = precision_score(y_test, y_pred), recall_score(y_test, y_pred)
f1, accuracy = f1_score(y_test, y_pred), accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

report_lines = [
    "\nFinal Metrics:",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    f"Accuracy: {accuracy}",
    "Confusion Matrix:",
]
for report_line in report_lines:
    print(report_line)
print(cm)

# Save predictions
# `y_test.index` already holds the row labels of `df` for the test rows.
# The previous `df.iloc[y_test.index].index` used those LABELS as POSITIONS,
# which only gives the right ids while `df` has its default 0..n-1 index.
prediction_df = pd.DataFrame({
    'id_berita': y_test.index,
    'actual_label': y_test.values,
    'predicted_label': y_pred.flatten(),
    'predicted_prob': y_pred_prob.flatten()
})

# Turn the integer classes back into their string labels for the report.
int_to_label = {0: 'Fake', 1: 'Real'}
prediction_df['predicted_label'] = prediction_df['predicted_label'].map(int_to_label)
prediction_df['actual_label'] = prediction_df['actual_label'].map(int_to_label)

# Sort by predicted probability and display top 10 predictions
top_predictions = prediction_df.sort_values(by='predicted_prob', ascending=False).head(10)
print("\nTop 10 Predictions:")
print(top_predictions)

# Save all predictions and top 10 predictions
prediction_df.to_csv('prediksi_berita_word2vec_lstm.csv', index=False)
top_predictions.to_csv('top_10_predictions_word2vec_lstm.csv', index=False)

print("Predictions saved to 'prediksi_berita_word2vec_lstm.csv'.")
print("Top 10 predictions saved to 'top_10_predictions_word2vec_lstm.csv'.")


Dataset loaded successfully.
Labels mapped successfully.
Train size: 32730, Validation size: 7554, Test size: 10071
Training Word2Vec model...
Word2Vec model trained.
Embedding matrix shape: (68136, 100)
Epoch 1/10




[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 59ms/step - accuracy: 0.8911 - loss: 0.2345 - val_accuracy: 0.9950 - val_loss: 0.0292 - learning_rate: 0.0010
Epoch 2/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 59ms/step - accuracy: 0.9904 - loss: 0.0442 - val_accuracy: 0.9981 - val_loss: 0.0125 - learning_rate: 0.0010
Epoch 3/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 60ms/step - accuracy: 0.9947 - loss: 0.0245 - val_accuracy: 0.9995 - val_loss: 0.0052 - learning_rate: 0.0010
Epoch 4/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 59ms/step - accuracy: 0.9952 - loss: 0.0202 - val_accuracy: 0.9999 - val_loss: 0.0022 - learning_rate: 0.0010
Epoch 5/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 59ms/step - accuracy: 0.9982 - loss: 0.0083 - val_accuracy: 0.9999 - val_loss: 0.0011 - learning_rate: 0.0010
Epoch 6/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3