In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Load datasets
fake_df = pd.read_csv("Fake.csv")
real_df = pd.read_csv("True.csv")

# Add labels: 0 = Fake, 1 = Real
fake_df["label"] = 0
real_df["label"] = 1

# Combine datasets
df = pd.concat([fake_df, real_df]).reset_index(drop=True)

# Shuffle data
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Drop unnecessary columns if they exist
df = df[['text', 'label']]

# Define text preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return " ".join(words)

# Apply preprocessing
df["clean_text"] = df["text"].apply(preprocess_text)

# Convert text to numerical features (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training samples: 35918, Testing samples: 8980


In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Train Naïve Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Evaluate
y_pred_nb = nb_model.predict(X_test)
print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

# Save model
joblib.dump(nb_model, "naive_bayes_model.pkl")


Naïve Bayes Accuracy: 0.9300668151447662
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4710
           1       0.93      0.93      0.93      4270

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980



['naive_bayes_model.pkl']

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# Save model
joblib.dump(rf_model, "random_forest_model.pkl")


Random Forest Accuracy: 0.9974387527839643
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4710
           1       1.00      1.00      1.00      4270

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



['random_forest_model.pkl']

In [26]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split into training & testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Create and fit TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
vectorizer.fit(train_texts)

# Save TF-IDF vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("TF-IDF Vectorizer has been created and saved successfully!")


TF-IDF Vectorizer has been created and saved successfully!


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['clean_text'])

# Convert text to sequences
X_sequences = tokenizer.texts_to_sequences(df['clean_text'])
X_padded = pad_sequences(X_sequences, maxlen=500, padding='post')

# Split data
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Define LSTM model
embedding_dim = 128

lstm_model = Sequential([
    Embedding(input_dim=5000, output_dim=embedding_dim, input_length=500),
    LSTM(100, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile model
lstm_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train model
lstm_model.fit(X_train_lstm, y_train_lstm, epochs=5, batch_size=32, validation_data=(X_test_lstm, y_test_lstm))

# Save model
lstm_model.save("lstm_model.h5")
joblib.dump(tokenizer, "tokenizer.pkl")




Epoch 1/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1278s[0m 1s/step - accuracy: 0.5194 - loss: 0.6892 - val_accuracy: 0.5415 - val_loss: 0.8501
Epoch 2/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1255s[0m 1s/step - accuracy: 0.5478 - loss: 0.6623 - val_accuracy: 0.9616 - val_loss: 0.1282
Epoch 3/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1354s[0m 1s/step - accuracy: 0.9638 - loss: 0.1228 - val_accuracy: 0.9893 - val_loss: 0.0297
Epoch 4/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1354s[0m 1s/step - accuracy: 0.9910 - loss: 0.0316 - val_accuracy: 0.9930 - val_loss: 0.0334
Epoch 5/5
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1354s[0m 1s/step - accuracy: 0.9947 - loss: 0.0199 - val_accuracy: 0.9951 - val_loss: 0.0169




['tokenizer.pkl']

In [18]:
!pip install streamlit
!pip install pyngrok

Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.2-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[

In [23]:
!ngrok authtoken 2uaQbgREFUSAmbgpx1VPWVMdMFe_2SYiPu9V2ELnCxHDhAavP


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
