In [5]:

# Cell 1: Import additional required libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidataVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
data = pd.read_csv("cleaned_misinformation_dataset.csv")
data.head()

Unnamed: 0,clean_text,likes,shares,comments,sentiment_score,contains_link,contains_hashtag,platform_encoded,device_encoded,verified_encoded,target
0,small citizen class morning,79,62,27,0.04,0,0,2,1,1,0
1,others kind company likely improve notice meet...,78,27,31,-0.28,0,0,0,0,0,1
2,check real leader bad school name care several...,57,111,99,0.27,0,0,3,1,0,0
3,car financial security stock ball organization...,48,78,22,0.06,0,0,3,1,0,0
4,could yourself plan base rise would i question...,83,129,2,0.02,0,0,0,0,0,1


In [7]:

# Cell 2: Text cleaning function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text


In [8]:

# Cell 3: Apply text preprocessing to the content column
data['cleaned_text'] = data['text_content'].apply(clean_text)
data[['text_content', 'cleaned_text']].head()


KeyError: 'text_content'

In [None]:

# Cell 4: Encode the target label
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])  # Adjust if your label column has a different name
data['label_encoded'].value_counts()


In [None]:

# Cell 5: Split dataset
X = data['cleaned_text']
y = data['label_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:

# Cell 6: TF-Idata Vectorization
tfidata = TfidataVectorizer(max_features=10000, ngram_range=(1,2))
X_train_tfidata = tfidata.fit_transform(X_train)
X_test_tfidata = tfidata.transform(X_test)


In [None]:

# Cell 7: Train Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidata, y_train)


In [None]:

# Cell 8: Evaluate model
y_pred = lr_model.predict(X_test_tfidata)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:

# Cell 9: Save TF-Idata and model
joblib.dump(tfidata, 'tfidata_vectorizer.pkl')
joblib.dump(lr_model, 'logistic_model.pkl')


In [None]:

# Cell 10: Setup transformers and tokenizer for BERT model
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


In [None]:

# Cell 11: Tokenize text for BERT
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)


In [None]:

# Cell 12: Convert encodings to TensorFlow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(16)


In [None]:

# Cell 13: Train DistilBERT model
bert_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
bert_model.compile(optimizer=Adam(learning_rate=5e-5),
                   loss=SparseCategoricalCrossentropy(from_logits=True),
                   metrics=[SparseCategoricalAccuracy()])

bert_model.fit(train_dataset, validation_data=test_dataset, epochs=3, callbacks=[EarlyStopping(patience=1)])


In [None]:

# Cell 14: Evaluate BERT model
bert_model.evaluate(test_dataset)


In [None]:

# Cell 15: Save BERT model
bert_model.save_pretrained('bert_misinfo_model')
tokenizer.save_pretrained('bert_misinfo_model')


In [None]:

# Cell 16: Function to make predictions using BERT
def predict_misinfo(text):
    inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True)
    logits = bert_model(inputs)[0]
    prediction = tf.argmax(logits, axis=1).numpy()[0]
    return label_encoder.inverse_transform([prediction])[0]


In [None]:

# Cell 17: Test prediction
sample_text = "Breaking news: COVID-19 cures found in herbs!"
print("Prediction:", predict_misinfo(sample_text))


In [None]:

# Cell 18: SHAP Explainability (Logistic Regression only)
import shap
explainer = shap.LinearExplainer(lr_model, X_train_tfidata, feature_dependence="independent")
shap_values = explainer.shap_values(X_test_tfidata[:10])
shap.summary_plot(shap_values, X_test_tfidata[:10], feature_names=tfidata.get_feature_names_out())


In [None]:

# Cell 19: Example Flask API structure (pseudo-code)
# from flask import Flask, request, jsonify
# app = Flask(__name__)
# @app.route('/predict', methods=['POST'])
# def predict():
#     text = request.json['text']
#     result = predict_misinfo(text)
#     return jsonify({'prediction': result})


In [None]:

# Cell 20: Conclusion
print("Logistic Regression and DistilBERT models trained. Logistic model and TF-Idata vectorizer saved.")
print("DistilBERT saved to 'bert_misinfo_model' directory. Ready for API deployment.")
