In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Training pipeline: load labelled data, clean it, fit TF-IDF + logistic
# regression on a SMOTE-balanced training split, report hold-out metrics and
# persist the fitted model and vectorizer.

# Configuration used throughout this cell
RANDOM_STATE = 42  # one seed for the split AND for SMOTE so the metrics are reproducible
TEXT_COL = 'News/Comment'
LABEL_COL = 'Type'
LABEL_MAP = {'FALSE': 0, 'HALF TRUE': 1, 'TRUE': 2}

# Step 1: Load the data
data = pd.read_csv('labelled_train_set.csv')

# Step 2: Preprocess the data
# Drop any rows with missing values (non-in-place, so the step is explicit
# and the resulting frame is an independent copy)
data = data.dropna().copy()

# Convert text to lowercase; astype(str) guards against non-string cells
# (e.g. a purely numeric comment), which .str.lower() would turn into NaN
data[TEXT_COL] = data[TEXT_COL].astype(str).str.lower()

# Encode the target variable.
# Labels are normalised first (string, trimmed, upper-case) so that variants
# such as 'True', ' FALSE ' or a bool-parsed column still hit LABEL_MAP.
normalized_labels = data[LABEL_COL].astype(str).str.strip().str.upper()
encoded_labels = normalized_labels.map(LABEL_MAP)

# Anything that still does not map is NOT imputed: filling unknown labels with
# the most frequent class silently relabels those rows and corrupts both
# training and evaluation. Report them and leave them out instead.
unmapped = encoded_labels.isna()
if unmapped.any():
    print(f"Dropping {int(unmapped.sum())} row(s) with unrecognised '{LABEL_COL}' values:")
    print(normalized_labels[unmapped].value_counts())

data = data.loc[~unmapped].copy()
data[LABEL_COL] = encoded_labels[~unmapped].astype(int)  # integer class ids, not floats

# Step 3: Split the data into features and target
X = data[TEXT_COL]
y = data[LABEL_COL]

# Step 4: Split the data into training and testing sets
# (stratified so every class keeps its proportion in the hold-out set)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Step 5: Vectorize the text data
# The vectorizer is fitted on the training split only to avoid leaking
# test-set vocabulary/IDF statistics into training.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Step 6: Handle class imbalance using SMOTE
# Only the training split is resampled; the test split keeps its real distribution.
smote = SMOTE(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vectorized, y_train)

# Step 7: Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

# Step 8: Evaluate the model
y_pred = model.predict(X_test_vectorized)

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Optional: Save the model and vectorizer for future use
import joblib
joblib.dump(model, 'fake_news_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')



              precision    recall  f1-score   support

         0.0       0.93      0.90      0.91       279
         1.0       0.23      0.29      0.25        28

    accuracy                           0.85       307
   macro avg       0.58      0.59      0.58       307
weighted avg       0.86      0.85      0.85       307

[[252  27]
 [ 20   8]]


['tfidf_vectorizer.pkl']

In [None]:
import pandas as pd
import joblib

# Inference pipeline: load the persisted model and vectorizer, score an
# unlabelled CSV and write the predictions next to the input rows.

# Step 1: Load the saved model and vectorizer
# NOTE: joblib.load unpickles arbitrary Python objects -- only load artefacts
# that were produced by a trusted source.
model = joblib.load('fake_news_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Step 2: Load the new test data
test_data = pd.read_csv('unlabelled_test1.csv')  # Replace with your test data file path

# Step 3: Preprocess the test data
# Missing comments are replaced by an empty string and everything is coerced
# to str before lowercasing: a NaN left in the column makes
# vectorizer.transform raise, because NaN is not a valid text document.
# (An empty document has no TF-IDF terms, so its prediction carries little
# information -- review such rows separately if they matter.)
test_data['News/Comment'] = (
    test_data['News/Comment']
    .fillna('')
    .astype(str)
    .str.lower()
)

# Step 4: Vectorize the test data
X_test_vectorized = vectorizer.transform(test_data['News/Comment'])

# Step 5: Make predictions
predictions = model.predict(X_test_vectorized)

# Step 6: Add predictions to the test data
# Values are the numeric class codes the model was trained on.
test_data['Predicted_Type'] = predictions

# Step 7: Save the results
test_data.to_csv('predicted_test_results.csv', index=False)

print("Predictions saved to 'predicted_test_results.csv'")


Predictions saved to 'predicted_test_results.csv'
