<a href="https://colab.research.google.com/github/it21222672/modelcreation/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import string
import zipfile
import os

# Step 1: Extract ZIP File
zip_path = "archive.zip"  # Replace with actual ZIP file path
extract_path = "extracted_data"

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

# Step 2: Load the dataset
df_fake = pd.read_csv(os.path.join(extract_path, "Fake.csv"))
df_true = pd.read_csv(os.path.join(extract_path, "True.csv"))

# Step 3: Add labels
df_fake["label"] = 0  # Fake news
df_true["label"] = 1  # True news

# Step 4: Combine datasets
df = pd.concat([df_fake, df_true], axis=0).reset_index(drop=True)

# Step 5: Define text preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub("\d+", "", text)  # Remove numbers
    text = re.sub("\s+", " ", text).strip()  # Remove extra spaces
    return text

# Step 6: Apply preprocessing
df["clean_text"] = df["text"].astype(str).apply(preprocess_text)

# Step 7: Save the preprocessed data
df[["clean_text", "label"]].to_csv("preprocessed_news.csv", index=False)

print("Preprocessing complete. Data saved as 'preprocessed_news.csv'.")


Preprocessing complete. Data saved as 'preprocessed_news.csv'.


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load dataset
df = pd.read_csv("preprocessed_news.csv")

# Step 1: Ensure all text values are strings
df["clean_text"] = df["clean_text"].astype(str)

# Step 2: Remove any empty strings or NaN values
df = df[df["clean_text"].str.strip() != ""]  # Remove empty text rows
df = df.dropna(subset=["clean_text"])  # Drop NaN rows

# Step 3: Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

print("TF-IDF transformation successful!")


TF-IDF transformation successful!


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed dataset
df = pd.read_csv("preprocessed_news.csv")

# Ensure all text values are strings
df["clean_text"] = df["clean_text"].astype(str)

# Remove empty strings and NaN values
df = df[df["clean_text"].str.strip() != ""]
df = df.dropna(subset=["clean_text"])

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM model
svm_model = SVC(kernel="linear", C=1.0)
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.9930
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4733
           1       0.99      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed dataset
df = pd.read_csv("preprocessed_news.csv")

# Ensure all text values are strings
df["clean_text"] = df["clean_text"].astype(str)

# Remove empty strings and NaN values
df = df[df["clean_text"].str.strip() != ""]
df = df.dropna(subset=["clean_text"])

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# Make predictions
y_pred = log_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.9860
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4733
           1       0.98      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed dataset
df = pd.read_csv("preprocessed_news.csv")

# Ensure all text values are strings
df["clean_text"] = df["clean_text"].astype(str)

# Remove empty strings and NaN values
df = df[df["clean_text"].str.strip() != ""]
df = df.dropna(subset=["clean_text"])

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions
y_pred = dt_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.9951
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4733
           1       1.00      0.99      0.99      4247

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed dataset
df = pd.read_csv("preprocessed_news.csv")

# Ensure all text values are strings
df["clean_text"] = df["clean_text"].astype(str)

# Remove empty strings and NaN values
df = df[df["clean_text"].str.strip() != ""]
df = df.dropna(subset=["clean_text"])

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.9965
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4733
           1       1.00      1.00      1.00      4247

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

