In [35]:
from functools import lru_cache

import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used by the NLTK calls in this file (run once):
#   'punkt'     - tokenizer models (presumably what word_tokenize relies on)
#   'stopwords' - the word lists read through stopwords.words('english')
#   'wordnet'   - the corpus backing WordNetLemmatizer
# These calls run at import time, before any function below is invoked.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Import classifier packages
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score


def _read_labeled_lines(path, label):
    """Return ``(text, label)`` tuples for every non-blank line of *path*."""
    with open(path, 'r', encoding='utf-8') as handle:
        # Iterate the file lazily instead of materialising it with readlines().
        stripped_lines = (line.strip() for line in handle)
        # Blank lines (e.g. a trailing newline at end of file) would otherwise
        # become empty-string samples that still carry a class label.
        return [(text, label) for text in stripped_lines if text]


def load_data(facts_file, fakes_file):
    """Load the two corpora and attach a class label to every statement.

    Each file is read as UTF-8 with one statement per line. Surrounding
    whitespace is stripped and blank lines are skipped.

    Args:
        facts_file: Path to the file of factual statements (labelled ``1``).
        fakes_file: Path to the file of fake statements (labelled ``0``).

    Returns:
        A list of ``(text, label)`` tuples: all facts first, then all fakes,
        each group in file order.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    return _read_labeled_lines(facts_file, 1) + _read_labeled_lines(fakes_file, 0)


# Optionally normalise the text column, then carve out train/test partitions
def preprocess_and_split(data, method):
    """Turn labelled samples into a train/test split, preprocessing on request.

    Args:
        data: Iterable of ``(text, label)`` pairs.
        method: Name handed to ``preprocess_text`` for every text, or ``None``
            to leave the texts untouched.

    Returns:
        The four-way result of ``train_test_split`` over the text and label
        columns, using a 20% test share and ``random_state=0``.
    """
    frame = pd.DataFrame(data, columns=['text', 'label'])

    # None means "use the raw text"; any other value is forwarded as-is.
    if method is not None:
        frame['text'] = frame['text'].apply(preprocess_text, args=(method,))

    return train_test_split(
        frame['text'],
        frame['label'],
        test_size=0.2,
        random_state=0,
    )


# Variant that chains two preprocessing methods before splitting
def double_preprocess_and_split(data, method1, method2):
    """Apply two preprocessing methods in sequence, then split the dataset.

    Args:
        data: Iterable of ``(text, label)`` pairs.
        method1: Method name passed to ``preprocess_text`` first.
        method2: Method name passed to ``preprocess_text`` on the output of
            the first pass.

    Returns:
        The four-way result of ``train_test_split`` over the text and label
        columns, using a 20% test share and ``random_state=0``.
    """
    frame = pd.DataFrame(data, columns=['text', 'label'])

    # Each pass rewrites the text column, so the second method sees the
    # already-processed output of the first.
    for step in (method1, method2):
        frame['text'] = frame['text'].apply(lambda raw, step=step: preprocess_text(raw, step))

    texts, labels = frame['text'], frame['label']
    return train_test_split(texts, labels, test_size=0.2, random_state=0)


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text, method):
    """Lower-case and tokenize *text*, then apply one normalisation method.

    Args:
        text: The raw string to process.
        method: One of ``'Lemmatizer'``, ``'Stemmer'`` or ``'Stop_words'``.
            Any other value leaves the tokens unchanged after tokenization.

    Returns:
        The processed tokens joined with single spaces.
    """
    # Tokenize the lower-cased text
    words = word_tokenize(text.lower())

    # Build only the tool the requested method needs; this function runs once
    # per document, so unused helpers were pure overhead.
    if method == 'Lemmatizer':
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    elif method == 'Stemmer':
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]
    elif method == 'Stop_words':
        # The stop-word list is cached so the corpus is not re-read per call.
        stop_words = _english_stop_words()
        # Besides stop words, this also drops non-alphanumeric tokens
        # (punctuation and the like).
        words = [word for word in words if word.isalnum() and word not in stop_words]

    return ' '.join(words)


# Assemble a two-stage vectorizer -> classifier pipeline
def build_pipeline(vectorizer, classifier):
    """Wrap a vectorizer and a classifier in a ``Pipeline``.

    The step names ``'vectorizer'`` and ``'classifier'`` are what the
    ``classifier__*`` keys in the parameter grids refer to.

    Args:
        vectorizer: Object used as the first pipeline step.
        classifier: Object used as the final pipeline step.

    Returns:
        A ``Pipeline`` with the two named steps, in that order.
    """
    steps = [('vectorizer', vectorizer), ('classifier', classifier)]
    return Pipeline(steps)


# Hyper-parameter search over a pipeline using 5-fold cross-validation
def perform_grid_search(pipeline, param_grid, X_train, y_train):
    """Run ``GridSearchCV`` on *pipeline* and report the winning setting.

    Args:
        pipeline: Estimator to tune.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        X_train: Training inputs.
        y_train: Training labels.

    Returns:
        A ``(best_params_, best_score_)`` tuple from the fitted search, scored
        with ``'f1'`` over 5 folds.
    """
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='f1',
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    winning_params = search.best_params_
    winning_score = search.best_score_
    return winning_params, winning_score


def _evaluate_split(split, preprocess_label, vectorizers, classifiers, param_grids):
    """Tune, fit and score every vectorizer/classifier pair on one data split."""
    X_train, X_test, y_train, y_test = split

    for vectorizer in vectorizers:
        for classifier in classifiers:
            classifier_name = classifier.__class__.__name__
            pipeline = build_pipeline(vectorizer, classifier)

            # Perform parameter tuning if a grid is defined for the classifier
            if classifier_name in param_grids:
                print(f"Tuning parameters for {classifier_name} with {preprocess_label} preprocessing...")
                best_params, best_score = perform_grid_search(
                    pipeline, param_grids[classifier_name], X_train, y_train
                )
                print(f"Best parameters for {classifier_name}: {best_params}, Best F1 Score: {best_score:.2f}")

                # Set the best parameters in the pipeline
                pipeline.set_params(**best_params)

            # Fit the model with the best parameters and evaluate on the test set
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, pos_label=1)
            print(f'Vectorizer: {vectorizer}, Classifier: {classifier_name}, Preprocess: {preprocess_label}, '
                  f'Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f}')


# Main function to run experiments
def run_experiments(facts_file='/content/drive/MyDrive/facts.txt',
                    fakes_file='/content/drive/MyDrive/fakes.txt'):
    """Compare classifiers across several text preprocessing strategies.

    For each preprocessing option (none, lemmatizing, stemming, stop-word
    removal, and finally lemmatizing followed by stemming) the data is split,
    each classifier is grid-searched on the training part, refitted with the
    best parameters, and scored on the test part. Results are printed.

    Args:
        facts_file: Path to the file of factual statements. Defaults to the
            location used originally.
        fakes_file: Path to the file of fake statements. Defaults to the
            location used originally.

    Returns:
        None. All results are reported via ``print``.
    """
    # Load the labelled samples
    data = load_data(facts_file, fakes_file)

    # Define vectorizers and classifiers
    vectorizers = [
        TfidfVectorizer(),
    ]

    classifiers = [
        LogisticRegression(max_iter=1000),
        SVC(),
        MultinomialNB(),
    ]

    preprocessors = [None, 'Lemmatizer', 'Stemmer', 'Stop_words']

    # Define parameter grids for each classifier, keyed by class name
    param_grids = {
        'LogisticRegression': {
            'classifier__C': [0.1, 1, 10],
            'classifier__max_iter': [100, 300, 1000],
        },
        'SVC': {
            'classifier__C': [0.1, 1, 10],
            'classifier__kernel': ['linear', 'rbf'],
            'classifier__gamma': ['scale', 'auto'],
        },
        'MultinomialNB': {
            'classifier__alpha': [0.5, 1.0, 1.5],
        },
    }

    # Single-method runs; None is printed as "None" in the report lines.
    for preprocess in preprocessors:
        split = preprocess_and_split(data, preprocess)
        _evaluate_split(split, preprocess, vectorizers, classifiers, param_grids)

    # Applying both lemmatizer and stemmer
    split = double_preprocess_and_split(data, "Lemmatizer", "Stemmer")
    _evaluate_split(split, "Lemmatizer and Stemmer", vectorizers, classifiers, param_grids)


# Run the full experiment sweep only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    run_experiments()




Tuning parameters for LogisticRegression with None preprocessing...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Best parameters for LogisticRegression: {'classifier__C': 1, 'classifier__max_iter': 100}, Best F1 Score: 0.91
Vectorizer: TfidfVectorizer(), Classifier: LogisticRegression, Preprocess: None, Accuracy: 0.95, F1 Score: 0.95
Tuning parameters for SVC with None preprocessing...
Best parameters for SVC: {'classifier__C': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}, Best F1 Score: 0.92
Vectorizer: TfidfVectorizer(), Classifier: SVC, Preprocess: None, Accuracy: 0.93, F1 Score: 0.91
Tuning parameters for MultinomialNB with None preprocessing...
Best parameters for MultinomialNB: {'classifier__alpha': 1.0}, Best F1 Score: 0.92
Vectorizer: TfidfVectorizer(), Classifier: MultinomialNB, Preprocess: None, Accuracy: 0.95, F1 Score: 0.95
Tuning parameters for LogisticRegression with Lemmatizer preprocessing...
Best parameters for LogisticRegression: {'classifier__C': 10, 'classifier__max_iter': 100}, Best F1 Score: 0.92
Vectorizer: TfidfVectorizer(), Classifier: LogisticRegression,