# Ironhack Project NLP (Training)

Team 4 (Salva, Diego, Fabi)

## Imports

Import only needed libaries

In [1]:
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, classification_report, confusion_matrix, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

%matplotlib inline

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

## Preparations

### Constants

Make classes readable

In [3]:
CLASSES = {"0": "Fake news", "1": "Real news"}

#### Pre Processing

Helper methods to clean up the texts

In [4]:
english_stopwords = stopwords.words('english')
def remove_stopwords(text):
    """Removes stop words from a given text.

    Args:
      text(string): The input text as a string.

    Returns:
      The text with stop words removed as a string.
    """

    text = ' '.join([word for word in text.split() if word not in english_stopwords]) # Remove word if it's a stopword
    return text

In [5]:
def remove_special_chars(text):
    """Removes special characters from a given text.

    Args:
        text(string): The input text as a string.

    Returns:
        The text with special characters removed as a string.
    """

    return re.sub(r'[^\w\s.]', '', text) # Remove special Characters

In [6]:
def clean_text(text):
    """
    Remove numbers and extra spaces from text

    Args:
        text(string): The text to be cleaned.

    Returns:
        string: The cleaned text.
    """

    text = re.sub(r'\d+', '', text) # Remove numbers
    text = re.sub(r'\s+', ' ', text)  # replace multiple spaces with just 1
    return text

In [7]:
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """
    Lemmatize a given text.

    Args:
        text(string): The text to be lemmatized.

    Returns:
        string: The lemmatized text.
    """
    words = word_tokenize(text) # Tokenize the text into words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words] # Lemmatize each word
    return ' '.join(lemmatized_words) # Join the lemmatized words back into a single string

### Import Data

Loads the labeled text data from a tab-separated file named "training_data_lowercase.csv" into a Pandas DataFrame.

In [12]:
raw_data = pd.read_csv('training_data_lowercase.csv', sep='\t', header=None, names=['class', 'text'])

### Check Data

Check Data if it contains blank cells

In [13]:
raw_data.isnull().sum()

Unnamed: 0,0
class,0
text,0


### Clean Data

Create combinations of all pre prcessing methods to test the models on

In [14]:
combinations = {}
functions = [clean_text, remove_special_chars, remove_stopwords, lemmatize_text]

for r in range(1, len(functions) + 1):
    for subset in itertools.combinations(functions, r):
        data_copy = raw_data.copy()
        func_names = [func.__name__ for func in subset]
        name = ','.join(func_names)
        print(f"Applying: {name}")
        for func in subset:
            data_copy['text'] = data_copy['text'].apply(func)
            combinations[name] = data_copy

Applying: clean_text
Applying: remove_special_chars
Applying: remove_stopwords
Applying: lemmatize_text
Applying: clean_text,remove_special_chars
Applying: clean_text,remove_stopwords
Applying: clean_text,lemmatize_text
Applying: remove_special_chars,remove_stopwords
Applying: remove_special_chars,lemmatize_text
Applying: remove_stopwords,lemmatize_text
Applying: clean_text,remove_special_chars,remove_stopwords
Applying: clean_text,remove_special_chars,lemmatize_text
Applying: clean_text,remove_stopwords,lemmatize_text
Applying: remove_special_chars,remove_stopwords,lemmatize_text
Applying: clean_text,remove_special_chars,remove_stopwords,lemmatize_text


## Models

Multiple models, including MultinomialNB, ComplementNB, and LogisticRegression, were tested with two vectorizers: CountVectorizer and TfidfVectorizer. These models were evaluated on different text preprocessing combinations to find the best performing combination. The goal was to identify the model and text processing approach that yielded the highest accuracy and lowest difference between training and testing accuracy.

In [None]:
results = {}

models = {"MultinomialNB": MultinomialNB(),
          "ComplementNB": ComplementNB(),
          "LogisticRegression": LogisticRegression(solver='liblinear')}

vectorizers = {"CountVectorizer": CountVectorizer(ngram_range=(1,2)),
               "TfidfVectorizer": TfidfVectorizer(max_features=3200)}

In [15]:
for name, model in models.items():
    for vec_name, vectorizer in vectorizers.items():
        for comb_name, data in combinations.items():
          print(name, vec_name, comb_name)

          vec_data = vectorizer.fit_transform(data['text'])
          X_train, X_test, y_train, y_test = train_test_split(vec_data, data['class'], test_size=0.3, random_state=42)
          classifier = model
          classifier.fit(X_train, y_train)

          y_pred = classifier.predict(X_test)
          y_pred_train = classifier.predict(X_train)

          test_accuracy = accuracy_score(y_test, y_pred)
          train_accuracy = accuracy_score(y_train, y_pred_train)

          cross_score = cross_val_score(model, vec_data, raw_data['class'], cv=5)

          results[f"{name}, {vec_name}, {comb_name}"] = {
                "Diff": np.round(train_accuracy - test_accuracy, decimals=3),
                "Train Accuracy": np.round(train_accuracy, decimals=3),
                "Test Accuracy": np.round(test_accuracy, decimals=3),
                "Cross Validation Score": np.round(np.mean(cross_score), decimals=3)
          }


MultinomialNB CountVectorizer clean_text
MultinomialNB CountVectorizer remove_special_chars
MultinomialNB CountVectorizer remove_stopwords
MultinomialNB CountVectorizer lemmatize_text
MultinomialNB CountVectorizer clean_text,remove_special_chars
MultinomialNB CountVectorizer clean_text,remove_stopwords
MultinomialNB CountVectorizer clean_text,lemmatize_text
MultinomialNB CountVectorizer remove_special_chars,remove_stopwords
MultinomialNB CountVectorizer remove_special_chars,lemmatize_text
MultinomialNB CountVectorizer remove_stopwords,lemmatize_text
MultinomialNB CountVectorizer clean_text,remove_special_chars,remove_stopwords
MultinomialNB CountVectorizer clean_text,remove_special_chars,lemmatize_text
MultinomialNB CountVectorizer clean_text,remove_stopwords,lemmatize_text
MultinomialNB CountVectorizer remove_special_chars,remove_stopwords,lemmatize_text
MultinomialNB CountVectorizer clean_text,remove_special_chars,remove_stopwords,lemmatize_text
MultinomialNB TfidfVectorizer clean_te

In [16]:
results_df = pd.DataFrame(results).T
results_df.sort_values(by=['Train Accuracy', 'Diff'], ascending=[False, True])

Unnamed: 0,Diff,Train Accuracy,Test Accuracy,Cross Validation Score
"LogisticRegression, CountVectorizer, clean_text",0.049,0.999,0.949,0.922
"LogisticRegression, CountVectorizer, lemmatize_text",0.049,0.999,0.949,0.922
"LogisticRegression, CountVectorizer, clean_text,lemmatize_text",0.049,0.999,0.949,0.923
"LogisticRegression, CountVectorizer, remove_special_chars,lemmatize_text",0.049,0.999,0.949,0.920
"LogisticRegression, CountVectorizer, remove_special_chars",0.050,0.999,0.949,0.920
...,...,...,...,...
"MultinomialNB, TfidfVectorizer, remove_special_chars,remove_stopwords,lemmatize_text",0.013,0.931,0.918,0.895
"MultinomialNB, TfidfVectorizer, clean_text,remove_special_chars,remove_stopwords,lemmatize_text",0.013,0.931,0.919,0.896
"MultinomialNB, TfidfVectorizer, clean_text,remove_special_chars,remove_stopwords",0.008,0.930,0.922,0.895
"ComplementNB, TfidfVectorizer, remove_special_chars,remove_stopwords",0.008,0.930,0.922,0.896
