In [None]:
import pandas as pd

# Read the training and validation feature tables from CSV.
# index_col=0 makes pandas use the first column of each file as the row index.
loaded_frames = []
for src in ('data/training_data_features.csv', 'data/validation_data_features.csv'):
    loaded_frames.append(pd.read_csv(src, index_col=0))

# Unpack in the order the files were listed above: training first, then validation.
training_data_raw, validation_data_raw = loaded_frames

In [None]:
# Build TF-IDF matrices for the training and validation texts

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer

# Text column that gets vectorized (presumably the stemmed article content -- verify upstream).
X_train, X_val = training_data_raw['content_stem'], validation_data_raw['content_stem']

# Step 1: bag of words
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
# fit_transform learns the vocabulary from the training texts only and returns their
# document-term count matrix in one call.
vectorizer = CountVectorizer()
X_train_cnts = vectorizer.fit_transform(X_train)
# The validation texts are encoded with the vocabulary learned from the training set.
X_val_cnts = vectorizer.transform(X_val)

# Step 2: tf-idf
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer
# The idf vector (global term weights) is learned from the training counts only,
# then applied to both count matrices.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_cnts)
X_val_tfidf = tfidf_transformer.transform(X_val_cnts)

In [None]:
# Append the simple numeric feature columns to the TF-IDF matrices

from scipy.sparse import hstack

# Numeric columns taken from the feature tables, in the order they are appended.
features = [
    'date_count', 'url_count', 'exclm_count',
    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
    'stop_reduction_rate', 'stem_reduction_rate',
    'average_sentence_length',
]

# Select the same numeric columns from both splits.
X_training_numeric = training_data_raw[features]
X_validation_numeric = validation_data_raw[features]

# Column-wise concatenation: TF-IDF columns first, numeric columns after them.
X_training_combined = hstack((X_train_tfidf, X_training_numeric))
X_validation_combined = hstack((X_val_tfidf, X_validation_numeric))

### Logistic Regression w. Baseline + TF-IDF

In [None]:
# LOGISTIC REGRESSION on the combined (TF-IDF + numeric) matrices

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Inputs are the combined matrices; targets are the 'reliable' column of each table.
x_train, y_train = X_training_combined, training_data_raw['reliable']
x_val, y_val = X_validation_combined, validation_data_raw['reliable']

# fit() returns the estimator itself, so construction and training are chained.
log_reg_model = LogisticRegression(max_iter=200).fit(x_train, y_train)

# Predict on the validation split and print the classification report.
predictions = log_reg_model.predict(x_val)
print('LOGISTIC REGRESSION w/ BASELINE, TF-IDF')
print(classification_report(y_val, predictions))

# Confusion matrix of classification errors (the plotting call is currently disabled).
from sklearn.metrics import ConfusionMatrixDisplay
# ConfusionMatrixDisplay.from_estimator(log_reg_model, x_val, y_val)

### Logistic Regression w. TF-IDF

In [None]:
# LOGISTIC REGRESSION on the TF-IDF matrices only (numeric columns are not used here)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Inputs are the plain TF-IDF matrices; targets are the 'reliable' column of each table.
x_train, y_train = X_train_tfidf, training_data_raw['reliable']
x_val, y_val = X_val_tfidf, validation_data_raw['reliable']

# fit() returns the estimator itself, so construction and training are chained.
# Note: this rebinds `log_reg_model`, replacing any model stored under that name earlier.
log_reg_model = LogisticRegression(max_iter=200).fit(x_train, y_train)

# Predict on the validation split and print the classification report.
predictions = log_reg_model.predict(x_val)
print('LOGISTIC REGRESSION w/ TF-IDF')
print(classification_report(y_val, predictions))

# Confusion matrix of classification errors (the plotting call is currently disabled).
from sklearn.metrics import ConfusionMatrixDisplay
# ConfusionMatrixDisplay.from_estimator(log_reg_model, x_val, y_val)

### Naive Bayes w. Baseline and TF-IDF

In [None]:
# NAIVE BAYES on the combined (TF-IDF + numeric) matrices
# REF: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Inputs are the combined matrices; targets are the 'reliable' column of each table.
x_train, y_train = X_training_combined, training_data_raw['reliable']
x_val, y_val = X_validation_combined, validation_data_raw['reliable']

# Multinomial naive Bayes with default settings, trained on the training split.
nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)

# Predict on the validation split and print the classification report.
y_pred = nb_model.predict(x_val)
print('NAIVE BAYES w/ BASELINE, TF-IDF')
print(classification_report(y_val, y_pred))

# Confusion matrix of classification errors (the plotting call is currently disabled).
from sklearn.metrics import ConfusionMatrixDisplay
# ConfusionMatrixDisplay.from_estimator(nb_model, x_val, y_val)

### Naive Bayes w. TF-IDF

In [None]:
# NAIVE BAYES on the TF-IDF matrices only (numeric columns are not used here)
# REF: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Inputs are the plain TF-IDF matrices; targets are the 'reliable' column of each table.
x_train, y_train = X_train_tfidf, training_data_raw['reliable']
x_val, y_val = X_val_tfidf, validation_data_raw['reliable']

# Multinomial naive Bayes with default settings, trained on the training split.
# Note: this rebinds `nb_model`, replacing any model stored under that name earlier.
nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)

# Predict on the validation split and print the classification report.
y_pred = nb_model.predict(x_val)
print('NAIVE BAYES w/ TF-IDF')
print(classification_report(y_val, y_pred))

# Confusion matrix of classification errors (the plotting call is currently disabled).
from sklearn.metrics import ConfusionMatrixDisplay
# ConfusionMatrixDisplay.from_estimator(nb_model, x_val, y_val)

### Neural Network with Simple Features + TF-IDF

In [None]:
# NEURAL NETWORK on the scaled combined (TF-IDF + numeric) matrices
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Scale input features. The scaler is fitted on the training split only and the
# same scaling is then applied to the validation split.
scaler = StandardScaler(with_mean=False)  # Pass with_mean=False for sparse matrices
X_training_scaled = scaler.fit_transform(X_training_combined)
X_validation_scaled = scaler.transform(X_validation_combined)

# Targets are the 'reliable' column of each table.
y_training_data = training_data_raw['reliable']
y_validation_data = validation_data_raw['reliable']

# MLP model with 1 hidden layer and 10 neurones, with the default rectified linear unit function.
mlp_model = MLPClassifier(
    # One-element tuple: `(10)` without the trailing comma is just the int 10.
    hidden_layer_sizes=(10,),
    max_iter=500,
    batch_size=256,
    early_stopping=True,
    # Fixed seed: weight initialisation, the early-stopping hold-out split and
    # batch shuffling are all random, so without it every run reports different metrics.
    random_state=42,
    verbose=True,
)
mlp_model.fit(X_training_scaled, y_training_data)

# Predict on the validation split and print the classification report.
predictions = mlp_model.predict(X_validation_scaled)

print("MLP CLASSIFIER w/ FEATURES + TF-IDF")
print(classification_report(y_validation_data, predictions))

# Confusion matrix of classification errors
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(mlp_model, X_validation_scaled, y_validation_data)