In [None]:
import pandas as pd

# Load data: read the four CSV inputs in a fixed order --
# training embeddings, training features, validation embeddings, validation features.
loaded_frames = []
for src in (
    'data/training_data_embeddings.csv',
    'data/training_data_features.csv',
    'data/validation_data_embeddings.csv',
    'data/validation_data_features.csv',
):
    loaded_frames.append(pd.read_csv(src))

# Unpack in the same order the paths are listed above.
training_embeddings, training_data, validation_embeddings, validation_data = loaded_frames

In [None]:
# NAIVE BAYES
# with sentence transformer word embeddings matrix

# REF: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

# Features are the embedding frames; labels are the 'reliable' column of the feature frames.
X_train = training_embeddings
Y_train = training_data['reliable']

X_val = validation_embeddings
Y_val = validation_data['reliable']

# Scaler to account for negative values: MultinomialNB requires non-negative
# features, so every feature is rescaled into the default [0, 1] range.
# REF: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
#
# clip=True matters for the validation set: the min/max are learned from the
# training data only, so a validation value below the training minimum would
# otherwise be mapped to a negative number (and one above the maximum to > 1).
scaler = MinMaxScaler(clip=True)
# Compute the per-feature minimum and maximum from the training data only,
# so no information from the validation set leaks into the scaling.
scaler.fit(X_train)
# Apply the training-set scaling to both splits.
X_train_scaled = pd.DataFrame(scaler.transform(X_train))
X_val_scaled = pd.DataFrame(scaler.transform(X_val))

# naive bayes model
clf = MultinomialNB().fit(X_train_scaled, Y_train)

# predictions on the held-out validation set
y_pred = clf.predict(X_val_scaled)
print('NAIVE BAYES w/ TRANSFORMER')

print(classification_report(Y_val, y_pred))

In [None]:
# LOGISTIC REGRESSION
# with sentence transformer word embeddings matrix

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# Inputs are the embedding frames, used unscaled; targets are the 'reliable' column.
X_train, X_val = training_embeddings, validation_embeddings
Y_train, Y_val = training_data['reliable'], validation_data['reliable']

# Build a default-configured classifier and train it in a single step
# (fit() hands back the estimator it was called on).
model = LogisticRegression().fit(X_train, Y_train)

# Predict on the validation split and print the per-class metrics.
predictions = model.predict(X_val)
print('LOGISTIC REGRESSION w/ TRANSFORMER')
print(classification_report(Y_val, predictions))

In [None]:
# NEURAL NETWORK
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Scale input features. The scaler is fitted on the training embeddings only and
# then reused for the validation embeddings, so both splits share one scaling.
# with_mean=False scales each feature to unit variance WITHOUT centering it.
# NOTE(review): that option exists mainly to preserve sparsity of sparse input,
# but these frames come from pd.read_csv -- confirm that skipping the centering
# step is intentional before changing it, since it affects the reported scores.
scaler = StandardScaler(with_mean=False)
X_training_scaled = scaler.fit_transform(training_embeddings)
X_validation_scaled = scaler.transform(validation_embeddings)

y_train = training_data['reliable']
y_val = validation_data['reliable']

# MLP model with 1 hidden layer of 10 neurons, using the default rectified linear unit activation.
mlp_model = MLPClassifier(
    # One-element tuple: "(10)" without the comma is just the integer 10.
    hidden_layer_sizes=(10,),
    max_iter=500,
    batch_size=256,
    # Holds out part of the training data and stops when its score stops improving.
    early_stopping=True,
    # Fixes weight initialisation, minibatch shuffling and the early-stopping
    # split so the report below is reproducible across runs.
    random_state=42,
    verbose=True,
)
mlp_model.fit(X_training_scaled, y_train)

predictions = mlp_model.predict(X_validation_scaled)

print("MLP CLASSIFIER w/ TRANSFORMER")
print(classification_report(y_val, predictions))