# Simple Naive Bayes Model

In [1]:
import pandas as pd

# Read the pre-computed feature tables for the training and validation
# splits. The first CSV column is used as the row index of each frame.
_loaded = []
for src in (
    'data/training_data_features.csv',
    'data/validation_data_features.csv',
):
    _loaded.append(pd.read_csv(src, index_col=0))

training_data, validation_data = _loaded

### Naive Bayes model with baseline features

The model is fitted on the FakeNews training set and evaluated on the FakeNews validation set.

In [4]:
# NAIVE BAYES
# REF: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, mean_squared_error

# Baseline feature columns (named as they appear in the feature CSVs) that
# are passed to the classifier.
features = [
    'date_count', 'url_count', 'exclm_count',
    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
    'stop_reduction_rate', 'stem_reduction_rate',
    'average_sentence_length',
]

# Split each frame into its feature matrix (X) and the 'reliable' target (y).
X_training, X_validation = training_data[features], validation_data[features]
y_training, y_validation = training_data['reliable'], validation_data['reliable']

# Multinomial naive Bayes with scikit-learn's default settings; fit() returns
# the estimator itself, so construction and fitting can be chained.
nb_model = MultinomialNB().fit(X_training, y_training)

# Predict on the validation split and print the per-class report.
y_pred = nb_model.predict(X_validation)
print('NAIVE BAYES w/ BASELINE [VALIDATION]')
print(classification_report(y_validation, y_pred))

NAIVE BAYES w/ BASELINE [VALIDATION]
              precision    recall  f1-score   support

       False       0.58      0.70      0.64     21571
        True       0.63      0.51      0.56     22002

    accuracy                           0.60     43573
   macro avg       0.61      0.60      0.60     43573
weighted avg       0.61      0.60      0.60     43573



### FakeNews Test Results

Evaluate the fitted model on the FakeNews test split.

In [None]:
# Read the FakeNews test split; the first CSV column becomes the row index.
src = 'data/test_data_features.csv'
test_data = pd.read_csv(src, index_col=0)

# Select the same feature columns and 'reliable' target used for fitting.
X_test, y_test = test_data[features], test_data['reliable']

# Predict with the fitted naive Bayes model, then print the per-class report
# under a header identifying the split.
test_predictions = nb_model.predict(X_test)

print('NAIVE BAYES w/ BASELINE [TEST]')
print(classification_report(y_test, test_predictions))

### LIAR Test Results

Evaluate the fitted model on the LIAR test data.

In [None]:
# Read the LIAR test features; the first CSV column becomes the row index.
# NOTE(review): assumes this file exposes the same feature columns and a
# 'reliable' column like the FakeNews CSVs do; confirm against the file.
src = 'data/liar_dataset/test_features.csv'
liar_data = pd.read_csv(src, index_col=0)

# Select the feature matrix and the target column.
X_liar, y_liar = liar_data[features], liar_data['reliable']

# Predict with the fitted naive Bayes model, then print the per-class report
# under a header identifying the dataset.
liar_predictions = nb_model.predict(X_liar)

print('NAIVE BAYES w/ BASELINE [LIAR TEST]')
print(classification_report(y_liar, liar_predictions))