# Simple Logistic Regression Model

In [None]:
import pandas as pd

# Read the feature tables for the FakeNews training and validation splits.
# index_col=0 makes the first CSV column the DataFrame index.
training_data = pd.read_csv('data/training_data_features.csv', index_col=0)
validation_data = pd.read_csv('data/validation_data_features.csv', index_col=0)

### LogReg with Baseline features

The model is trained on the FakeNews training set and evaluated on the FakeNews validation set.

In [None]:
# Code reference: https://www.freecodecamp.org/news/how-to-build-and-train-linear-and-logistic-regression-ml-models-in-python/

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, mean_squared_error

# Baseline feature columns fed to the classifier; the later evaluation
# cells select the same columns through this list.
features = [
    'date_count', 'url_count', 'exclm_count',
    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
    'stop_reduction_rate', 'stem_reduction_rate',
    'average_sentence_length',
]

# Training split: feature matrix and the 'reliable' target column.
X_training, y_training = training_data[features], training_data['reliable']

# Validation split, selected the same way.
X_validation, y_validation = validation_data[features], validation_data['reliable']

# Fit the classifier (fit() returns the estimator itself).
# max_iter=1000 raises the solver's iteration cap above scikit-learn's default of 100.
model = LogisticRegression(max_iter=1000).fit(X_training, y_training)

# Predict labels for the validation split and print the per-class report.
predictions = model.predict(X_validation)

print('LOGISTIC REGRESSION w/ BASELINE (VALIDATION)')
print(classification_report(y_validation, predictions))

# Note: this MSE is computed on the predicted labels, not on probabilities.
print('MSE:', mean_squared_error(y_validation, predictions))

In [None]:
# Map each input feature name to its fitted coefficient.
# coef_[0] selects the first row of the model's coefficient array.
{name: weight for name, weight in zip(model.feature_names_in_, model.coef_[0])}

### FakeNews Test Results

Evaluation of the trained model on the FakeNews test split.

In [None]:
# Read the test-split feature table; the first CSV column is the index.
test_data = pd.read_csv('data/test_data_features.csv', index_col=0)

# Same feature columns and target column that the model was trained with.
X_test, y_test = test_data[features], test_data['reliable']

# Score the already-fitted model on the test split and print the per-class report.
test_predictions = model.predict(X_test)

print('LOGISTIC REGRESSION w/ BASELINE [TEST]')
print(classification_report(y_test, test_predictions))

### LIAR Test Results

Evaluation of the same trained model on the LIAR test data.

In [None]:
# Location of the LIAR test-set feature table.
src = 'data/liar_dataset/test_features.csv'
liar_data = pd.read_csv(src, index_col=0)  # first CSV column becomes the index

# Target column first, then the baseline feature columns.
y_liar = liar_data['reliable']
X_liar = liar_data[features]

# Apply the previously fitted model as-is; nothing is refitted here.
liar_predictions = model.predict(X_liar)

print('LOGISTIC REGRESSION w/ BASELINE [LIAR TEST]')
print(classification_report(y_liar, liar_predictions))