# Simple Logistic Regression Model w/ Extended Training Set

Scraped BBC articles are added to the training data. (Part 2, task 3)

In [None]:
import pandas as pd

# Read the three feature CSVs in a fixed order: grouped training data,
# grouped validation data, and the BBC articles. In every file the first
# CSV column is used as the DataFrame index (index_col=0).
loaded_frames = []
for src in (
    'data/training_data_features.csv',
    'data/validation_data_features.csv',
    'data/articles_features.csv',
):
    loaded_frames.append(pd.read_csv(src, index_col=0))

# Unpack in the same order as the paths above.
training_data_raw, validation_data_raw, bbc_data_raw = loaded_frames


In [None]:
# Collect the column names of each dataset as plain lists ...
training_data_cols = [*training_data_raw.columns]
validation_data_cols = [*validation_data_raw.columns]
bbc_data_cols = [*bbc_data_raw.columns]

# ... and print them one after another so the three schemas can be compared
# by eye before the frames are concatenated.
print('Training data cols:', training_data_cols)
print('Validation data cols:', validation_data_cols)
print('BBC data cols:', bbc_data_cols)

In [None]:
# Extend the training set with the BBC articles. Rows are stacked (axis=0)
# and columns are combined by union (join='outer'), so a column present in
# only one of the frames is filled with NaN for the other frame's rows.
# The original index labels of both frames are kept (no re-numbering).
training_data_extended = pd.concat(
    objs=[training_data_raw, bbc_data_raw],
    axis=0,
    join='outer',
)

### LogReg model

In [None]:
# Code reference: https://www.freecodecamp.org/news/how-to-build-and-train-linear-and-logistic-regression-ml-models-in-python/

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Columns excluded from the model input. The commented-out names are feature
# columns that are currently kept; uncomment an entry to exclude that feature
# from training as well.
drop_lst = [
    'type',
    'content',
    'title',
    'authors',
    'content_clean',
    'content_stopword',
    'content_stem',
    # 'num_count',
    # 'date_count',
    # 'url_count',
    # 'comma_count',
    # 'exclm_count',
    # 'content_word_freq',
    # 'stop_word_freq',
    # 'stem_word_freq',
    # 'stop_reduction_rate',
    # 'stem_reduction_rate',
    # 'average_sentence_length',
    # 'has_author'
]

# DataFrame.drop returns a new frame, so the source frames
# (training_data_extended / validation_data_raw) are left unmodified and no
# explicit deep copy followed by an in-place drop is needed.
training_data = training_data_extended.drop(columns=drop_lst)
validation_data = validation_data_raw.drop(columns=drop_lst)

# The 'reliable' column is the prediction target (y) ...
y_training_data = training_data['reliable']
y_validation_data = validation_data['reliable']

# ... and every remaining column is a model input feature (x).
x_training_data = training_data.drop(columns=['reliable'])
x_validation_data = validation_data.drop(columns=['reliable'])

# Create the logistic regression model with scikit-learn's default settings
# and fit it on the extended training set.
model = LogisticRegression()
model.fit(x_training_data, y_training_data)

# Predict on the validation set and report the classification metrics.
predictions = model.predict(x_validation_data)

print('LOGISTIC REGRESSION w/ EXTENDED TRAINING SET (BBC ARTICLES)')
print(classification_report(y_validation_data, predictions))

In [None]:
# Map each input feature name to its learned coefficient.
# model.coef_[0] is the first row of the coefficient matrix; presumably the
# only row, assuming 'reliable' is a binary target (TODO confirm).
{
    feature: weight
    for feature, weight in zip(model.feature_names_in_, model.coef_[0])
}