# Simple Logistic Regression Model

In [None]:
import pandas as pd

# Locations of the two feature tables (CSV), one per data split.
training_src = 'data/training_data_features.csv'
validation_src = 'data/validation_data_features.csv'

# Read each split into its own DataFrame; these "raw" frames are the
# starting point for the later cells.
training_data_raw = pd.read_csv(training_src)
validation_data_raw = pd.read_csv(validation_src)

# `src` ends up bound to the last path that was read, as it did before.
src = validation_src

In [None]:
# Collect the column names of each split as plain lists and print them,
# so the two schemas can be compared by eye.
training_data_cols = training_data_raw.columns.tolist()
validation_data_cols = validation_data_raw.columns.tolist()

print('Training data cols:', training_data_cols)
print('Validation data cols:', validation_data_cols)

### Train the model

In [None]:
# Code reference: https://www.freecodecamp.org/news/how-to-build-and-train-linear-and-logistic-regression-ml-models-in-python/

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Columns removed from BOTH splits before modelling. A single shared list
# keeps the training and validation frames on the same set of feature
# columns, so the model is evaluated on exactly what it was fitted on.
NON_FEATURE_COLS = [
    'Unnamed: 0',
    'type',
    'content',
    'title',
    'authors',
    'content_clean',
    'content_stopword',
    'content_stem',
    # Left commented out, so these columns (if present) stay in the model input:
    # 'stop_word_freq',
    # 'stem_word_freq',
    # 'stem_reduction_rate'
]

# Name of the target column the model learns to predict.
TARGET_COL = 'reliable'

# `drop` without `inplace` returns a new frame, so the raw frames loaded
# earlier are left untouched and no explicit deep copy is needed.
# A KeyError here means one of NON_FEATURE_COLS is missing from the CSV.
training_data = training_data_raw.drop(columns=NON_FEATURE_COLS)
validation_data = validation_data_raw.drop(columns=NON_FEATURE_COLS)

# Split each frame into the target (y) and the model input (x).
y_training_data = training_data[TARGET_COL]
y_validation_data = validation_data[TARGET_COL]

# x data are all remaining columns; the target itself is dropped from the input
# so the model cannot see the answer it is supposed to predict.
x_training_data = training_data.drop(columns=[TARGET_COL])
x_validation_data = validation_data.drop(columns=[TARGET_COL])

# Create a logistic regression model with scikit-learn's default settings
# and fit it on the training split.
model = LogisticRegression()
model.fit(x_training_data, y_training_data)

# Predict on the validation split and print a per-class performance report.
predictions = model.predict(x_validation_data)
print(classification_report(y_validation_data, predictions))

In [None]:
# Pair every input feature name with its fitted coefficient.
# `coef_` is 2-D; row 0 is the row read here (the only row when the target
# is binary). Left as the last expression so the notebook displays the mapping.
{
    feature_name: weight
    for feature_name, weight in zip(model.feature_names_in_, model.coef_[0])
}