# Part 2: Simple model

## Task 0: Grouping labels

In [None]:
import pandas as pd

# Other source files (currently disabled):
# src = 'data/995,000_rows_cleaned.csv'
# src = 'data/news_sample_cleaned.csv'

# Locations of the training and validation CSV files.
src_train = 'data/training_data.csv'
src_valid = 'data/validation_data.csv'

# Read each split into its own DataFrame.
train_data = pd.read_csv(src_train)
valid_data = pd.read_csv(src_valid)

In [None]:
import pandas as pd

# Labels whose rows are discarded entirely before modelling.
omitted_types = {
    'political', 'bias', 'rumor', 'unknown',
    'unreliable', 'clickbait', 'junksci', 'hate',
    # NOTE(review): looks like a timestamp that ended up in the 'type'
    # column, presumably from a malformed row -- confirm against the data.
    '2018-02-10 13:43:39.521661',
}

# Labels that are collapsed into the single class 'fake'.
fake_types = {'fake', 'satire', 'conspiracy'}

# Labels that make up the 'reliable' class.
reliable_types = {'reliable'}

def group_data(df: pd.DataFrame, omitted_types: set, fake_types: set) -> pd.DataFrame:
    """Return a copy of *df* with omitted labels removed and fake labels merged.

    Rows whose ``'type'`` value is in *omitted_types* are removed. In the
    remaining rows every ``'type'`` value found in *fake_types* is replaced
    by the string ``'fake'``; values in neither set are left untouched.
    The input frame is not modified.

    Args:
        df: Frame containing a ``'type'`` column.
        omitted_types: Labels whose rows are discarded.
        fake_types: Labels that are rewritten to ``'fake'``.

    Returns:
        A new DataFrame holding the surviving rows under their original
        index labels.

    Raises:
        KeyError: If *df* has no ``'type'`` column.
    """
    # Select rows with a positional boolean mask instead of dropping by index
    # label: a label-based drop would also remove kept rows that happen to
    # share an index label with an omitted row when the index is not unique.
    keep_rows = ~df['type'].isin(omitted_types).to_numpy()
    df_out = df.loc[keep_rows].copy(deep=True)

    # Vectorised relabelling; rows outside ``fake_types`` keep their value.
    is_fake = df_out['type'].isin(fake_types).to_numpy()
    df_out['type'] = df_out['type'].mask(is_fake, 'fake')

    return df_out

In [None]:
# Collapse the labels of the training split, then write the result to disk.
# to_csv is called without index=False, so the row index is stored as well.
train_data_group = group_data(train_data, omitted_types, fake_types)
dst = 'data/training_data_grouped.csv'
train_data_group.to_csv(dst)

# Same treatment for the validation split.
valid_data_group = group_data(valid_data, omitted_types, fake_types)
dst = 'data/validation_data_grouped.csv'
valid_data_group.to_csv(dst)

In [None]:
# Share of each label in the grouped training data, in percent.
type_dist = train_data_group['type'].value_counts(normalize=True).mul(100)
print("TRANING DATA:")
print(type_dist)

# Share of each label in the grouped validation data, in percent.
type_dist = valid_data_group['type'].value_counts(normalize=True).mul(100)
print("VALIDATION DATA:")
print(type_dist)

### Logistic Regression model

In [None]:
import pandas as pd

# load grouped training data
src = 'data/training_data_grouped.csv'
train_data_grp = pd.read_csv(src)

# load grouped validation data into its own variable, so the training frame
# read just above is not overwritten
src = 'data/validation_data_grouped.csv'
valid_data_grp = pd.read_csv(src)

In [None]:
# Code reference: https://www.freecodecamp.org/news/how-to-build-and-train-linear-and-logistic-regression-ml-models-in-python/

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Work on a copy of the grouped frame read from CSV in the previous cell.
# `group_data` is the grouping function, not a DataFrame, so it has no
# `.copy` method and cannot be used as the data source here.
model_data = train_data_grp.copy(deep=True)

# Binary target: True where the grouped label is 'reliable', False otherwise.
# A direct comparison always yields exactly one target column; one-hot
# encoding with drop_first would add one column per remaining label, and any
# extra label column would end up among the features.
model_data['reliable'] = model_data['type'] == 'reliable'

# Remove the raw label and the text columns so they are not used as features.
model_data.drop(['type',
                 'content',
                 'title',
                 'authors',
                 'content_clean',
                 'content_stopword',
                 'content_stem'
                 ], axis=1, inplace=True)

# Separate the target column from the feature columns.
y_data = model_data['reliable']
x_data = model_data.drop(['reliable'], axis=1)

# Hold out 10% of the rows for testing.
# NOTE(review): no random_state is passed, so the split differs between runs.
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(
    x_data, y_data, test_size=0.1)

# Create the logistic regression model and fit it on the training part.
model = LogisticRegression()
model.fit(x_training_data, y_training_data)

# Predict on the held-out part and report per-class metrics.
predictions = model.predict(x_test_data)
print(classification_report(y_test_data, predictions))