# Part 2: Simple model

## Task 0: Grouping labels

In [None]:
import pandas as pd

# Input file for this part. The alternative path 'data/995,000_rows.csv' is
# kept here for reference (judging by the file names, the non-cleaned variant).
# src = 'data/995,000_rows.csv'
src = 'data/995,000_rows_cleaned.csv'
raw_data = pd.read_csv(filepath_or_buffer=src)

In [None]:
# Share of each value in the 'type' column, as a percentage of all counted rows
type_dist = raw_data['type'].value_counts(normalize=True).mul(100)
print(type_dist)

In [None]:
# Labels that are excluded from the binary fake/reliable task.
# NOTE(review): the last entry looks like a timestamp that ended up in the
# 'type' column -- presumably from a malformed row; confirm against the data.
omitted_types = ['political',
                 'bias',
                 'rumor',
                 'unknown',
                 'unreliable',
                 'clickbait',
                 'junksci',
                 'hate',
                 '2018-02-10 13:43:39.521661'
                 ]

# Dataframe for grouped types: keep only the rows whose label is not omitted.
# A single boolean mask replaces one full-frame drop per omitted label, and it
# filters by row rather than by index label. The .copy() gives grouped_data its
# own data, so later column assignments never touch raw_data.
grouped_data = raw_data[~raw_data['type'].isin(omitted_types)].copy()

# Labels that are merged into the single 'fake' class
fake_types = {'fake', 'satire', 'conspiracy'}

# Labels that make up the 'reliable' class
reliable_types = {'reliable'}


def change_to_fake(type: str) -> str:
    """Map a label to 'fake' if it is in ``fake_types``; otherwise return it unchanged."""
    return 'fake' if type in fake_types else type
    
# Apply change_to_fake to every label and write the result back to 'type'
relabelled_types = grouped_data['type'].apply(change_to_fake)
grouped_data['type'] = relabelled_types

# Percentage distribution of the labels after re-labelling
type_dist = grouped_data['type'].value_counts(normalize=True).mul(100)
print(type_dist)

### Logistic Regression model

In [None]:
# Code reference: https://www.freecodecamp.org/news/how-to-build-and-train-linear-and-logistic-regression-ml-models-in-python/

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fixed seed for the train/test split so the reported metrics are the same on
# every run instead of depending on a fresh random shuffle.
RANDOM_STATE = 42

# Columns removed from the frame before the feature matrix is built.
DROPPED_COLUMNS = [
    'Unnamed: 0.2', 'Unnamed: 0.1', 'content', 'Unnamed: 0', 'id',
    'url', 'scraped_at', 'inserted_at', 'updated_at', 'title',
    'authors', 'keywords', 'meta_keywords', 'meta_description', 'tags',
    'summary', 'source', 'content_clean', 'content_stopword',
]

raw_data = pd.read_csv('data/995,000_rows_cleaned_SAMPLE.csv')
# raw_data = df

# One-hot encode the label column. drop_first=True removes the first category,
# and the code below expects the remaining column to be named 'reliable'.
# NOTE(review): assumes 'type' only holds 'fake' and 'reliable' in this file --
# confirm; any additional label would add a dummy column that stays in x_data.
type_data = pd.get_dummies(raw_data['type'], drop_first=True)
raw_data = pd.concat([raw_data, type_data], axis=1)

# Remove the columns listed above (reassignment instead of inplace=True).
raw_data = raw_data.drop(columns=DROPPED_COLUMNS)

# Target is the 'reliable' dummy column; features are whatever remains after
# the label, domain and stemmed-text columns are removed.
y_data = raw_data['reliable']
x_data = raw_data.drop(columns=['reliable', 'type', 'domain', 'content_stem'])

# Hold out 10% of the rows for testing, with a seeded shuffle.
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(
    x_data, y_data, test_size=0.1, random_state=RANDOM_STATE)

# Create the logistic regression model and train it
model = LogisticRegression()
model.fit(x_training_data, y_training_data)

# Predict on the held-out rows and report performance
predictions = model.predict(x_test_data)
print(classification_report(y_test_data, predictions))