In [19]:
import os
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import json
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc
from sklearn.linear_model import LogisticRegression
from scipy import sparse

In [20]:
def jsonl_to_df(file_path):
    with open(file_path) as f:
        lines = f.read().splitlines()

    df_inter = pd.DataFrame(lines)
    df_inter.columns = ['json_element']

    df_inter['json_element'].apply(json.loads)

    return pd.json_normalize(df_inter['json_element'].apply(json.loads))

# Load the dataset

Note that the label:
1 - reliable
0 - unreliable

In [23]:
FILE_PATH = './data/nela_gt_2018_site_split/'

train_df = jsonl_to_df(FILE_PATH + 'train.jsonl')
train_df['split'] = 'train'
dev_df = jsonl_to_df(FILE_PATH + 'dev.jsonl')
dev_df['split'] = 'dev'
test_df = jsonl_to_df(FILE_PATH + 'test.jsonl')
test_df['split'] = 'test'

df = pd.concat([train_df, dev_df, test_df])
df.sample(10)

Unnamed: 0,content,title,date,source,label,split
5887,Britain's most prolific bird egg thief single-...,Britains most prolific bird egg thief single-h...,2018-10-12,thetelegraph,1,dev
10217,HARRIS FAULKNER (CO-HOST): I want to just hit ...,Fox Host Trumps lawyer didnt want him meeting ...,2018-11-30,mediamattersforamerica,1,train
24568,It feels as though 2018 is the year of rapid i...,Typhoon Yutu could strike Guam as a Category 5...,2018-10-24,sottnet,0,dev
20124,If the turn of the year means one thing for Ar...,Arsenal transfer news The players out of contr...,2018-11-27,eveningstandard,1,test
21915,The Tupolev Aircraft Company will develop the ...,Russia to modernize Tupolev Tu-95MS strategic ...,2018-08-14,sottnet,0,dev
22869,The Florida Senate race the country's most ex...,The Florida Senate race between Nelson and Sco...,2018-11-07,cnbc,1,train
21601,As US President Donald Trump read a statement ...,Lights go out at White House as Trump pledges ...,2018-07-17,rt,0,dev
37752,Is the /pol/ Prophecy About to be Fulfilled?\n...,Is the pol Prophecy About to be Fulfilled,2018-11-23,dailystormer,0,train
20576,the move is unclear as the people of both coun...,Curious and hostile Reports Austria granting c...,2018-07-26,sottnet,0,dev
21270,ER actress Vanessa Marquez has died after bein...,ER actress Vanessa Marquez shot dead by police...,2018-09-01,eveningstandard,1,test


The splitting script provided here (https://github.com/alexa/unreliable-news-detection-biases) produces balanced data between classes and all articles from each news source are contained within the splits. 

In [24]:
table = pd.pivot_table(df, values='title', index=['split'], columns=['label'], aggfunc='count', margins='all')
table

label,0,1,All
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dev,20294,20294,40588
test,19410,19410,38820
train,35302,35302,70604
All,75006,75006,150012


In [25]:
df.fillna('', inplace=True)

# Logistic Regression (Title Only)
Train a logistic regression model using Tfidf features to serve as baseline

In [26]:
X_train = df[df['split'] == 'train']['title']
y_train = df[df['split'] == 'train']['label']

X_test = df[df['split'] == 'test']['title']
y_test = df[df['split'] == 'test']['label']

print(f'X_train: {X_train.shape}\nX_test: {X_test.shape}')

X_train: (70604,)
X_test: (38820,)


In [27]:
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words= 'english',ngram_range=(3,6),dtype=np.float32)

In [28]:
X_train_vect = sparse.hstack([vect_word.fit_transform(X_train), vect_char.fit_transform(X_train)])
X_test_vect = sparse.hstack([vect_word.fit_transform(X_test), vect_char.fit_transform(X_test)])

In [29]:
lr = LogisticRegression(C=1, max_iter=500, random_state = 42)
lr.fit(X_train_vect, y_train)
y_pred = lr.predict(X_test_vect)

In [30]:
print('\nConfusion matrix\n',confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))


Confusion matrix
 [[ 1725 17685]
 [ 1408 18002]]
              precision    recall  f1-score   support

           0       0.55      0.09      0.15     19410
           1       0.50      0.93      0.65     19410

    accuracy                           0.51     38820
   macro avg       0.53      0.51      0.40     38820
weighted avg       0.53      0.51      0.40     38820



Logistic regression has an accuracy of 51% and an f1-score of 40%