# HPF: building reliability classifier

In [2]:
import json
import pandas as pd
from collections import Counter

In [3]:
reliability = json.load(open("reliability.json", "r"))

In [4]:
rep = json.load(open("hpf_repr.json", "r"))

In [5]:
len(rep), len(reliability)

(740, 1285)

In [6]:
rel = {i:v for i,v in reliability.items() if i in rep}

In [7]:
newssources = list(rel.keys())

## We make a pandas dataframe

In [8]:
data = pd.DataFrame.from_dict({})
data.head()

In [9]:
data["newssource"] = newssources
data.head()

Unnamed: 0,newssource
0,esquire.com
1,wsj.com
2,roanoke.com
3,dcist.com
4,haaretz.com


In [10]:
data["reliability"] = [1 if rel[ns]=="reliable" else 0 for ns in newssources]
data.head()

Unnamed: 0,newssource,reliability
0,esquire.com,1
1,wsj.com,1
2,roanoke.com,1
3,dcist.com,1
4,haaretz.com,1


In [11]:
rep_list = [rep[n] for n in newssources]

In [12]:
num_features = len(rep["wsj.com"]) 
num_features

20

In [14]:
for f in range(num_features):
    data[f"x{f+1}"] = [r[f] for r in rep_list]
data.head()

Unnamed: 0,newssource,reliability,x1,x2,x3,x4,x5,x6,x7,x8,...,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20
0,esquire.com,1,4e-06,0.002795,4.1e-05,1.7e-05,0.000453,0.001727,4.3e-05,2.5e-05,...,2e-05,3e-05,1.9e-05,7e-06,0.087414,1.9e-05,3e-06,3.3e-05,5.3e-05,9.6e-05
1,wsj.com,1,4e-06,0.000102,4.3e-05,1.5e-05,0.008389,0.374472,0.000149,2.5e-05,...,2e-05,3e-05,0.165728,7e-06,0.161615,1.9e-05,3e-06,0.000476,0.226667,0.004529
2,roanoke.com,1,2.9e-05,3.6e-05,0.012787,1.5e-05,0.000173,1.7e-05,0.006418,2.5e-05,...,1.9e-05,3e-05,1.6e-05,7e-06,0.003604,1.9e-05,3e-06,3.1e-05,5.9e-05,5.3e-05
3,dcist.com,1,4e-06,3.6e-05,4.2e-05,1.5e-05,2.3e-05,1.5e-05,4.8e-05,2.4e-05,...,1.9e-05,3e-05,1.6e-05,7e-06,0.018249,1.9e-05,3e-06,3e-05,5.2e-05,5.3e-05
4,haaretz.com,1,4e-06,3.6e-05,4.1e-05,1.6e-05,2.3e-05,0.051647,4.1e-05,2.5e-05,...,1.9e-05,3e-05,1.7e-05,7e-06,0.032927,1.9e-05,3e-06,3e-05,5.3e-05,0.001199


## Making classifier

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from random import sample, seed

In [16]:
# !python3 -m pip install -U mljar-supervised

In [17]:
Counter(list(data["reliability"]))

Counter({1: 692, 0: 48})

In [18]:
# let's sample 72 reliable news sources

In [19]:
reliable_news_sources = [ns for ns in newssources if rel[ns] == "reliable"]
len(reliable_news_sources)

692

In [53]:
unreliable_news_sources = [ns for ns in newssources if rel[ns] == "unreliable"]
len(unreliable_news_sources)

48

In [63]:
# sample 62
seed(100)
relnews72 = sample(reliable_news_sources,72)
len(relnews72)

72

In [64]:
data120 = data[data["newssource"].isin(relnews72 + unreliable_news_sources)]
data120.shape

(120, 22)

In [65]:
feature_names = [x for x in list(data.columns) if x not in ["reliability", "newssource"]]
feature_names

['x1',
 'x2',
 'x3',
 'x4',
 'x5',
 'x6',
 'x7',
 'x8',
 'x9',
 'x10',
 'x11',
 'x12',
 'x13',
 'x14',
 'x15',
 'x16',
 'x17',
 'x18',
 'x19',
 'x20']

In [66]:
X = data120[feature_names]
X.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20
5,6e-05,3.5e-05,4.1e-05,1.5e-05,2.5e-05,1.5e-05,0.008594,2.5e-05,2.2e-05,3.1e-05,2.7e-05,3.6e-05,1.6e-05,8e-06,0.000747,2.1e-05,3e-06,3.2e-05,5.5e-05,5.9e-05
10,4e-06,3.5e-05,0.402569,1.5e-05,2.3e-05,1.5e-05,4e-05,2.5e-05,2.2e-05,2.9e-05,1.9e-05,3e-05,1.6e-05,7e-06,2.3e-05,1.9e-05,3e-06,3e-05,0.561503,5.4e-05
11,4e-06,3.6e-05,4.5e-05,1.6e-05,2.3e-05,0.063136,0.000421,2.5e-05,2.3e-05,3e-05,1.9e-05,3e-05,1.7e-05,7e-06,2.1e-05,1.9e-05,3e-06,3e-05,0.00465,5.5e-05
24,4e-06,3.6e-05,4.4e-05,1.5e-05,2.3e-05,3.2e-05,4.3e-05,2.5e-05,2.2e-05,3e-05,2e-05,3e-05,1.7e-05,7e-06,0.000983,1.9e-05,3e-06,0.000135,0.015672,0.001402
29,4e-06,3.7e-05,0.019708,1.6e-05,0.015271,1.8e-05,0.000421,0.000107,0.00455,3.8e-05,2e-05,3e-05,1.7e-05,7e-06,0.00708,1.9e-05,3e-06,3.3e-05,0.00355,0.004562


In [67]:
y = data120["reliability"]
y

5      1
10     0
11     0
24     0
29     1
      ..
701    1
711    1
712    1
722    1
729    1
Name: reliability, Length: 120, dtype: int64

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                                    random_state= 42, 
                                                    stratify= y)

In [69]:
Counter(y_train)

Counter({1: 54, 0: 36})

In [70]:
Counter(y_test)

Counter({1: 18, 0: 12})

In [71]:
automl = AutoML(algorithms=['Baseline', 'CatBoost', 'Decision Tree', 
                            'Extra Trees', 'Nearest Neighbors', 'LightGBM', 
                            'Linear', 'Neural Network', 'Random Forest', 'Xgboost'], 
                total_time_limit=5*60) 
automl.fit(X, y)

AutoML directory: AutoML_8
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Decision Tree', 'Extra Trees', 'Nearest Neighbors', 'LightGBM', 'Linear', 'Neural Network', 'Random Forest', 'Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 3 models
'module' object is not callable
1_Baseline logloss 0.673012 trained in 0.28 seconds
'module' object is not callable
2_DecisionTree logloss 4.917023 trained in 1.7 seconds
'module' object is not callable
3_Linear logloss 0.620155 trained in 3.87 seconds
* Step default_algorithms will try to check up to 7 models
'module' object is not callable
4_Default_LightGBM logloss 0.549539 trained in 5.26 seconds
'module' object is not callable
5_Default_Xgboost logloss 0.575748 trained in 4.65 seconds
'module' object is not callable
6_Default_CatBoost logloss 0.631223 train

In [31]:
y_predicted = automl.predict(X_test)
y_predicted

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1])

In [32]:
print(pd.DataFrame({"Predicted": y_predicted, 
                    "Target": np.array(y_test)}))

    Predicted  Target
0           1       1
1           1       1
2           1       1
3           0       0
4           0       0
5           1       1
6           0       0
7           1       1
8           1       0
9           1       1
10          0       1
11          1       1
12          0       0
13          0       0
14          1       1
15          1       1
16          1       1
17          1       1
18          0       0
19          1       1
20          1       1
21          0       0
22          1       0
23          1       1
24          1       0
25          1       1
26          1       0
27          1       1
28          0       0
29          1       1
