# Building reliability classifier

In [40]:
import json
import pandas as pd
from collections import Counter

In [2]:
reliability = json.load(open("reliability.json", "r"))

In [4]:
rep = json.load(open("pmf_rep.json", "r"))

In [8]:
len(rep), len(reliability)

(740, 1285)

In [9]:
rel = {i:v for i,v in reliability.items() if i in rep}

In [19]:
newssources = list(rel.keys())

## We make a pandas dataframe

In [24]:
data = pd.DataFrame.from_dict({})
data.head()

In [25]:
data["newssource"] = newssources
data.head()

Unnamed: 0,newssource
0,esquire.com
1,wsj.com
2,roanoke.com
3,dcist.com
4,haaretz.com


In [28]:
data["reliability"] = [1 if rel[ns]=="reliable" else 0 for ns in newssources]
data.head()

Unnamed: 0,newssource,reliability
0,esquire.com,1
1,wsj.com,1
2,roanoke.com,1
3,dcist.com,1
4,haaretz.com,1


In [29]:
rep_list = [rep[n] for n in newssources]

In [30]:
num_features = len(rep["wsj.com"]) 
num_features

25

In [31]:
for f in range(num_features):
    data[f"x{f+1}"] = [r[f] for r in rep_list]
data.head()

Unnamed: 0,newssource,reliability,x1,x2,x3,x4,x5,x6,x7,x8,...,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25
0,esquire.com,1,-0.683338,-1.074086,-0.368772,1.167273,1.043615,0.851311,2.101009,-1.134868,...,-0.630597,0.713705,0.308463,1.141287,0.19575,0.370277,0.960097,-0.778258,0.612087,0.729098
1,wsj.com,1,-7.113049,-10.187653,-4.849425,-4.513893,-4.000742,12.499324,4.817034,-9.149549,...,8.928319,7.974508,2.060963,-0.627666,2.344771,5.182325,-6.11728,0.21056,-9.134255,3.322232
2,roanoke.com,1,-0.135091,0.421313,0.102859,0.044451,0.0505,-0.091282,0.232027,-0.215155,...,-0.14642,-0.116556,-0.265552,0.177097,0.015635,-0.159217,0.119482,-0.019818,0.247971,0.113681
3,dcist.com,1,-0.130227,0.21231,0.142515,0.265669,0.036171,-0.132904,0.280048,0.042349,...,-0.083802,0.023901,-0.120396,0.298498,0.137096,-0.407272,0.067081,-0.059157,0.173141,0.171144
4,haaretz.com,1,-0.747731,-0.092488,-1.608726,1.686076,1.45774,1.399745,0.564752,0.190013,...,0.887405,0.985561,-1.195829,1.297947,0.052915,-1.111117,0.771824,0.302276,0.864714,0.011002


## Making classifier

In [86]:
import numpy as np
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from random import sample, seed

In [38]:
!python3 -m pip install -U mljar-supervised

Defaulting to user installation because normal site-packages is not writeable
Collecting mljar-supervised
  Using cached mljar_supervised-1.1.7-py3-none-any.whl
Collecting scipy<=1.11.4,>=1.6.1 (from mljar-supervised)
  Using cached scipy-1.11.4-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting xgboost>=2.0.0 (from mljar-supervised)
  Using cached xgboost-2.0.3-py3-none-macosx_12_0_arm64.whl.metadata (2.0 kB)
Collecting lightgbm>=3.0.0 (from mljar-supervised)
  Using cached lightgbm-4.3.0.tar.gz (1.7 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting catboost>=0.24.4 (from mljar-supervised)
  Using cached catboost-1.2.5-cp39-cp39-macosx_11_0_universal2.whl.metadata (1.2 kB)
Collecting tabulate>=0.8.7 (from mljar-supervised)
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Co

In [42]:
Counter(list(data["reliability"]))

Counter({1: 692, 0: 48})

In [72]:
# let's sample 72 reliable news sources

In [74]:
reliable_news_sources = [ns for ns in newssources if rel[ns] == "reliable"]
len(reliable_news_sources)

692

In [83]:
unreliable_news_sources = [ns for ns in newssources if rel[ns] == "unreliable"]
len(unreliable_news_sources)

48

In [109]:
# sample 72
seed(100)
relnews72 = sample(reliable_news_sources,72)
len(relnews72)

72

In [110]:
data120 = data[data["newssource"].isin(relnews72 + unreliable_news_sources)]
data120.shape

(120, 27)

In [111]:
feature_names = [x for x in list(data.columns) if x not in ["reliability", "newssource"]]
feature_names

['x1',
 'x2',
 'x3',
 'x4',
 'x5',
 'x6',
 'x7',
 'x8',
 'x9',
 'x10',
 'x11',
 'x12',
 'x13',
 'x14',
 'x15',
 'x16',
 'x17',
 'x18',
 'x19',
 'x20',
 'x21',
 'x22',
 'x23',
 'x24',
 'x25']

In [112]:
X = data120[feature_names]
X.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25
5,-0.138304,0.021374,0.034156,0.004842,-0.130027,-0.064214,-0.051375,-0.07672,-0.155781,0.044226,...,-0.146992,-0.167734,-0.065738,-0.053205,0.120885,-0.075448,0.056509,-0.015277,0.145953,-0.010875
10,-9.750181,-1.563661,-8.436184,-2.209081,-5.050649,12.715886,4.734096,-11.401564,2.539832,-18.356282,...,6.711782,3.86897,-14.119403,-0.213117,7.74444,7.949606,-3.419666,7.513987,-2.178696,3.60588
11,-0.733211,0.167495,0.198854,0.484033,1.351267,-0.020986,0.227879,0.26481,1.037426,0.394845,...,0.511918,0.20185,-1.041993,1.14366,-0.171228,0.293041,0.707385,0.128686,0.067158,0.277267
24,-0.056301,0.103733,-0.01883,0.115594,-0.048967,-0.0858,0.025003,-0.073557,-0.07213,-0.179809,...,-0.019752,-0.165877,-0.145369,0.227722,0.075115,-0.108627,-0.000798,-0.036198,-0.073251,-0.010628
29,-0.000879,0.068027,0.107667,0.075886,0.409682,-0.110251,0.543334,-0.314552,-0.222827,0.359664,...,-0.180175,-0.070403,-0.133538,0.189914,0.248513,0.010302,-0.05907,0.087918,-0.004462,-0.166378


In [113]:
y = data120["reliability"]
y

5      1
10     0
11     0
24     0
29     1
      ..
701    1
711    1
712    1
722    1
729    1
Name: reliability, Length: 120, dtype: int64

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                                    random_state= 42, 
                                                    stratify= y)

In [115]:
Counter(y_train)

Counter({1: 54, 0: 36})

In [116]:
Counter(y_test)

Counter({1: 18, 0: 12})

In [117]:
automl = AutoML(algorithms=["Decision Tree", "Linear", "Random Forest"], 
                total_time_limit=5*60) 
automl.fit(X, y)

AutoML directory: AutoML_6
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
'module' object is not callable
1_DecisionTree logloss 3.511583 trained in 6.88 seconds
'module' object is not callable
2_Linear logloss 0.759778 trained in 3.87 seconds
* Step default_algorithms will try to check up to 1 model
'module' object is not callable
3_Default_RandomForest logloss 0.527996 trained in 2.13 seconds
* Step ensemble will try to check up to 1 model
'module' object is not callable
Ensemble logloss 0.527996 trained in 0.63 seconds
AutoML fit time: 19.68 seconds
AutoML best model: 3_Default_RandomForest


In [118]:
y_predicted = automl.predict(X_test)
y_predicted

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1])

In [119]:
print(pd.DataFrame({"Predicted": y_predicted, 
                    "Target": np.array(y_test)}))

    Predicted  Target
0           1       1
1           1       1
2           1       1
3           0       0
4           1       0
5           1       1
6           1       0
7           1       1
8           1       0
9           1       1
10          1       1
11          1       1
12          0       0
13          1       0
14          1       1
15          1       1
16          1       1
17          1       1
18          0       0
19          1       1
20          1       1
21          0       0
22          0       0
23          1       1
24          0       0
25          1       1
26          0       0
27          1       1
28          0       0
29          1       1
