In [1]:
import json
import statistics

import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from valerie.data import Claim
from valerie.modeling import ClaimantModel

# Phase 1
---

In [2]:
# Load the Phase 1 metadata and turn every entry into a Claim.
# The file is opened as UTF-8 explicitly: JSON is UTF-8 by specification, and
# the platform default encoding is not UTF-8 everywhere.
# The directory is spelled `all-data` to match the Combined and Classifier
# cells of this notebook, which read the same file under that name.
with open("data/phase1/all-data/metadata.json", encoding="utf-8") as fi:
    claims = [Claim.from_dict(claim) for claim in json.load(fi)]

In [3]:
# Summary table for the loaded claims; `return_df=True` asks for a DataFrame.
# NOTE(review): `min_threshold=10` presumably drops claimants with too few
# claims (smallest `total` in the displayed rows is 11) — confirm in
# ClaimantModel.analyze.
df = ClaimantModel.analyze(
    claims,
    min_threshold=10,
    return_df=True,
)

In [4]:
# Summary statistics of the `score` column, computed with the builtins and the
# `statistics` module and shown as the cell output.
score_column = df["score"]
summary_functions = {
    "min": min,
    "max": max,
    "median": statistics.median,
    "mean": statistics.mean,
}
metrics = {
    label: summarize(score_column)
    for label, summarize in summary_functions.items()
}
metrics

{'min': 0.0,
 'max': 0.8095238095238095,
 'median': 0.4090909090909091,
 'mean': 0.360639837247766}

In [5]:
# First ten rows of the table (the recorded output is in ascending score order).
df.head(10)

Unnamed: 0,false,partly,true,score,total
Multiple websites,11,0,0,0.0,11
Various websites,106,0,0,0.0,106
Viral meme,29,0,0,0.0,29
Social media posts,17,0,0,0.0,17
multiple sources,69,1,0,0.007143,70
AFP Fact Check,18,1,0,0.026316,19
YourNewsWire.com,13,1,0,0.035714,14
Michele Bachmann,12,1,0,0.038462,13
Viral image,115,12,0,0.047244,127
Multiple sources,43,7,0,0.07,50


In [6]:
# Last ten rows of the table (the recorded output is in ascending score order).
df.tail(10)

Unnamed: 0,false,partly,true,score,total
Cory Booker,2,6,3,0.545455,11
Tim Kaine,5,25,9,0.551282,39
Jeb Bush,6,40,12,0.551724,58
Mark Pocan,3,8,5,0.5625,16
Democratic National Committee,0,18,3,0.571429,21
William Ruto,2,6,4,0.583333,12
Sherrod Brown,2,6,5,0.615385,13
Andrew Cuomo,3,9,8,0.625,20
Gavin Newsom,1,10,6,0.647059,17
President Cyril Ramaphosa,3,2,16,0.809524,21


In [7]:
# Row whose index label is the empty string, with all of its columns.
df.loc["", :]

false     3034.000000
partly    1287.000000
true       641.000000
score        0.258867
total     4962.000000
Name: , dtype: float64

In [8]:
# Share of all loaded claims that fall under the empty-string index label.
df.loc["", "total"] / len(claims)

0.31899710703953715

# Phase 2
---

In [9]:
# Load the Phase 2 metadata and turn every entry into a Claim.
# The file is opened as UTF-8 explicitly: JSON is UTF-8 by specification, and
# the platform default encoding is not UTF-8 everywhere.
# The directory is spelled `all-data` to match the Combined cell of this
# notebook, which reads the same file under that name.
with open("data/phase2/all-data/metadata.json", encoding="utf-8") as fi:
    claims = [Claim.from_dict(claim) for claim in json.load(fi)]

In [10]:
# Summary table for the loaded claims; `return_df=True` asks for a DataFrame.
# NOTE(review): `min_threshold=10` presumably drops claimants with too few
# claims (smallest `total` in the displayed rows is 11) — confirm in
# ClaimantModel.analyze.
df = ClaimantModel.analyze(
    claims,
    min_threshold=10,
    return_df=True,
)

In [11]:
# Summary statistics of the `score` column, computed with the builtins and the
# `statistics` module and shown as the cell output.
score_column = df["score"]
summary_functions = {
    "min": min,
    "max": max,
    "median": statistics.median,
    "mean": statistics.mean,
}
metrics = {
    label: summarize(score_column)
    for label, summarize in summary_functions.items()
}
metrics

{'min': 0.0,
 'max': 0.8181818181818182,
 'median': 0.43333333333333335,
 'mean': 0.41360425923792143}

In [12]:
# First ten rows of the table (the recorded output is in ascending score order).
df.head(10)

Unnamed: 0,false,partly,true,score,total
Multiple websites,11,0,0,0.0,11
Social media posts,18,0,0,0.0,18
Various websites,94,2,0,0.010417,96
Viral meme,25,1,0,0.019231,26
YourNewsWire.com,12,1,0,0.038462,13
Viral image,140,16,0,0.051282,156
Facebook user,140,12,6,0.075949,158
Bloggers,288,60,2,0.091429,350
Chain email,95,16,3,0.096491,114
Michele Bachmann,29,8,1,0.131579,38


In [13]:
# Last ten rows of the table (the recorded output is in ascending score order).
df.tail(10)

Unnamed: 0,false,partly,true,score,total
Sherrod Brown,3,17,12,0.640625,32
Lloyd Doggett,0,10,4,0.642857,14
Alex Sink,1,7,5,0.653846,13
Kasim Reed,0,9,4,0.653846,13
Michael McCaul,0,8,4,0.666667,12
Paul Krugman,1,5,5,0.681818,11
Randy Forbes,0,7,4,0.681818,11
Jerry Brown,0,8,5,0.692308,13
Julián Castro,0,7,5,0.708333,12
President Cyril Ramaphosa,3,2,17,0.818182,22


In [14]:
# Row whose index label is the empty string, with all of its columns.
df.loc["", :]

false     62.000000
partly    13.000000
true      18.000000
score      0.263441
total     93.000000
Name: , dtype: float64

In [15]:
# Share of all loaded claims that fall under the empty-string index label.
df.loc["", "total"] / len(claims)

0.007120434882474542

# Combined
---

In [4]:
# Load the Phase 1 and Phase 2 metadata into one list of Claim objects,
# Phase 1 entries first.
# Each file is opened as UTF-8 explicitly: JSON is UTF-8 by specification, and
# the platform default encoding is not UTF-8 everywhere.
claims = []
for metadata_path in (
    "data/phase1/all-data/metadata.json",
    "data/phase2/all-data/metadata.json",
):
    with open(metadata_path, encoding="utf-8") as fi:
        claims.extend(Claim.from_dict(claim) for claim in json.load(fi))

In [10]:
# Summary table for the combined claims; `return_df=True` asks for a DataFrame.
# NOTE(review): the higher `min_threshold=100` presumably keeps only claimants
# with many claims (smallest `total` in the displayed rows is 107) — confirm in
# ClaimantModel.analyze.
df = ClaimantModel.analyze(
    claims,
    min_threshold=100,
    return_df=True,
)

In [11]:
# Summary statistics of the `score` column, computed with the builtins and the
# `statistics` module and shown as the cell output.
score_column = df["score"]
summary_functions = {
    "min": min,
    "max": max,
    "median": statistics.median,
    "mean": statistics.mean,
}
metrics = {
    label: summarize(score_column)
    for label, summarize in summary_functions.items()
}
metrics

{'min': 0.0049504950495049506,
 'max': 0.5524193548387096,
 'median': 0.4297752808988764,
 'mean': 0.3364682398935331}

In [12]:
# Number of rows in the summary table.
df.shape[0]

23

In [13]:
# First ten rows of the table (the recorded output is in ascending score order).
df.head(10)

Unnamed: 0,false,partly,true,score,total
Various websites,200,2,0,0.00495,202
Viral image,255,28,0,0.04947,283
Facebook user,198,21,10,0.08952,229
Bloggers,597,119,6,0.09072,722
Chain email,156,24,5,0.091892,185
Facebook posts,180,60,8,0.153226,248
Donald Trump,1222,665,47,0.196225,1934
,3096,1300,659,0.258952,5055
Mike Pence,48,54,6,0.305556,108
Ted Cruz,76,135,11,0.353604,222


In [14]:
# Last ten rows of the table (the recorded output is in ascending score order).
df.tail(10)

Unnamed: 0,false,partly,true,score,total
Scott Walker,57,138,31,0.442478,226
Marco Rubio,48,158,23,0.445415,229
Rick Perry,52,106,32,0.447368,190
John McCain,54,117,37,0.459135,208
Paul Ryan,19,90,13,0.47541,122
Bernie Sanders,28,157,27,0.497642,212
Hillary Clinton,90,259,88,0.497712,437
Barack Obama,107,438,110,0.50229,655
Charlie Crist,11,78,18,0.53271,107
Jeb Bush,12,87,25,0.552419,124


In [15]:
# Row whose index label is the empty string, with all of its columns.
df.loc["", :]

false     3096.000000
partly    1300.000000
true       659.000000
score        0.258952
total     5055.000000
Name: , dtype: float64

In [16]:
# Share of all loaded claims that fall under the empty-string index label.
df.loc["", "total"] / len(claims)

0.17664942689404528

In [17]:
# Plain arithmetic mean of the `score` column: builtin `sum` over the column
# divided by the number of rows.
score_total = sum(df["score"])
score_total / len(df)

0.336468239893533

# Classifier
---

In [2]:
# Training set: all Phase 1 claims followed by the Phase 2 training split.
# Each file is opened as UTF-8 explicitly: JSON is UTF-8 by specification, and
# the platform default encoding is not UTF-8 everywhere.
train_claims = []
for train_path in (
    "data/phase1/all-data/metadata.json",
    "data/phase2/train-data/metadata.json",
):
    with open(train_path, encoding="utf-8") as fi:
        train_claims.extend(Claim.from_dict(claim) for claim in json.load(fi))

# Evaluation set: the Phase 2 test split only.
with open("data/phase2/test-data/metadata.json", encoding="utf-8") as fi:
    test_claims = [Claim.from_dict(claim) for claim in json.load(fi)]

In [3]:
# Fit a fresh claimant model on the training claims; only the claims are
# passed, so every other `train` argument keeps its default.
model = ClaimantModel()
model.train(train_claims)

In [4]:
# Run the trained model over every test claim and collect the gold labels in
# the same order, so the two lists line up index by index.
predictions = list(map(model.predict, test_claims))
labels = [test_claim.label for test_claim in test_claims]

In [5]:
# Evaluate on every test claim. A prediction of None (a non-hit) falls back to
# class 1 (partly true); otherwise the predicted class is the argmax.
_predictions = [1 if p is None else np.argmax(p) for p in predictions]
full_report = classification_report(labels, _predictions)
print(full_report)

precision    recall  f1-score   support

           0       0.68      0.39      0.49       233
           1       0.58      0.89      0.70       338
           2       1.00      0.01      0.02        83

    accuracy                           0.60       654
   macro avg       0.75      0.43      0.41       654
weighted avg       0.67      0.60      0.54       654



In [6]:
# Evaluate only the test claims whose claimant was found, i.e. those for which
# the model returned a prediction instead of None.
_predictions = []
_labels = []
for gold_label, probs in zip(labels, predictions):
    if probs is None:
        continue
    _predictions.append(np.argmax(probs))
    _labels.append(gold_label)
print(classification_report(_labels, _predictions))

precision    recall  f1-score   support

           0       0.68      0.62      0.65       144
           1       0.65      0.81      0.72       205
           2       1.00      0.03      0.05        40

    accuracy                           0.66       389
   macro avg       0.78      0.49      0.48       389
weighted avg       0.70      0.66      0.63       389



In [7]:
# Re-train a fresh model with an extra positional argument, then repeat the
# hits-only evaluation on the test claims.
# NOTE(review): the second argument to `train` is the builtin `min` function
# itself, not a number — this looks like an unfinished `min_threshold=...`;
# confirm against ClaimantModel.train. The report recorded for this cell is
# identical to the hits-only report of the previous cell.
model = ClaimantModel()
model.train(train_claims, min)

predictions = [model.predict(claim) for claim in test_claims]
labels = [claim.label for claim in test_claims]


# only predictions where the claimant was found (prediction is not None)
_predictions = [np.argmax(p) for p in predictions if p is not None]
_labels = [l for l, p in zip(labels, predictions) if p is not None]
print(classification_report(_labels, _predictions))

precision    recall  f1-score   support

           0       0.68      0.62      0.65       144
           1       0.65      0.81      0.72       205
           2       1.00      0.03      0.05        40

    accuracy                           0.66       389
   macro avg       0.78      0.49      0.48       389
weighted avg       0.70      0.66      0.63       389



In [24]:
# Spot-check: score the first training claim with the trained model.
model.score(train_claims[0])

0.25905761235398933