In [23]:
from valerie.utils import get_logger
from valerie.modeling import ClaimantModel
from valerie.datasets import Phase2Dataset, Phase2TrialDataset, Phase2Validation500Dataset, Phase2Validation100Dataset, combine_datasets_claims

In [24]:
# Obtain the project logger from valerie.utils.
# NOTE(review): `logger` is not referenced by any later cell visible here;
# presumably this call also configures the INFO log lines seen in cell outputs — confirm.
logger = get_logger()

In [None]:
# Load the three labelled Phase 2 sources. Only the main dataset is given an
# explicit metadata path; the other two are loaded with from_raw() defaults.
datasets = [
    Phase2Dataset.from_raw("data/phase2-0/raw/metadata.json"),
    Phase2TrialDataset.from_raw(),
    Phase2Validation500Dataset.from_raw(),
]

# Merge the claims of all loaded datasets into a single collection.
claims = combine_datasets_claims(datasets)

In [4]:
# Per-claimant label statistics for the combined claims, returned as a DataFrame.
# The displayed frames below are indexed by claimant with columns
# false / partly / true / score / total.
df = ClaimantModel.analyze(claims, return_df=True)

In [23]:
# First 20 rows of the claimant table (the lowest-scoring claimants in the
# recorded output).
df.head(20)

Unnamed: 0,false,partly,true,score,total
multiple websites,12,0,0,0.0,12
various websites,106,2,0,0.009259,108
viral meme,28,1,0,0.017241,29
social media posts,41,2,0,0.023256,43
yournewswire.com,12,1,0,0.038462,13
viral image,161,21,1,0.062842,183
facebook user,142,12,6,0.075,160
chain email,99,16,3,0.09322,118
bloggers,298,64,2,0.093407,364
louie gohmert,9,3,0,0.125,12


In [7]:
# Last 20 rows of the claimant table (the highest-scoring claimants in the
# recorded output).
df.tail(20)

Unnamed: 0,false,partly,true,score,total
gavin newsom,2,11,5,0.583333,18
rob portman,4,17,9,0.583333,30
amy klobuchar,1,8,3,0.583333,12
joe manchin,2,8,5,0.6,15
mark warner,1,12,5,0.611111,18
sheldon whitehouse,2,8,6,0.625,16
moveon.org,2,5,5,0.625,12
dennis kucinich,3,6,7,0.625,16
michael bloomberg,1,6,4,0.636364,11
andrew cuomo,3,12,10,0.64,25


In [24]:
# Generic / anonymous claimant labels to inspect as probable clickbait sources.
# Order is kept as-is because the lookup loop prints results in list order.
clickbait_probable_claimants = [
    "multiple websites", "various websites", "viral meme", "social media posts",
    "viral image", "facebook user", "chain email", "chain message",
    "instagram posts", "bloggers", "facebook posts", "tweets", "facebook post",
]

In [28]:
# Print the score of each probable-clickbait claimant, in list order.
# Claimants absent from `df` are reported instead of raising KeyError:
# the recorded output has fewer values than the list has entries, so some
# labels are evidently not present in the table.
for clm in clickbait_probable_claimants:
    if clm in df.index:
        # Single .loc call with (row, column) instead of chained indexing.
        print(df.loc[clm, "score"])
    else:
        print(f"{clm}: not enough examples")

0.0
0.009259259259259259
0.017241379310344827
0.023255813953488372
0.06284153005464481
0.075
0.09322033898305085
0.09340659340659341
0.15767634854771784
0.23076923076923078
0.2619047619047619


# Validation: per-claimant statistics on the Phase 2 validation sets

In [12]:
# Per-claimant statistics for the 100-claim validation set.
# min_threshold=0 presumably keeps every claimant, however few claims it has
# (the recorded table includes claimants with total == 1) — confirm.
df_val_100 = ClaimantModel.analyze(
    Phase2Validation100Dataset.from_raw().claims,
    min_threshold=0,
    return_df=True,
)

HBox(children=(FloatProgress(value=0.0, description='Phase2Validation100Dataset to claims', style=ProgressStyl…


[2020-07-17 17:37:17,333] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-17 17:37:17,333] INFO:valerie.datasets: Phase2Validation100Dataset claims set change 100 --> 100


In [13]:
# Display the per-claimant table for the 100-claim validation set.
df_val_100

Unnamed: 0,false,partly,true,score,total
lincoln project,1,0,0,0.0,1
unidentified iranian man,1,0,0,0.0,1
mitch mcconnell,1,0,0,0.0,1
youtube video,1,0,0,0.0,1
eric trump,2,0,0,0.0,2
multiple sources,2,0,0,0.0,2
chain message,1,0,0,0.0,1
national rifle association,1,0,0,0.0,1
donald trump 2020 voters,1,0,0,0.0,1
dmitry peskov,1,0,0,0.0,1


In [15]:
# Per-claimant statistics for the 500-claim validation set.
# min_threshold=0 presumably keeps every claimant, however few claims it has
# (the recorded table includes claimants with total == 1) — confirm.
df_val_500 = ClaimantModel.analyze(
    Phase2Validation500Dataset.from_raw().claims,
    min_threshold=0,
    return_df=True,
)

HBox(children=(FloatProgress(value=0.0, description='Phase2Validation500Dataset to claims', max=500.0, style=P…


[2020-07-17 17:37:40,511] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-17 17:37:40,512] INFO:valerie.datasets: Phase2Validation500Dataset claims set change 500 --> 500


In [20]:
# Display the per-claimant table for the 500-claim validation set.
df_val_500

Unnamed: 0,false,partly,true,score,total
jerome adams,1,0,0,0.0,1
meme,1,0,0,0.0,1
jim humble,1,0,0,0.0,1
unidentified iranian man,1,0,0,0.0,1
mitch mcconnell,1,0,0,0.0,1
...,...,...,...,...,...
darren jackson,0,0,1,1.0,1
president ramaphosa,0,0,1,1.0,1
ron johnson,0,0,1,1.0,1
south african government,0,0,1,1.0,1


# Claimant model v2: train, save, and compare with the earlier saved model

In [3]:
# Load the three labelled Phase 2 sources used to train the v2 claimant model.
# Only the main dataset is given an explicit metadata path; the other two are
# loaded with from_raw() defaults. (Same list as the loading cell earlier in
# this notebook.)
datasets = [
    Phase2Dataset.from_raw("data/phase2-0/raw/metadata.json"),
    Phase2TrialDataset.from_raw(),
    Phase2Validation500Dataset.from_raw(),
]

HBox(children=(FloatProgress(value=0.0, description='Phase2Dataset to claims', max=13130.0, style=ProgressStyl…


[2020-07-18 04:47:19,583] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-18 04:47:19,590] INFO:valerie.datasets: Phase2Dataset claims set change 13130 --> 13130


HBox(children=(FloatProgress(value=0.0, description='Phase2TrialDataset to claims', style=ProgressStyle(descri…


[2020-07-18 04:47:19,660] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-18 04:47:19,660] INFO:valerie.datasets: Phase2TrialDataset claims set change 100 --> 100


HBox(children=(FloatProgress(value=0.0, description='Phase2Validation500Dataset to claims', max=500.0, style=P…


[2020-07-18 04:47:19,886] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-18 04:47:19,887] INFO:valerie.datasets: Phase2Validation500Dataset claims set change 500 --> 500


In [4]:
# Merge the claims of all loaded datasets into one collection
# (the log below reports 13130 + 100 + 500 = 13730 claims).
claims = combine_datasets_claims(datasets)

[2020-07-18 04:47:30,766] INFO:valerie.data: ... combining claims ...
[2020-07-18 04:47:30,775] INFO:valerie.data: Phase2Dataset: 0 --> 13130 (+ 13130 = 13130 - 0)
[2020-07-18 04:47:30,776] INFO:valerie.data: Phase2TrialDataset: 13130 --> 13230 (+ 100 = 100 - 0)
[2020-07-18 04:47:30,777] INFO:valerie.data: Phase2Validation500Dataset: 13230 --> 13730 (+ 500 = 500 - 0)


In [19]:
# Build a fresh claimant model and fit it on the combined claims.
claimant_model = ClaimantModel()
# NOTE(review): min_threshold=10 presumably drops claimants with too few claims —
# the lookup cell below falls back to "not enough examples" for labels missing
# from the trained model. Confirm the exact cutoff semantics (>, >=) in ClaimantModel.
claimant_model.train(claims, min_threshold=10)

In [20]:
# Generic / anonymous claimant labels to look up in the trained model.
# Same entries, in the same order, as the list defined earlier in this notebook.
clickbait_probable_claimants = [
    "multiple websites", "various websites", "viral meme", "social media posts",
    "viral image", "facebook user", "chain email", "chain message",
    "instagram posts", "bloggers", "facebook posts", "tweets", "facebook post",
]

In [21]:
# Print the trained model's entry for each probable-clickbait claimant.
# Labels with no entry in the model are reported as "not enough examples".
for clm in clickbait_probable_claimants:
    print(clm)
    try:
        print(claimant_model.model[clm])
    except KeyError:
        # Only a missing key means "claimant not in the model"; any other
        # exception (e.g. NameError, KeyboardInterrupt) should surface.
        print("not enough examples")
    print()

multiple websites
{'false': 12, 'partly': 0, 'true': 0, 'score': 0.0, 'total': 12}

various websites
{'false': 106, 'partly': 2, 'true': 0, 'score': 0.009259259259259259, 'total': 108}

viral meme
{'false': 28, 'partly': 1, 'true': 0, 'score': 0.017241379310344827, 'total': 29}

social media posts
{'false': 41, 'partly': 2, 'true': 0, 'score': 0.023255813953488372, 'total': 43}

viral image
{'false': 161, 'partly': 21, 'true': 1, 'score': 0.06284153005464481, 'total': 183}

facebook user
{'false': 142, 'partly': 12, 'true': 6, 'score': 0.075, 'total': 160}

chain email
{'false': 99, 'partly': 16, 'true': 3, 'score': 0.09322033898305085, 'total': 118}

chain message
not enough examples

instagram posts
not enough examples

bloggers
{'false': 298, 'partly': 64, 'true': 2, 'score': 0.09340659340659341, 'total': 364}

facebook posts
{'false': 172, 'partly': 62, 'true': 7, 'score': 0.15767634854771784, 'total': 241}

tweets
{'false': 7, 'partly': 6, 'true': 0, 'score': 0.23076923076923078, 

In [22]:
# Persist the trained v2 model to a JSON file under models/.
claimant_model.save_pretrained("models/claimant_model_v2.json")

In [None]:
# Upload the saved v2 model file to the gs://valerie-bucket/models location.
# Runs the gsutil CLI through IPython's `!` shell escape.
!gsutil cp models/claimant_model_v2.json gs://valerie-bucket/models

In [50]:
import pandas as pd

In [51]:
# Load the earlier saved model from models/claimant_model.json for comparison.
# This rebinds `claimant_model`, so the v2 model trained above is no longer
# reachable under that name.
# NOTE(review): from_pretrained is invoked through the existing instance; if it
# is a classmethod/staticmethod, ClaimantModel.from_pretrained(...) would avoid
# depending on the earlier training cell — confirm.
claimant_model = claimant_model.from_pretrained("models/claimant_model.json")

In [52]:
# Show the loaded model's entry for one claimant. Keys in this model file are
# capitalised ("Donald Trump"), unlike the lowercase keys of the v2 file.
claimant_model.model["Donald Trump"]

{'false': 1222,
 'partly': 665,
 'true': 47,
 'score': 0.19622543950361945,
 'total': 1934}

In [53]:
# Tabulate the loaded model: orient="index" turns each claimant key into a row
# and the per-claimant dict fields (false/partly/true/score/total) into columns.
pd.DataFrame.from_dict(claimant_model.model, orient="index")

Unnamed: 0,false,partly,true,score,total
Multiple websites,22,0,0,0.000000,22
Social media posts,35,0,0,0.000000,35
TheLastLineOfDefense.org,11,0,0,0.000000,11
Afrikan Daily,12,0,0,0.000000,12
Various websites,200,2,0,0.004950,202
...,...,...,...,...,...
West Virginia Republican Party,0,8,6,0.714286,14
Tom Graves,1,6,7,0.714286,14
Donna Howard,0,6,6,0.750000,12
John Oliver,0,6,6,0.750000,12


In [54]:
# Load the v2 model saved earlier in this notebook, rebinding `claimant_model`
# again (replacing the model loaded from models/claimant_model.json).
# NOTE(review): from_pretrained is invoked through the existing instance; if it
# is a classmethod/staticmethod, ClaimantModel.from_pretrained(...) would avoid
# depending on a previously created instance — confirm.
claimant_model = claimant_model.from_pretrained("models/claimant_model_v2.json")

In [55]:
# Show the v2 entry for the same claimant. v2 keys are lowercase
# ("donald trump"), so the capitalised key used for the other model file
# would not match here.
claimant_model.model["donald trump"]

{'false': 437,
 'partly': 300,
 'true': 25,
 'score': 0.22965879265091863,
 'total': 762}

In [56]:
# Tabulate the v2 model: orient="index" turns each claimant key into a row
# and the per-claimant dict fields (false/partly/true/score/total) into columns.
pd.DataFrame.from_dict(claimant_model.model, orient="index")

Unnamed: 0,false,partly,true,score,total
multiple websites,12,0,0,0.000000,12
various websites,106,2,0,0.009259,108
viral meme,28,1,0,0.017241,29
social media posts,41,2,0,0.023256,43
yournewswire.com,12,1,0,0.038462,13
...,...,...,...,...,...
randy forbes,0,7,4,0.681818,11
paul krugman,1,5,5,0.681818,11
jerry brown,0,8,5,0.692308,13
julián castro,0,7,5,0.708333,12
