In [1]:
from valerie.utils import get_logger
from valerie.modeling import ClaimantModel
from valerie.datasets import Phase1Dataset, Phase2Dataset, Phase2TrialDataset, Phase2Validation500Dataset, Phase2Validation100Dataset, combine_datasets_claims



In [2]:
# Obtain a logger from the project's helper. It is not referenced again in the
# cells visible here; presumably get_logger() also configures log output for
# the `valerie.*` messages printed below — TODO confirm.
logger = get_logger()

In [3]:
# Load every labelled corpus in a fixed order (Phase 2 sets first, Phase 1
# last) and merge them into one claims collection. Only Phase2Dataset is given
# an explicit metadata path; the other loaders are called with their defaults.
phase2_metadata_path = "data/phase2-0/raw/metadata.json"
datasets = [
    Phase2Dataset.from_raw(phase2_metadata_path),
    Phase2TrialDataset.from_raw(),
    Phase2Validation500Dataset.from_raw(),
    Phase1Dataset.from_raw(),
]
claims = combine_datasets_claims(datasets)

HBox(children=(FloatProgress(value=0.0, description='Phase2Dataset to claims', max=13130.0, style=ProgressStyl…


[2020-07-19 22:14:39,082] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-19 22:14:39,090] INFO:valerie.datasets: Phase2Dataset claims set change 13130 --> 13130


HBox(children=(FloatProgress(value=0.0, description='Phase2TrialDataset to claims', style=ProgressStyle(descri…


[2020-07-19 22:14:39,158] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-19 22:14:39,158] INFO:valerie.datasets: Phase2TrialDataset claims set change 100 --> 100


HBox(children=(FloatProgress(value=0.0, description='Phase2Validation500Dataset to claims', max=500.0, style=P…


[2020-07-19 22:14:39,365] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-19 22:14:39,365] INFO:valerie.datasets: Phase2Validation500Dataset claims set change 500 --> 500


HBox(children=(FloatProgress(value=0.0, description='Phase1Dataset to claims', max=15555.0, style=ProgressStyl…


[2020-07-19 22:14:45,185] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-19 22:14:45,195] INFO:valerie.datasets: Phase1Dataset claims set change 15555 --> 15555
[2020-07-19 22:14:45,198] INFO:valerie.data: ... combining claims ...
[2020-07-19 22:14:45,206] INFO:valerie.data: Phase2Dataset: 0 --> 13130 (+ 13130 = 13130 - 0)
[2020-07-19 22:14:45,206] INFO:valerie.data: Phase2TrialDataset: 13130 --> 13230 (+ 100 = 100 - 0)
[2020-07-19 22:14:45,207] INFO:valerie.data: Phase2Validation500Dataset: 13230 --> 13730 (+ 500 = 500 - 0)
[2020-07-19 22:14:45,222] INFO:valerie.data: Phase1Dataset: 13730 --> 20647 (+ 6917 = 15555 - 8638)


In [18]:
# Per-claimant label statistics as a DataFrame (the displayed columns are
# false / partly / true / score / total, indexed by claimant name).
# NOTE(review): min_threshold=5 presumably drops claimants with too few claims;
# confirm the exact cutoff semantics in ClaimantModel.analyze.
df = ClaimantModel.analyze(claims, min_threshold=5, return_df=True)

In [19]:
# First 20 rows; in the captured output these are the lowest-scoring claimants.
df.head(20)

Unnamed: 0,false,partly,true,score,total
thelastlineofdefense.org,6,0,0,0.0,6
viral video,7,0,0,0.0,7
multiple websites,12,0,0,0.0,12
ourlandofthefree.com,6,0,0,0.0,6
donald trump jr.,6,0,0,0.0,6
afrikan daily,6,0,0,0.0,6
facebook page,6,0,0,0.0,6
various websites,123,2,0,0.008,125
viral meme,34,1,0,0.014286,35
social media posts,46,2,0,0.020833,48


In [20]:
# Last 20 rows; in the captured output these are the highest-scoring claimants.
df.tail(20)

Unnamed: 0,false,partly,true,score,total
alex sink,1,7,5,0.653846,13
michelle nunn,0,4,2,0.666667,6
michael steele,2,0,4,0.666667,6
pete gallego,0,4,2,0.666667,6
michael mccaul,0,8,4,0.666667,12
gwen graham,0,4,2,0.666667,6
bob corker,0,4,2,0.666667,6
randy forbes,0,7,4,0.681818,11
paul krugman,1,5,5,0.681818,11
jim webb,0,5,3,0.6875,8


In [21]:
# Generic, non-attributable claimant labels (anonymous websites, viral content,
# social-media posts, chain messages) to look up in the claimant statistics.
# The order is kept as-is because it determines the order of the printout.
clickbait_probable_claimants = [
    "multiple websites", "various websites",
    "viral meme", "social media posts", "viral image",
    "facebook user",
    "chain email", "chain message",
    "instagram posts", "bloggers",
    "facebook posts", "tweets", "facebook post",
]

In [22]:
# Print the statistics row for each generic claimant label.
# A label missing from df's index raises KeyError on .loc; presumably that
# means it was filtered out by the min_threshold passed to analyze().
# Only KeyError is caught, so unrelated failures (an undefined name, a kernel
# interrupt, a typo) surface instead of being reported as "not enough examples".
for clm in clickbait_probable_claimants:
    print(clm)
    try:
        print(df.loc[clm])
    except KeyError:
        print("not enough examples")
    print()

multiple websites
false     12.0
partly     0.0
true       0.0
score      0.0
total     12.0
Name: multiple websites, dtype: float64

various websites
false     123.000
partly      2.000
true        0.000
score       0.008
total     125.000
Name: various websites, dtype: float64

viral meme
false     34.000000
partly     1.000000
true       0.000000
score      0.014286
total     35.000000
Name: viral meme, dtype: float64

social media posts
false     46.000000
partly     2.000000
true       0.000000
score      0.020833
total     48.000000
Name: social media posts, dtype: float64

viral image
false     178.000000
partly     22.000000
true        1.000000
score       0.059701
total     201.000000
Name: viral image, dtype: float64

facebook user
false     142.000
partly     12.000
true        6.000
score       0.075
total     160.000
Name: facebook user, dtype: float64

chain email
false     106.000
partly     16.000
true        3.000
score       0.088
total     125.000
Name: chain email,

In [24]:
# Fit the claimant model on the combined claims.
# NOTE(review): min_threshold=10 here differs from the min_threshold=5 used for
# the exploratory analyze() call above — confirm the stricter cutoff is intended.
claimant_model = ClaimantModel()
claimant_model.train(claims, min_threshold=10)

In [25]:
# Spot-check one high-volume claimant; the entry is a dict holding the
# false / partly / true counts plus the derived score and total.
claimant_model.model["donald trump"]

{'false': 872,
 'partly': 474,
 'true': 30,
 'score': 0.19404069767441862,
 'total': 1376}

In [26]:
# Persist the trained model as JSON; the upload cell below reads this same path.
claimant_model.save_pretrained("models/claimant_model_v3.json")

In [27]:
# Upload the saved model file to the project's GCS bucket (needs gsutil installed
# and authenticated). The local path must match the one used in save_pretrained.
!gsutil cp models/claimant_model_v3.json gs://valerie-bucket/models/claimant_model_v3.json

Copying file://models/claimant_model_v3.json [Content-Type=application/json]...
- [1 files][ 18.6 KiB/ 18.6 KiB]                                                
Operation completed over 1 objects/18.6 KiB.                                     
