# leaders/setup
---

In [98]:
import os
import json
import datetime
import collections
import multiprocessing

import spacy
import tldextract
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from valerie.data import Claim, claims_from_phase1, claims_from_phase2

In [None]:
nlp = spacy.load("en_core_web_lg")

In [2]:
claims1 = list(claims_from_phase1("data/phase1/raw/metadata.json").values())
claims2 = list(claims_from_phase2("data/phase2-1/raw/metadata.json").values())

loading claims from phase1: 100%|██████████| 15555/15555 [00:02<00:00, 6613.93it/s]
loading claims from phase2: 100%|██████████| 13061/13061 [00:01<00:00, 8078.18it/s]


In [3]:
def label_stats(labels):
    print("total:", len(labels))
    print()
    for k, v in collections.Counter(labels).most_common():
        print(str(v) + "\t" + str(k))
    print()

In [41]:
class StrictClaimCompare:
    def __init__(self, claim):
        self.claim = claim

    def __eq__(self, other):
        return self.claim.claim == other.claim.claim
    
    def __hash__(self):
        return hash(self.claim.claim)
        

def strict_claims_overlap(c1, c2, cmp=StrictClaimCompare):
    c1_set = set([cmp(c) for c in c1])
    print(len(c1), len(c1_set))
    
    c2_set = set([cmp(c) for c in c2])
    print(len(c2), len(c2_set))
    
    overall_set = c1_set | c2_set
    print(len(c1_set) + len(c2_set), "-", (len(c1_set) + len(c2_set)) - len(overall_set), "=", len(overall_set))
    
    return [c.claim for c in overall_set]

In [5]:
def _spacify(c):
    return c, nlp(c.claim)
    
def spacify(claims, nproc=1):
    pool = multiprocessing.Pool(nproc)
    for c, _nlp in tqdm(pool.imap_unordered(_spacify, claims), total=len(claims), desc="spacify"):
        c.support = _nlp

def spacy_claims_overlap(c1, c2):
    scores = []
    
    for _c1 in tqdm(c1, position=1, leave=False):
        for _c2 in tqdm(c2, position=0, leave=False):
            scores.append(_c1.support.similarity(_c2.support))
            
    return scores

In [6]:
label_stats([claim.label for claim in claims1])

total: 15555

7408	0
6451	1
1696	2



In [7]:
label_stats([claim.label for claim in claims2])

total: 13061

6757	1
4648	0
1656	2



In [39]:
leaders_claims = strict_claims_overlap(claims1, claims2)

28616 - 8609 = 20007


In [None]:
# spacify(claims1[:100], nproc=6)

In [None]:
# spacify(claims2, nproc=6)

In [None]:
# claims2[0].support

In [None]:
# _scr = spacy_claims_overlap(claims1[:100], claims2[:200])

# 2016-10-facebook-fact-check
---

In [9]:
ffc_df = pd.read_csv("data/external/2016-10-facebook-fact-check/data/facebook-fact-check.csv")
label_stats(ffc_df["Rating"])
ffc_df.head()

total: 2282

1669	mostly true
264	no factual content
245	mixture of true and false
104	mostly false



Unnamed: 0,account_id,post_id,Category,Page,Post URL,Date Published,Post Type,Rating,Debate,share_count,reaction_count,comment_count
0,184096565021911,1035057923259100,mainstream,ABC News Politics,https://www.facebook.com/ABCNewsPolitics/posts...,2016-09-19,video,no factual content,,,146.0,15.0
1,184096565021911,1035269309904628,mainstream,ABC News Politics,https://www.facebook.com/ABCNewsPolitics/posts...,2016-09-19,link,mostly true,,1.0,33.0,34.0
2,184096565021911,1035305953234297,mainstream,ABC News Politics,https://www.facebook.com/ABCNewsPolitics/posts...,2016-09-19,link,mostly true,,34.0,63.0,27.0
3,184096565021911,1035322636565962,mainstream,ABC News Politics,https://www.facebook.com/ABCNewsPolitics/posts...,2016-09-19,link,mostly true,,35.0,170.0,86.0
4,184096565021911,1035352946562931,mainstream,ABC News Politics,https://www.facebook.com/ABCNewsPolitics/posts...,2016-09-19,video,mostly true,,568.0,3188.0,2815.0


In [167]:
ffc_df.iloc[-1]

account_id                                          114517875225866
post_id                                            1472655802745393
Category                                                       left
Page                                                  The Other 98%
Post URL          https://www.facebook.com/TheOther98/posts/1472...
Date Published                                           2016-09-27
Post Type                                                     photo
Rating                                           no factual content
Debate                                                          NaN
share_count                                                   24499
reaction_count                                                47312
comment_count                                                  1375
Name: 2281, dtype: object

In [None]:
def facebook_fact_check_2016_10_to_claims(df):
    pass

# 2018-12-fake-news-top-50
---

In [12]:
with open("data/external/2018-12-fake-news-top-50/data/sites_2016.csv") as fi:
    fn_top50_sites_2016 = fi.read().splitlines()
with open("data/external/2018-12-fake-news-top-50/data/sites_2017.csv") as fi:
    fn_top50_sites_2017 = fi.read().splitlines()
with open("data/external/2018-12-fake-news-top-50/data/sites_2018.csv") as fi:
    fn_top50_sites_2018 = fi.read().splitlines()
print(len(fn_top50_sites_2016))
print(len(fn_top50_sites_2017))
print(len(fn_top50_sites_2018))

fn_top50_sites_set = set(fn_top50_sites_2016 + fn_top50_sites_2017 + fn_top50_sites_2018)
print(len(fn_top50_sites_set))

97
168
130
258


In [35]:
fn_top50_df = pd.read_csv("data/external/2018-12-fake-news-top-50/data/top_2018.csv")
print(len(fn_top50_df))
fn_top50_df.head()

13961


Unnamed: 0,title,url,fb_engagement,published_date,category,source
0,"Lottery winner arrested for dumping $200,000 o...",https://worldnewsdailyreport.com/lottery-winne...,2383021.0,2018-05-13,Crime,
1,Former first lady Barbara Bush dies at 92 - CNN,http://breaking-cnn.com/former-first-lady-barb...,2290000.0,2018-04-16,Politics,Trendolizer
2,Woman sues Samsung for $1.8M after cell phone ...,https://worldnewsdailyreport.com/woman-sues-sa...,1304430.0,2018-09-19,Medical,
3,BREAKING: Michael Jordan Resigns From The Boar...,https://trumpbetrayed.us/all/breaking-michael-...,911336.0,2018-09-05,Business,
4,Donald Trump Ends School Shootings By Banning ...,https://www.8shit.net/donald-trump-ends-school...,830116.0,2018-02-23,Politics,


In [31]:
def fake_news_top_50_to_claims(df):
    claims = []
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            claims.append(Claim(
                i,
                claim=row["title"],
                date=row["published_date"],
                claimant="facebook",
                label=0,
            ))
        except:
            continue
    return claims

In [33]:
fn_top50_claims = fake_news_top_50_to_claims(fn_top50_df)
print(len(fn_top50_claims))

100%|██████████| 13961/13961 [00:02<00:00, 5280.04it/s]

13960





In [36]:
fn_top50_claims[100]

{
  "id": 100,
  "claim": "Florida Man Arrested For Hanging On Traffic Light And Sh*tting On Cars Passing Underneath",
  "claimant": "facebook",
  "label": 0,
  "date": "2018-08-13",
  "related_articles": null,
  "explanation": null,
  "support": null
}

In [42]:
_ = strict_claims_overlap(leaders_claims, fn_top50_claims)

20007 20007
13960 12372
32379 - 0 = 32379


# ad_fontes
---

In [None]:
ad_fontes_df = pd.read_csv("data/external/ad_fontes/Interactive Media Bias Chart - Ad Fontes Media.csv")
ad_fontes_df.head()

In [None]:
print(len(ad_fontes_df))
collections.Counter(ad_fontes_df["Source"]).most_common()[:5]

# clmentbisaillon (same as ISOT!)
---

In [None]:
# clmentbisaillon_fake_df = pd.read_csv("data/external/clmentbisaillon/Fake.csv")
# clmentbisaillon_true_df = pd.read_csv("data/external/clmentbisaillon/True.csv")
# print("fake:", len(clmentbisaillon_fake_df))
# print("true:", len(clmentbisaillon_true_df))
# clmentbisaillon_fake_df.head()

# fake_news_finder
---

In [None]:
# todo

# fake-news (kaggle)
---

In [43]:
fn_kaggle_train_df = pd.read_csv("data/external/fake-news/train.csv")
label_stats(fn_kaggle_train_df["label"])
fn_kaggle_train_df.head()

total: 20800

10413	1
10387	0



Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [48]:
fn_kaggle_test_df = pd.read_csv("data/external/fake-news/test.csv")
print(len(fn_kaggle_test_df))
fn_kaggle_test_df.head()

5200


Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [51]:
def fn_kaggle_to_claims(df):
    claims = []
    for i, row in df.iterrows():
        try:
            claims.append(Claim(
                row["id"], 
                claim=row["title"],
                claimant=row["author"], 
                label=0 if row["label"] else 2
            ))
        except:
            continue
    return claims

In [53]:
fn_kaggle_claims = fn_kaggle_to_claims(fn_kaggle_train_df)
print(len(fn_kaggle_claims))

18285


In [54]:
_ = strict_claims_overlap(leaders_claims, fn_kaggle_claims)

20007 20007
18285 17931
37938 - 0 = 37938


# fake-news-pair-classification-challenge
---

In [57]:
fnpcc_train_df = pd.read_csv("data/external/fake-news-pair-classification-challenge/train.csv")
label_stats(fnpcc_train_df["label"])
fnpcc_train_df.head()

total: 320552

219313	unrelated
92973	agreed
8266	disagreed



Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
0,0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
2,1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
3,2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP topped Hong Kong last year? She...,unrelated
4,9,6,7,"""用大蒜鉴别地沟油的方法,怎么鉴别地沟油",吃了30年食用油才知道，一片大蒜轻松鉴别地沟油,"""How to discriminate oil from gutter oil by me...",It took 30 years of cooking oil to know that o...,agreed


# FakeNewsNet
---

In [55]:
fake_news_net_politifact_fake = pd.read_csv("data/external/FakeNewsNet/dataset/politifact_fake.csv")
fake_news_net_politifact_real = pd.read_csv("data/external/FakeNewsNet/dataset/politifact_real.csv")
print("fake:", len(fake_news_net_politifact_fake))
print("real:", len(fake_news_net_politifact_real))
fake_news_net_politifact_fake.head()

fake: 432
real: 624


Unnamed: 0,id,news_url,title,tweet_ids
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...


In [56]:
fake_news_net_gossipcop_fake = pd.read_csv("data/external/FakeNewsNet/dataset/gossipcop_fake.csv")
fake_news_net_gossipcop_real = pd.read_csv("data/external/FakeNewsNet/dataset/gossipcop_real.csv")
print("fake:", len(fake_news_net_gossipcop_fake))
print("real:", len(fake_news_net_gossipcop_real))
fake_news_net_gossipcop_fake.head()

fake: 5323
real: 16817


Unnamed: 0,id,news_url,title,tweet_ids
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,284329075902926848\t284332744559968256\t284335...
1,gossipcop-4580247171,hollywoodlife.com/2018/05/05/paris-jackson-car...,Paris Jackson & Cara Delevingne Enjoy Night Ou...,992895508267130880\t992897935418503169\t992899...
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,853359353532829696\t853359576543920128\t853359...
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,988821905196158981\t988824206556172288\t988825...
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,955792793632432131\t955795063925301249\t955798...


In [66]:
def fake_news_net_to_claims(df_fake, df_real):
    claims = []
    
    for i, row in df_fake.iterrows():
        try:
            claims.append(Claim(
                row["id"], 
                claim=row["title"],
                claimant=tldextract.extract(row["news_url"]).domain,
                label=0
            ))
        except:
            continue
            
    for i, row in df_real.iterrows():
        try:
            claims.append(Claim(
                row["id"], 
                claim=row["title"],
                claimant=tldextract.extract(row["news_url"]).domain,
                label=2
            ))
        except:
            continue
            
    return claims

In [68]:
fake_news_net_politifact_claims = fake_news_net_to_claims(fake_news_net_politifact_fake, fake_news_net_politifact_real)
print(len(fake_news_net_politifact_claims))
print()
_ = strict_claims_overlap(leaders_claims, fake_news_net_politifact_claims)

995

20007 20007
995 925
20932 - 3 = 20929


In [69]:
fake_news_net_gossipcop_claims = fake_news_net_to_claims(fake_news_net_gossipcop_fake, fake_news_net_gossipcop_real)
print(len(fake_news_net_gossipcop_claims))
print()
_ = strict_claims_overlap(leaders_claims, fake_news_net_gossipcop_claims)

21871

20007 20007
21871 20475
40482 - 0 = 40482


# fnc-1
---

In [58]:
fnc1_stances_df = pd.read_csv("data/external/fnc-1/train_stances.csv")
label_stats(fnc1_stances_df["Stance"])
fnc1_stances_df.head()

total: 49972

36545	unrelated
8909	discuss
3678	agree
840	disagree



Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [59]:
fnc1_bodies_df = pd.read_csv("data/external/fnc-1/train_bodies.csv")
fnc1_bodies_df.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


# george-mcintires
---

In [85]:
george_mcintires_df = pd.read_csv("data/external/george-mcintires/fake_or_real_news.csv", skiprows=1, names=["id", "title", "text", "label"])
label_stats(george_mcintires_df["label"])
george_mcintires_df.head()

total: 6335

3171	REAL
3164	FAKE



Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [99]:
def george_mcintires_to_claims(df):
    claims = []
    for i, row in df.iterrows():
        claims.append(Claim(
            row["id"], 
            claim=row["title"],
            label=0 if row["label"] == "FAKE" else 1
        ))
    return claims

In [100]:
george_mcintires_claims = george_mcintires_to_claims(george_mcintires_df)
print(len(george_mcintires_claims))
print()
_ = strict_claims_overlap(leaders_claims, george_mcintires_claims)

6335

20007 20007
6335 6256
26263 - 0 = 26263


# ISOT
---

In [96]:
isot_fake_df = pd.read_csv("data/external/ISOT/Fake.csv")
isot_fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [109]:
isot_true_df = pd.read_csv("data/external/ISOT/True.csv")
isot_true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [131]:
print("fake:", len(isot_fake_df))
print("true:", len(isot_true_df))

fake: 23481
true: 21417


In [138]:
def isot_to_claims(df_fake, df_true):
    claims = []

    for i, row in df_fake.iterrows():
        try: # December 31, 2017
            _date = datetime.datetime.strptime(row["date"], "%B %d, %Y")
        except: # 19-Feb-18
            try:
                _date = datetime.datetime.strptime(row["date"], "%d-%b-%y")
            except: # Dec 31, 2017
                try:
                    _date = datetime.datetime.strptime(row["date"], "%b %d, %Y")
                except:
                    _date = None
            
        
        claims.append(Claim(
            i, 
            claim=row["title"],
            date=_date.strftime("%Y-%m-%d") if _date else None,
            label=0
        ))
            
    for i, row in df_true.iterrows():
        try: # December 31, 2017
            _date = datetime.datetime.strptime(row["date"], "%B %d, %Y")
        except: # 19-Feb-18
            try:
                _date = datetime.datetime.strptime(row["date"], "%d-%b-%y")
            except: # Dec 31, 2017
                try:
                    _date = datetime.datetime.strptime(row["date"], "%b %d, %Y")
                except:
                    _date = None
            
        claims.append(Claim(
            i, 
            claim=row["title"],
            date=_date.strftime("%Y-%m-%d") if _date else None,
            label=2
        ))
    
    return claims

In [139]:
isot_claims = isot_to_claims(isot_fake_df, isot_true_df)
print(len(isot_claims))
print()
_ = strict_claims_overlap(leaders_claims, isot_claims)

44898

20007 20007
44898 38726
58733 - 0 = 58733


# liar
---


In [143]:
liar_train_df = pd.read_csv("data/external/liar/train.tsv", sep="\t", names=[
    "id",
    "label",
    "statement",
    "subject(s)",
    "speaker",
    "speaker's job title",
    "state info",
    "party affiliation",
    "total credit history count",
    "barely true counts",
    "false counts",
    "half true counts",
    "mostly true counts",
    "context (venue/location of speech or statement)",
    "pants on fire counts",
])
label_stats(liar_train_df["label"])
liar_train_df.head()

total: 10240

2114	half-true
1995	false
1962	mostly-true
1676	true
1654	barely-true
839	pants-fire



Unnamed: 0,id,label,statement,subject(s),speaker,speaker's job title,state info,party affiliation,total credit history count,barely true counts,false counts,half true counts,mostly true counts,context (venue/location of speech or statement),pants on fire counts
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,


In [150]:
def liar_to_claims(df):
    claims = []
    for i, row in df.iterrows():
        if row["label"] == "false":
            _lab = 0
        elif row["label"] == "true":
            _lab = 2
        else:
            _lab = 2
        claims.append(Claim(
            row["id"], 
            claim=row["statement"],
            claimant=row["speaker"] if isinstance(row["speaker"], str) else None,
            label=_lab
        ))
    return claims

In [151]:
liar_claims = liar_to_claims(liar_train_df)
print(len(liar_claims))
print()
_ = strict_claims_overlap(leaders_claims, liar_claims)

10240

20007 20007
10240 10223
30230 - 899 = 29331


# mrisdal
---

In [158]:
mrisdal_df = pd.read_csv("data/external/mrisdal/fake.csv")
label_stats(mrisdal_df["type"])
mrisdal_df.head()

total: 12999

11492	bs
443	bias
430	conspiracy
246	hate
146	satire
121	state
102	junksci
19	fake



Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
0,6a175f46bcd24d39b3e962ad0f29936721db70db,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,25689.0,Muslims BUSTED: They Stole Millions In Gov’t B...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
1,2bdc29d12605ef9cf3f09f9875040a7113be5d5b,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,25689.0,Re: Why Did Attorney General Loretta Lynch Ple...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
2,c70e149fdd53de5e61c29281100b9de0ed268bc3,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,25689.0,BREAKING: Weiner Cooperating With FBI On Hilla...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
3,7cf7c15731ac2a116dd7f629bd57ea468ed70284,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,25689.0,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
4,0206b54719c7e241ffe0ad4315b808290dbe6c0f,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,25689.0,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias


In [164]:
def mrisdal_to_claims(df):
    claims = []
    for i, row in df.iterrows():
        try:
            if row["ord_in_thread"] != 0:
                continue
            claims.append(Claim(
                i, 
                claim=row["title"],
                claimant=row["site_url"],
                date=datetime.datetime.strptime(row["published"].split("T")[0], "%Y-%m-%d").strftime("%Y-%m-%d"),
                label=0
            ))
        except:
            continue
    return claims

In [165]:
mrisdal_claims = mrisdal_to_claims(mrisdal_df)
print(len(mrisdal_claims))
print()
_ = strict_claims_overlap(leaders_claims, mrisdal_claims)

12319

20007 20007
12319 11698
31705 - 0 = 31705
