# leaders/setup
---

In [1]:
# better id

import os
import json
import logging
import datetime
import collections

import tldextract
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from valerie.data import Claim, claims_from_phase1, claims_from_phase2



In [2]:
_logger = logging.getLogger(__name__)

In [3]:
claims1 = list(claims_from_phase1("data/phase1/raw/metadata.json").values())
claims2 = list(claims_from_phase2("data/phase2-1/raw/metadata.json").values())

loading claims from phase1: 100%|██████████| 15555/15555 [00:02<00:00, 6417.20it/s]
loading claims from phase2: 100%|██████████| 13061/13061 [00:01<00:00, 7608.33it/s]


In [4]:
class StrictClaimCompare:
    def __init__(self, claim):
        self.claim = claim

    def __eq__(self, other):
        return self.claim.claim == other.claim.claim
    
    def __hash__(self):
        return hash(self.claim.claim)
        

def strict_claims_overlap(c1, c2):
    c1_set = set([StrictClaimCompare(c) for c in c1])
    print(len(c1), len(c1_set))
    
    c2_set = set([StrictClaimCompare(c) for c in c2])
    print(len(c2), len(c2_set))
    
    overall_set = c1_set | c2_set
    print(len(c1_set) + len(c2_set), "-", (len(c1_set) + len(c2_set)) - len(overall_set), "=", len(overall_set))
    
    return [c.claim for c in overall_set]

In [5]:
leaders_claims = strict_claims_overlap(claims1, claims2)

15555 15555
13061 13061
28616 - 8609 = 20007


In [6]:
class ExternalDataset:
    def row_to_claim(self, i, row):
        raise NotImplementedError

    def df_to_claims(self):
        claims = []
        misses = 0
        for i, row in tqdm(self.df.iterrows(), total=len(self.df), desc="df to claims"):
            try:
                claims.append(self.row_to_claim(i, row))
            except:
                misses += 1
                continue
        _logger.info("missed row to claim conversions: {}".format(misses))
        return claims

In [7]:
def analyze(dataset):
    print("df len:", len(dataset.df))
    print("claims len:", len(dataset.claims))
    print()
    print("labels:")
    for k, v in collections.Counter([claim.label for claim in dataset.claims]).most_common():
        print(str(v) + "\t" + str(k))
    print()
    print("overlap with leaders:")
    strict_claims_overlap(leaders_claims, dataset.claims)
    print()
    return dataset.df

# 2018-12-fake-news-top-50
---

In [8]:
class FakeNewsTop50Dataset(ExternalDataset):
    """https://github.com/BuzzFeedNews/2018-12-fake-news-top-50.git"""

    def __init__(self, top_csv, sites_csvs=[]):
        self.df = pd.read_csv(top_csv)
        self.claims = self.df_to_claims()

        self.sites = []
        for sites_csv in sites_csvs:
            with open(sites_csv) as fi:
                self.sites += fi.read().splitlines()
        self.sites = list(set(self.sites))
        
    def row_to_claim(self, i, row):
        # TODO: consider lowercasing the input claim (all words start with capital currently)
        return Claim(
            i,
            claim=row["title"],
            date=row["published_date"],
            claimant="facebook",
            label=0,
        )

In [9]:
fake_news_top50_dataset = FakeNewsTop50Dataset(
    "data/external/2018-12-fake-news-top-50/data/top_2018.csv", 
    sites_csvs=[
        "data/external/2018-12-fake-news-top-50/data/sites_2016.csv",
        "data/external/2018-12-fake-news-top-50/data/sites_2017.csv",
        "data/external/2018-12-fake-news-top-50/data/sites_2018.csv"
    ]
)
analyze(fake_news_top50_dataset)

df to claims: 100%|██████████| 13961/13961 [00:02<00:00, 4855.94it/s]


df len: 13961
claims len: 13960

labels:
13960	0

overlap with leaders:
20007 20007
13960 12372
32379 - 0 = 32379



Unnamed: 0,title,url,fb_engagement,published_date,category,source
0,"Lottery winner arrested for dumping $200,000 o...",https://worldnewsdailyreport.com/lottery-winne...,2383021.00,2018-05-13,Crime,
1,Former first lady Barbara Bush dies at 92 - CNN,http://breaking-cnn.com/former-first-lady-barb...,2290000.00,2018-04-16,Politics,Trendolizer
2,Woman sues Samsung for $1.8M after cell phone ...,https://worldnewsdailyreport.com/woman-sues-sa...,1304430.00,2018-09-19,Medical,
3,BREAKING: Michael Jordan Resigns From The Boar...,https://trumpbetrayed.us/all/breaking-michael-...,911336.00,2018-09-05,Business,
4,Donald Trump Ends School Shootings By Banning ...,https://www.8shit.net/donald-trump-ends-school...,830116.00,2018-02-23,Politics,
...,...,...,...,...,...,...
13956,Florida: Largest food stamp fraud bust in hist...,http://www.vtamedia.com/2018/04/04/florida-lar...,0,2018-04-04,,
13957,Polygamy EXTRA Benefits: UK- ISLAMISTS migrant...,http://www.vtamedia.com/2018/03/23/polygamy-1/,0,2018-03-23,,
13958,Burger King Halloween Whopper Reportedly Turni...,https://wittheshit.com/2018/10/16/burger-king-...,0,2018-10-16,,
13959,,http://ww25.breaking-cnn.com/pop-star-phil-col...,0,2018-09-12,,


# fake-news (kaggle)
---

In [10]:
class FakeNewsKaggleDataset(ExternalDataset):
    """https://www.kaggle.com/c/fake-news/"""

    def __init__(self, train_csv):
        self.df = pd.read_csv(train_csv)
        self.claims = self.df_to_claims()

    def row_to_claim(self, i, row):
        # label 0 for reliable
        # label 1 for unreliable
        return Claim(
            row["id"], 
            claim=row["title"],
            claimant=row["author"], 
            label=0 if row["label"] else 2
        )

In [11]:
fake_news_kaggle_dataset = FakeNewsKaggleDataset("data/external/fake-news/train.csv")
analyze(fake_news_kaggle_dataset)

df to claims: 100%|██████████| 20800/20800 [00:04<00:00, 4167.22it/s]


df len: 20800
claims len: 18285

labels:
10361	2
7924	0

overlap with leaders:
20007 20007
18285 17931
37938 - 0 = 37938



Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


# FakeNewsNet
---

In [12]:
class FakeNewsNetDataset(ExternalDataset):
    """https://github.com/KaiDMML/FakeNewsNet.git"""

    def __init__(self, politifact_fake_csv, politifact_real_csv, gossipcop_fake_csv, gossipcop_real_csv):
        self.df = pd.concat([
            pd.read_csv(politifact_fake_csv).assign(label=0),
            pd.read_csv(politifact_real_csv).assign(label=2),
            pd.read_csv(gossipcop_fake_csv).assign(label=0),
            pd.read_csv(gossipcop_real_csv).assign(label=2)
        ], ignore_index=True)
        
        self.claims = self.df_to_claims()

    def row_to_claim(self, i, row):
        return Claim(
            row["id"], 
            claim=row["title"],
            claimant=tldextract.extract(row["news_url"]).domain,
            label=row["label"]
        )

In [13]:
fake_news_net_dataset = FakeNewsNetDataset(
    politifact_fake_csv="data/external/FakeNewsNet/dataset/politifact_fake.csv", 
    politifact_real_csv="data/external/FakeNewsNet/dataset/politifact_real.csv", 
    gossipcop_fake_csv="data/external/FakeNewsNet/dataset/gossipcop_fake.csv", 
    gossipcop_real_csv="data/external/FakeNewsNet/dataset/gossipcop_real.csv",
)
analyze(fake_news_net_dataset)

df to claims: 100%|██████████| 23196/23196 [00:05<00:00, 3933.61it/s]


df len: 23196
claims len: 22866

labels:
17371	2
5495	0

overlap with leaders:
20007 20007
22866 21398
41405 - 3 = 41402



Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,0
...,...,...,...,...,...
23191,gossipcop-891749,https://omgcheckitout.com/2017-hollywood-film-...,2017 Hollywood Film Awards: The Complete List ...,927385466357260288\t927386601034010625\t927387...,2
23192,gossipcop-941486,https://pagesix.com/2018/06/04/jada-pinkett-sm...,Jada Pinkett Smith explains why son Jaden move...,1004044947006386178\t1004045964401889285\t1004...,2
23193,gossipcop-953143,https://www.etonline.com/tinsley-mortimer-reac...,Tinsley Mortimer Reacts to Luann de Lesseps' R...,1019924845889572864\t1019925702676709377\t1019...,2
23194,gossipcop-954366,https://www.healthbreakingnews.net/2018/07/pri...,Prince Harry Carries on Princess Diana’s Legac...,1021766291139584000\t1021772054599802880\t1021...,2


# george-mcintires
---

In [14]:
class GeorgeMcIntireDataset(ExternalDataset):
    """https://github.com/GeorgeMcIntire"""
    
    def __init__(self, data_csv):
        self.df = pd.read_csv(data_csv, skiprows=1, names=["id", "title", "text", "label"])
        self.claims = self.df_to_claims()

    def row_to_claim(self, i, row):
        return Claim(
            row["id"], 
            claim=row["title"],
            label=0 if row["label"] == "FAKE" else 1
        )

In [15]:
george_mcintire_dataset = GeorgeMcIntireDataset("data/external/george-mcintires/fake_or_real_news.csv")
analyze(george_mcintire_dataset)

df to claims: 100%|██████████| 6335/6335 [00:01<00:00, 4445.29it/s]


df len: 6335
claims len: 6335

labels:
3171	1
3164	0

overlap with leaders:
20007 20007
6335 6256
26263 - 0 = 26263



Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


# ISOT
---

In [16]:
class ISOTDataset(ExternalDataset):
    """https://www.uvic.ca/engineering/ece/isot/datasets/"""
    
    def __init__(self, fake_csv, true_csv):
        self.df = pd.concat([
            pd.read_csv(fake_csv).assign(label=0),
            pd.read_csv(true_csv).assign(label=2)
        ], ignore_index=True)
        self.claims = self.df_to_claims()

    def row_to_claim(self, i, row):
        try: # December 31, 2017
            _date = datetime.datetime.strptime(row["date"], "%B %d, %Y")
        except: # 19-Feb-18
            try:
                _date = datetime.datetime.strptime(row["date"], "%d-%b-%y")
            except: # Dec 31, 2017
                try:
                    _date = datetime.datetime.strptime(row["date"], "%b %d, %Y")
                except:
                    _date = None

        return Claim(
            i, 
            claim=row["title"],
            date=_date.strftime("%Y-%m-%d") if _date else None,
            label=row["label"]
        )

In [17]:
isot_dataset = ISOTDataset(
    fake_csv="data/external/ISOT/Fake.csv",
    true_csv="data/external/ISOT/True.csv"
)
analyze(isot_dataset)

df to claims: 100%|██████████| 44898/44898 [00:13<00:00, 3342.96it/s]


df len: 44898
claims len: 44898

labels:
23481	0
21417	2

overlap with leaders:
20007 20007
44898 38726
58733 - 0 = 58733



Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",2
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",2
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",2
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",2


# liar
---


In [18]:
class LiarDataset(ExternalDataset):
    """https://www.cs.ucsb.edu/~william/data/liar_dataset.zip"""
    
    def __init__(self, data_tsv):
        self.df = pd.read_csv(data_tsv, sep="\t", names=[
            "id",
            "label",
            "statement",
            "subject(s)",
            "speaker",
            "speaker's job title",
            "state info",
            "party affiliation",
            "total credit history count",
            "barely true counts",
            "false counts",
            "half true counts",
            "mostly true counts",
            "context (venue/location of speech or statement)",
            "pants on fire counts",
        ])
        self.claims = self.df_to_claims()

    def row_to_claim(self, i, row):
        if row["label"] == "false":
            _lab = 0
        elif row["label"] == "true":
            _lab = 2
        else:
            _lab = 2
            
        return Claim(
            row["id"], 
            claim=row["statement"],
            claimant=row["speaker"] if isinstance(row["speaker"], str) else None,
            label=_lab
        )

In [19]:
liar_dataset = LiarDataset("data/external/liar/train.tsv")
analyze(liar_dataset)

df to claims: 100%|██████████| 10240/10240 [00:03<00:00, 3244.86it/s]


df len: 10240
claims len: 10240

labels:
8245	2
1995	0

overlap with leaders:
20007 20007
10240 10223
30230 - 899 = 29331



Unnamed: 0,id,label,statement,subject(s),speaker,speaker's job title,state info,party affiliation,total credit history count,barely true counts,false counts,half true counts,mostly true counts,context (venue/location of speech or statement),pants on fire counts
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,5473.json,mostly-true,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,,Florida,none,0.0,1.0,1.0,1.0,0.0,"interview on ""The Colbert Report""",
10236,3408.json,mostly-true,Democrats have now become the party of the [At...,elections,alan-powell,,Georgia,republican,0.0,0.0,0.0,1.0,0.0,an interview,
10237,3959.json,half-true,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,,Georgia,republican,4.0,11.0,5.0,3.0,3.0,a Republican presidential debate,
10238,2253.json,false,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...,


# mrisdal
---

In [20]:
class MrisdalDataset(ExternalDataset):
    """https://www.kaggle.com/mrisdal/fake-news"""
    
    def __init__(self, data_csv):
        self.df = pd.read_csv(data_csv)
        self.claims = self.df_to_claims()

    def row_to_claim(self, i, row):
        if row["ord_in_thread"] != 0:
            raise ValueError("must be main post")
        return Claim(
            i, 
            claim=row["title"],
            claimant=row["site_url"],
            date=datetime.datetime.strptime(row["published"].split("T")[0], "%Y-%m-%d").strftime("%Y-%m-%d"),
            label=0
        )

In [21]:
mrisdal_dataset = MrisdalDataset("data/external/mrisdal/fake.csv")
analyze(mrisdal_dataset)

df to claims: 100%|██████████| 12999/12999 [00:03<00:00, 3637.01it/s]

df len: 12999
claims len: 12319

labels:
12319	0

overlap with leaders:
20007 20007
12319 11698
31705 - 0 = 31705






Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
0,6a175f46bcd24d39b3e962ad0f29936721db70db,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,25689.0,Muslims BUSTED: They Stole Millions In Gov’t B...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
1,2bdc29d12605ef9cf3f09f9875040a7113be5d5b,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,25689.0,Re: Why Did Attorney General Loretta Lynch Ple...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
2,c70e149fdd53de5e61c29281100b9de0ed268bc3,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,25689.0,BREAKING: Weiner Cooperating With FBI On Hilla...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
3,7cf7c15731ac2a116dd7f629bd57ea468ed70284,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,25689.0,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
4,0206b54719c7e241ffe0ad4315b808290dbe6c0f,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,25689.0,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12994,f1b5d0e44803f48732bde854a9fdf95837219b12,2,replaceme,2016-10-26T23:58:00.000+03:00,,It DOES allow you to put a dog face on top of ...,english,2016-10-27T00:37:46.194+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.000,,40,32,0,0,0,bs
12995,36011ceba3647e1bea78299b68b6fb705a1fc1ad,3,Freedumb,2016-10-27T00:02:00.000+03:00,,Wait till you see what happens to the valuatio...,english,2016-10-27T00:37:46.220+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.000,,40,32,0,0,0,bs
12996,6995d1aa9ac99926106489b14b5530e85358059a,4,major major maj...,2016-10-27T00:06:00.000+03:00,,I'm waiting for the one that puts a pussy on m...,english,2016-10-27T00:37:46.244+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.000,,40,32,0,0,0,bs
12997,7de8ae90eee164eb756db6c8a3772288e11d7a94,5,beemasters,2016-10-27T00:09:00.000+03:00,,$4 Billion even after they are known to be kee...,english,2016-10-27T00:37:46.247+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.000,,40,32,0,0,0,bs


In [44]:
if __name__ == "__main__":
#     datasets = {
#         "fake_news_top50_dataset": FakeNewsTop50Dataset(
#             "data/external/2018-12-fake-news-top-50/data/top_2018.csv",
#             sites_csvs=[
#                 "data/external/2018-12-fake-news-top-50/data/sites_2016.csv",
#                 "data/external/2018-12-fake-news-top-50/data/sites_2017.csv",
#                 "data/external/2018-12-fake-news-top-50/data/sites_2018.csv",
#             ],
#         ),
#         "fake_news_kaggle_dataset": FakeNewsKaggleDataset(
#             "data/external/fake-news/train.csv"
#         ),
#         "fake_news_net_dataset": FakeNewsNetDataset(
#             politifact_fake_csv="data/external/FakeNewsNet/dataset/politifact_fake.csv",
#             politifact_real_csv="data/external/FakeNewsNet/dataset/politifact_real.csv",
#             gossipcop_fake_csv="data/external/FakeNewsNet/dataset/gossipcop_fake.csv",
#             gossipcop_real_csv="data/external/FakeNewsNet/dataset/gossipcop_real.csv",
#         ),
#         "george_mcintire_dataset": GeorgeMcIntireDataset(
#             "data/external/george-mcintires/fake_or_real_news.csv"
#         ),
#         "isot_dataset": ISOTDataset(
#             fake_csv="data/external/ISOT/Fake.csv", true_csv="data/external/ISOT/True.csv"
#         ),
#         "liar_dataset": LiarDataset("data/external/liar/train.tsv"),
#         "mrisdal_dataset": MrisdalDataset("data/external/mrisdal/fake.csv"),
#     }
    datasets_claims = {k: dataset.claims for k, dataset in datasets.items()}
    datasets_claims["phase1"] = claims1
    datasets_claims["phase2"] = claims2

    claims_set = set()
    for k, claims in datasets_claims.items():
        print(k.center(50, "-"))
        orig_len = len(claims_set)
        dataset_set = set([StrictClaimCompare(c) for c in claims])
        claims_set = dataset_set | claims_set
        print(orig_len, "+", len(dataset_set), "-", orig_len + len(dataset_set) - len(claims_set), "=", len(claims_set), "(+ {})".format(len(claims_set) - orig_len))
        print()

-------------fake_news_top50_dataset--------------
0 + 12372 - 0 = 12372 (+ 12372)

-------------fake_news_kaggle_dataset-------------
12372 + 17928 - 0 = 30300 (+ 17928)

--------------fake_news_net_dataset---------------
30300 + 21398 - 15 = 51683 (+ 21383)

-------------george_mcintire_dataset--------------
51683 + 6256 - 2049 = 55890 (+ 4207)

-------------------isot_dataset-------------------
55890 + 38726 - 23 = 94593 (+ 38703)

-------------------liar_dataset-------------------
94593 + 10223 - 0 = 104816 (+ 10223)

-----------------mrisdal_dataset------------------
104816 + 11698 - 8658 = 107856 (+ 3040)

----------------------phase1----------------------
107856 + 15555 - 459 = 122952 (+ 15096)

----------------------phase2----------------------
122952 + 13061 - 9052 = 126961 (+ 4009)

