## Bootstrap and setup dependencies

In [None]:
import pandas as pd
import nltk
import collections
import itertools
from collections import Counter
import tldextract
import numpy as np

#web
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import time
import json

#snorkel
from snorkel.labeling import labeling_function
#from snorkel.labeling import PandasLFApplier
from snorkel.labeling.apply.spark import SparkLFApplier
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel


# Integer labels returned by the labeling functions below.
ABSTAIN = -1  # the labeling function has no opinion on this row
FAKE = 0
REAL = 1

## data filenames
TRAINING_dataFile = "/dbfs/FileStore/test/fnn_train.csv" # training dataset

API_RESULTS_dataFile = "/dbfs/FileStore/FNN/apiResults.csv" # Local csv with the saved fact-checking site results (pipe-separated, read in "Option 2")

GB_dataFile = "/dbfs/FileStore/FNN/glennbeck_ratings.csv" # Glenn Beck ratings file (www.glennbeck.com)
RP_dataFile = "/dbfs/FileStore/FNN/realclearpolitics_ratings.csv" # RealClearPolitics ratings file (www.realclearpolitics.com)
WP_dataFile = "/dbfs/FileStore/FNN/washingtonpost_ratings.csv" # Washington Post ratings file (www.washingtonpost.com/news/fact-checker)

# Path of the Liar dataset (loaded later, in the transfer-learning section)
LIAR_dataFile = "/dbfs/FileStore/test/liar/liar_train.csv" # training dataset

## Initializing the sentiment analysis package

In [None]:
# Download the VADER lexicon up front: in some cases a later call will fail
# without it, and this is needed on Spark.

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
# Shared analyzer instance; factcheck_sentiment() calls sid.polarity_scores().
sid = SentimentIntensityAnalyzer()

## Load the data

In [None]:
# Use pandas to read the training CSV
data = pd.read_csv(TRAINING_dataFile, encoding='utf-8')

# DataFrame.count() reports the number of non-null values per column
print("Count:", data.count())

# Peek at the first rows
data.head(3)
#display(data) 

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label_fnn
0,3106,2011-01-25T06:00:00-05:00,Joe Wilkinson,A national organization says Georgia has one o...,['http://www.ajc.com/news/georgia-politics-ele...,['A coalition of government watchdog groups la...,A coalition of government watchdog groups last...,fake
1,5655,2012-04-02T11:42:20-04:00,Rick Scott,"Says Barack Obama's health care law ""will be t...",['http://www.youtube.com/watch?v=TaC0mKApf9Q&f...,['As Supreme Court justices embarked on three ...,As Supreme Court justices embarked on three da...,fake
2,3506,2011-04-01T09:49:05-04:00,J.D. Alexander,Says the Southwest Florida Water Management Di...,['http://www.tampabay.com/news/politics/gubern...,"[""Here's a new one: The Senate budget committe...",Here's a new one: The Senate budget committee ...,fake


In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame
data.info(verbose=True)

In [None]:
#apiResultsFile = spark.read.csv(API_RESULTS_dataFile, header="false", inferSchema="true")
#apiResultsFile = pd.read_csv(API_RESULTS_dataFile, sep="|", encoding='utf-8', header=None)
#apiResultsFile.head(3)

In [None]:
# Best to avoid pandas in Spark, but we are trying to keep it consistent.
# We replace the NaN with None for Spark.
# NOTE(review): whether numeric columns keep NaN after where(other=None)
# depends on the pandas version - verify on the runtime in use.
data = data.where(cond=data.notna(), other=None)
print("Count:", data.count())

## Convert labels to numbers so we can use them for validation

In [None]:
# Encode the ground-truth label as an integer: 1 for 'real', 0 for anything
# else (the same values as the REAL / FAKE constants). A vectorized comparison
# replaces the row-by-row apply(axis=1), which calls a Python lambda per row.
data["label_numeric"] = (data["label_fnn"] == 'real').astype(int)

# Peek at the data
data.head(3)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label_fnn,label_numeric
0,3106,2011-01-25T06:00:00-05:00,Joe Wilkinson,A national organization says Georgia has one o...,['http://www.ajc.com/news/georgia-politics-ele...,['A coalition of government watchdog groups la...,A coalition of government watchdog groups last...,fake,0
1,5655,2012-04-02T11:42:20-04:00,Rick Scott,"Says Barack Obama's health care law ""will be t...",['http://www.youtube.com/watch?v=TaC0mKApf9Q&f...,['As Supreme Court justices embarked on three ...,As Supreme Court justices embarked on three da...,fake,0
2,3506,2011-04-01T09:49:05-04:00,J.D. Alexander,Says the Southwest Florida Water Management Di...,['http://www.tampabay.com/news/politics/gubern...,"[""Here's a new one: The Senate budget committe...",Here's a new one: The Senate budget committee ...,fake,0


In [None]:
# Re-check dtypes and non-null counts after adding label_numeric
data.info(verbose=True)

## Retrieving labels / information from each site

In [None]:
# Contacts a URL, downloads the page content and parses it.
def get_parsed_html(url):
    """Download ``url`` and return it as a parsed ``BeautifulSoup`` document.

    The request is sent with a browser-like ``User-Agent`` header. Network and
    HTTP errors raised by ``urlopen`` propagate to the caller.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(req) as response:
        webpage = response.read()
    # Name the parser explicitly so parsing does not depend on which optional
    # parsers happen to be installed (and to avoid BeautifulSoup's
    # "no parser was explicitly specified" warning).
    return BeautifulSoup(webpage, "html.parser")

### www.politifact.com

In [None]:
def get_poitifact_image_alt(url):
    """Return the ``alt`` text of the rating image on a PolitiFact page.

    Looks for the ``m-statement__meter`` div and, inside it, the
    ``c-image__original`` image. Any exception is printed and the string
    ``"abstain"`` is returned instead.
    """
    label = "abstain"
    try:
        meter = get_parsed_html(url).body.find(
            'div', attrs={'class': 'm-statement__meter'}
        )
        rating_image = meter.find("img", attrs={'class': 'c-image__original'})
        label = rating_image["alt"]
        # Pause after a successful fetch (presumably to throttle requests).
        time.sleep(0.5)
    except Exception as error:
        print(error)
    return label

### www.snopes.com

In [None]:
def get_snopes_image_alt(url):
    """Return the ``alt`` text of the rating image on a Snopes page.

    The image is the first ``img`` inside the ``media rating`` div. Any
    exception is printed and the string ``"abstain"`` is returned instead.
    """
    label = "abstain"
    try:
        page = get_parsed_html(url)
        rating_box = page.body.find('div', attrs={'class': 'media rating'})
        rating_image = rating_box.find("img")
        label = rating_image["alt"]
    except Exception as error:
        print(error)
    return label

### www.factcheck.org

In [None]:
def get_factcheck_first_paragraph(url):
    """Return the lead paragraph of a FactCheck.org article.

    For Q & A style articles (first paragraph starts with ``'Q: '`` and the
    second with ``'A: '``) the answer paragraph is returned; otherwise the
    first paragraph is. Any exception is printed and ``"abstain"`` is
    returned instead.
    """
    try:
        content = get_parsed_html(url).body.find(
            'div', attrs={'class': 'entry-content'}
        )
        paragraphs = content.find_all("p")
        # The second paragraph is only inspected once the first one is known
        # to be a question.
        is_question = paragraphs[0].text.startswith('Q: ')
        if is_question and paragraphs[1].text.startswith('A: '):
            return paragraphs[1].text
        return paragraphs[0].text
    except Exception as error:
        print(error)
        return "abstain"

### www.factcheck.afp.com

In [None]:
def get_factcheck_afp_title(url):
    """Return the text of the first ``h3`` element of an AFP Fact Check page.

    Any exception is printed and the string ``"abstain"`` is returned instead.
    """
    try:
        heading = get_parsed_html(url).body.find('h3')
        return heading.text
    except Exception as error:
        print(error)
        return "abstain"

### www.twitter.com

In [None]:
def extract_twitter_name(url):
    """Return the account name from a Twitter URL.

    The name is the first path segment after ``twitter.com/``; anything from
    the next ``/``, ``?`` or ``#`` onward is dropped. The scheme and an
    optional ``www.`` prefix do not matter.

    Returns ``"abstain"`` (the sentinel the other site handlers use) when the
    URL does not contain ``twitter.com/``. Previously such URLs produced an
    arbitrary slice of the input, because ``str.find`` returned -1.
    """
    marker = "twitter.com/"
    start = url.find(marker)
    if start == -1:
        return "abstain"
    name = url[start + len(marker):]
    # Cut at the first path, query or fragment delimiter.
    for delimiter in "/?#":
        name = name.partition(delimiter)[0]
    return name

## Retrieving URLs of fact checking sites

In [None]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import time

In [None]:
# Maps each fact-checking site (also used as a column name in `data`) to the
# function that extracts a label from one of its URLs. Sites mapped to None
# have no scraper here; their columns are filled later from the ratings CSVs.
fact_checking_sites = {
    "www.politifact.com" : get_poitifact_image_alt,
    "www.snopes.com": get_snopes_image_alt,
    "www.twitter.com":  extract_twitter_name,
    "www.factcheck.org": get_factcheck_first_paragraph,
    "factcheck.afp.com": get_factcheck_afp_title,
    "www.washingtonpost.com/news/fact-checker/": None,
    "www.realclearpolitics.com": None,
    "www.glennbeck.com": None,
}

In [None]:
def sources_as_list(source, domain):
    """Return the entries of a stringified list that mention ``domain``.

    ``source`` is the text form of a list (e.g. ``"['a', 'b']"``). The outer
    brackets are dropped and the rest is split on commas. Matching entries are
    returned unchanged, so they still carry their quotes and any leading
    space.
    """
    entries = source[1:-1].split(',')
    return [entry for entry in entries if domain in entry]

In [None]:
# Add one empty (None) column per fact-checking site; the column name is the
# site key from fact_checking_sites.
for site in fact_checking_sites: 
      data[site] = None
    
# Number of rows; used below to pre-size the per-site result lists
data_size = data.shape[0]

# Peek at the data
data.head(3)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label_fnn,label_numeric,www.politifact.com,www.snopes.com,www.twitter.com,www.factcheck.org,factcheck.afp.com,www.washingtonpost.com/news/fact-checker/,www.realclearpolitics.com,www.glennbeck.com
0,3106,2011-01-25T06:00:00-05:00,Joe Wilkinson,A national organization says Georgia has one o...,['http://www.ajc.com/news/georgia-politics-ele...,['A coalition of government watchdog groups la...,A coalition of government watchdog groups last...,fake,0,,,,,,,,
1,5655,2012-04-02T11:42:20-04:00,Rick Scott,"Says Barack Obama's health care law ""will be t...",['http://www.youtube.com/watch?v=TaC0mKApf9Q&f...,['As Supreme Court justices embarked on three ...,As Supreme Court justices embarked on three da...,fake,0,,,,,,,,
2,3506,2011-04-01T09:49:05-04:00,J.D. Alexander,Says the Southwest Florida Water Management Di...,['http://www.tampabay.com/news/politics/gubern...,"[""Here's a new one: The Senate budget committe...",Here's a new one: The Senate budget committee ...,fake,0,,,,,,,,


In [None]:
# Per-site result lists, one slot per row of `data`, all starting as None.
fact_checking_sites_results = {
    site_name: [None] * data_size
    for site_name in (
        "www.politifact.com",
        "www.snopes.com",
        "www.twitter.com",
        "www.factcheck.org",
        "factcheck.afp.com",
        "www.washingtonpost.com/news/fact-checker/",
        "www.realclearpolitics.com",
        "www.glennbeck.com",
    )
}

# The printed output looks odd at this point: every list holds only None.
print(fact_checking_sites_results.items())

## Option 1 - Live data - Parse the sites and load the data

In [None]:
# #Iterate through the records
# #and looks through the sources for each fact-checking site

# for i, row in data.iterrows():
#     for site in fact_checking_sites: 
#         sources = sources_as_list(row["sources"], site)
#         if len(sources) != 0:
#             print("-{}".format(i), "id:", row["id"])
#             #print(sources)
#             labels = ""
#             for source in sources:
#                 handler = fact_checking_sites[site]
#                 if handler:
#                     #print("Handling: {} ++++++++++++++++++++++++++".format(site))
#                     source = str(source).strip()[1:-1]
#                     if(len(labels) > 0):
#                         labels += ", "+handler(str(source))
#                     else:
#                         labels += handler(str(source))
#                     #print("Result: {} ++++++++++++++++++++++++++".format(labels))
#                 else:
#                     if(len(labels) > 0):
#                         #print("Handling: {} ++++++++++++++++++++++++++".format(site))
#                         labels += ", "+ source
#                     else:
#                         labels += source
#                 #print("Result: {} ++++++++++++++++++++++++++".format(labels))
#             fact_checking_sites_results[site][i] =labels

## Option 1 - Continued

In [None]:
# for site in fact_checking_sites:
#     data[site] = fact_checking_sites_results[site]

## Option 2 - Load the data - saved to file earlier

In [None]:
## To keep this notebook consistent with the output in the chapters, and to execute faster,
## we ran the live lookups once and saved the results locally as a csv. This lets the reader
## run an apples-to-apples comparison between what they see and what is in the book.

### ALTERNATIVE TO THE TWO CELLS ABOVE, IF LOADING FROM THE FILE
apiResultsFile = pd.read_csv(API_RESULTS_dataFile, sep="|", encoding='utf-8', header=None)
# Replace NaN with None. One pass is enough: notna() and notnull() are aliases.
apiResultsFile = apiResultsFile.where(cond=apiResultsFile.notna(), other=None)

# Each record holds: row label in `data`, site column name, label text.
# Only the first three fields are read.
for record in apiResultsFile.itertuples(index=False, name=None):
    row = record[0]
    col = record[1].strip()
    data.at[row, col] = record[2].strip()

In [None]:
# Peek at the data; the site columns now carry the labels loaded from file
data.head(3)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label_fnn,label_numeric,www.politifact.com,www.snopes.com,www.twitter.com,www.factcheck.org,factcheck.afp.com,www.washingtonpost.com/news/fact-checker/,www.realclearpolitics.com,www.glennbeck.com
0,3106,2011-01-25T06:00:00-05:00,Joe Wilkinson,A national organization says Georgia has one o...,['http://www.ajc.com/news/georgia-politics-ele...,['A coalition of government watchdog groups la...,A coalition of government watchdog groups last...,fake,0,,,,,,,,
1,5655,2012-04-02T11:42:20-04:00,Rick Scott,"Says Barack Obama's health care law ""will be t...",['http://www.youtube.com/watch?v=TaC0mKApf9Q&f...,['As Supreme Court justices embarked on three ...,As Supreme Court justices embarked on three da...,fake,0,"false, false, barely-true",,,,,,,
2,3506,2011-04-01T09:49:05-04:00,J.D. Alexander,Says the Southwest Florida Water Management Di...,['http://www.tampabay.com/news/politics/gubern...,"[""Here's a new one: The Senate budget committe...",Here's a new one: The Senate budget committee ...,fake,0,,,,,,,,


## Crowdsourcing

### www.glennbeck.com

In [None]:
glenbeck_ratings = pd.read_csv(GB_dataFile)

# Copy each rating onto every row of `data` with the same id. A single
# vectorized lookup replaces one full-column scan per ratings row; if an id
# appears more than once in the ratings file, the last rating wins, as before.
gb_rating_by_id = dict(zip(glenbeck_ratings["id"], glenbeck_ratings["www.glennbeck.com"]))
gb_rated = data["id"].isin(glenbeck_ratings["id"])
data.loc[gb_rated, "www.glennbeck.com"] = data.loc[gb_rated, "id"].map(gb_rating_by_id)

### www.realclearpolitics.com

In [None]:
rp_ratings = pd.read_csv(RP_dataFile)

# Copy each rating onto every row of `data` with the same id. A single
# vectorized lookup replaces one full-column scan per ratings row; if an id
# appears more than once in the ratings file, the last rating wins, as before.
rp_rating_by_id = dict(zip(rp_ratings["id"], rp_ratings["www.realclearpolitics.com"]))
rp_rated = data["id"].isin(rp_ratings["id"])
data.loc[rp_rated, "www.realclearpolitics.com"] = data.loc[rp_rated, "id"].map(rp_rating_by_id)

### www.washingtonpost.com/news/fact-checker

In [None]:
wp_ratings = pd.read_csv(WP_dataFile)

# Copy each rating onto every row of `data` with the same id. A single
# vectorized lookup replaces one full-column scan per ratings row; if an id
# appears more than once in the ratings file, the last rating wins, as before.
wp_rating_by_id = dict(zip(wp_ratings["id"], wp_ratings["www.washingtonpost.com/news/fact-checker/"]))
wp_rated = data["id"].isin(wp_ratings["id"])
data.loc[wp_rated, "www.washingtonpost.com/news/fact-checker/"] = data.loc[wp_rated, "id"].map(wp_rating_by_id)

### www.glennbeck.com

In [None]:
# gb_urls = {}
# counter = 0
# for i, row in data.iterrows():
#     poli = sources_as_list(row["sources"], "www.glennbeck.com")

#     if(len(poli) > 0):
#         gb_urls[i]= row["label_fnn"]
#     else:
#         gb_urls[i]= None
# data["www.glennbeck.com"] = gb_urls.values()
# data[data["www.glennbeck.com"].notnull()].shape

### www.realclearpolitics.com

In [None]:
# gb_urls = {}
# counter = 0
# for i, row in data.iterrows():
#     poli = sources_as_list(row["sources"], "www.realclearpolitics.com")

#     if(len(poli) > 0):
#         gb_urls[i]= row["label_fnn"]
#     else:
#         gb_urls[i]= None
# data["www.realclearpolitics.com"] = gb_urls.values()
# data[data["www.realclearpolitics.com"].notnull()].shape

### www.washingtonpost.com/news/fact-checker/

In [None]:
# gb_urls = {}
# counter = 0
# for i, row in data.iterrows():
#     poli = sources_as_list(row["sources"], "www.washingtonpost.com/news/fact-checker/")

#     if(len(poli) > 0):
#         gb_urls[i]= row["label_fnn"]
#     else:
#         gb_urls[i]= None
# data["www.washingtonpost.com/news/fact-checker/"] = gb_urls.values()
# data[data["www.washingtonpost.com/news/fact-checker/"].notnull()].shape

# Learning the labels with Snorkel

## Get only the subset of the data that has at least one label

In [None]:
# Keep only the rows where at least one fact-checking site column is filled.
site_label_columns = [
    "www.politifact.com",
    "www.snopes.com",
    "www.factcheck.org",
    "factcheck.afp.com",
    "www.realclearpolitics.com",
    "www.glennbeck.com",
    "www.washingtonpost.com/news/fact-checker/",
    "www.twitter.com",
]
data2 = data[data[site_label_columns].notnull().any(axis=1)]

In [None]:
# Peek at the subset (data2)
data2.head(3)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label_fnn,label_numeric,www.politifact.com,www.snopes.com,www.twitter.com,www.factcheck.org,factcheck.afp.com,www.washingtonpost.com/news/fact-checker/,www.realclearpolitics.com,www.glennbeck.com
1,5655,2012-04-02T11:42:20-04:00,Rick Scott,"Says Barack Obama's health care law ""will be t...",['http://www.youtube.com/watch?v=TaC0mKApf9Q&f...,['As Supreme Court justices embarked on three ...,As Supreme Court justices embarked on three da...,fake,0,"false, false, barely-true",,,,,,,
4,4776,2011-11-13T07:30:00-05:00,Rodney Frelinghuysen,"Says the Treasury Department ""says 41 percent ...",['http://frelinghuysen.house.gov/index.cfm?sec...,['The millionaires’ tax proposal made its late...,The millionaires’ tax proposal made its latest...,fake,0,"false, barely-true",,,,,,,
6,1415,2010-01-21T17:37:57-05:00,Chain email,"The House health care bill provides for ""free ...",['http://michaelconnelly.viviti.com/entries/ge...,"[""A chain e-mail written by former attorney Mi...",A chain e-mail written by former attorney Mich...,fake,0,"half-true, false",,,A: Legal experts agree that requiring citizens...,,,,


## Labeling function

In [None]:
@labeling_function()
def label_snopes(row):
    """Label a row from its ``www.snopes.com`` column.

    Returns ABSTAIN when the column is None, REAL when the text contains
    ``'real'``, and FAKE otherwise.
    """
    # The leftover debug print was removed: it ran once per row on every
    # application of the labeling function.
    label = row["www.snopes.com"]
    if label is None:
        return ABSTAIN
    return REAL if 'real' in label else FAKE

In [None]:
@labeling_function()
def label_wp(row):
    """Label a row from its Washington Post fact-checker column.

    ABSTAIN when the column is None, REAL when the text contains ``'real'``,
    FAKE otherwise.
    """
    rating = row["www.washingtonpost.com/news/fact-checker/"]
    if rating is None:
        return ABSTAIN
    if 'real' in rating:
        return REAL
    return FAKE

In [None]:
@labeling_function()
def label_rp(row):
    """Label a row from its ``www.realclearpolitics.com`` column.

    ABSTAIN when the column is None, REAL when the text contains ``'real'``,
    FAKE otherwise.
    """
    verdict = row["www.realclearpolitics.com"]
    if verdict is None:
        return ABSTAIN
    return REAL if 'real' in verdict else FAKE

In [None]:
# Score assigned to each PolitiFact rating string. label_politifact() sums
# these over all ratings of a row: a positive total means REAL, a negative
# total means FAKE.
truth_o_meter = {
    "true": 4,
    "mostly-true": 3,
    "half-true": 2,
    "barely-true": 1,
    "mostly-false": -1,
    "false": -2,
    "pants-fire": -3    
}

@labeling_function()
def label_politifact(row):
    """Label a row from the ratings in its ``www.politifact.com`` column.

    The column holds a comma-separated list of ratings. Each rating is looked
    up in ``truth_o_meter`` (unknown ratings count as 0) and the scores are
    summed. Returns REAL for a positive total, FAKE for a negative total, and
    ABSTAIN for a zero total or an empty/None column.
    """
    labels = row["www.politifact.com"]
    if not labels:
        return ABSTAIN

    # strip() already removes surrounding whitespace, including a trailing
    # newline. The previous unconditional `[:-2]` on the last rating cut two
    # real characters off already-clean values (e.g. "barely-true" became
    # "barely-tr"), so the last rating was silently ignored.
    total_score = sum(
        truth_o_meter.get(label.strip(), 0)
        for label in str(labels).split(',')
    )

    if total_score > 0:
        return REAL
    if total_score < 0:
        return FAKE
    return ABSTAIN

In [None]:
def factcheck_sentiment(row, columnName):
    """Label a row from the sentiment of the text in ``row[columnName]``.

    The text is split on commas and each piece is scored with the shared
    VADER analyzer ``sid``. Pieces that are more negative than positive count
    -1, pieces that are more positive than negative count +1. Returns REAL
    for a positive total, FAKE for a negative total, and ABSTAIN for a tie or
    a missing/empty value.
    """
    value = row[columnName]
    # Check for None before str(): str(None) is the truthy string "None",
    # which used to be sent through the sentiment analyzer.
    if value is None:
        return ABSTAIN
    label = str(value)
    if not label:
        return ABSTAIN

    score = 0
    # NOTE(review): [1:-1] drops the first and last character of the text
    # (it looks like a leftover from stripping list brackets). Kept unchanged
    # to preserve existing results - confirm whether it is still wanted.
    for claim in label[1:-1].split(','):
        sentiment = sid.polarity_scores(claim)
        if sentiment["neg"] > sentiment["pos"]:
            score -= 1
        elif sentiment["pos"] > sentiment["neg"]:
            score += 1

    if score > 0:
        return REAL
    if score < 0:
        return FAKE
    return ABSTAIN

In [None]:
@labeling_function()
def factcheckqa_sentiment(row):
    """Labeling function: apply factcheck_sentiment to the ``www.factcheck.org`` column."""
    return factcheck_sentiment(row, "www.factcheck.org")

In [None]:
@labeling_function()
def factcheckafpqa_sentiment(row):
    """Labeling function: apply factcheck_sentiment to the ``factcheck.afp.com`` column."""
    return factcheck_sentiment(row, "factcheck.afp.com")

## Transfer Learning from the liar dataset

In [None]:
# Load the Liar dataset
data3 = pd.read_csv(LIAR_dataFile)

# Clean up the NaNs (replace them with None)
data3 = data3.where(cond=data3.notna(), other=None)
# count() reports the number of non-null values per column
print("Count:", data3.count())

# Take a peek at the data to ensure all is correct
data3.head(3)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label-liar
0,18178,2020-03-18T13:26:42-04:00,Instagram posts,"""COVID-19 started because we eat animals.""",['https://www.cdc.gov/coronavirus/2019-ncov/ca...,['Vegan Instagram users are pinning the 2019 c...,Vegan Instagram users are pinning the 2019 cor...,barely-true
1,3350,2011-03-04T09:12:59-05:00,Glenn Beck,Says Michelle Obama has 43 people on her staff...,['http://www.glennbeck.com/2011/02/25/while-wo...,['Glenn Beck rekindled a falsehood about the s...,Glenn Beck rekindled a falsehood about the siz...,pants-fire
2,14343,2017-07-21T11:52:44-04:00,Mike Pence,"Says President Donald Trump ""has signed more l...",['https://nrf.com/events/retail-advocates-summ...,['Vice President Mike Pence says that when it ...,Vice President Mike Pence says that when it co...,half-true


In [None]:
# Collect the distinct values of the "label-liar" column
labels = data3["label-liar"].unique()

# Peek at the labels to validate
print(labels)

In [None]:
liar_label = data3["label-liar"]

# Statements per speaker rated "mostly-true" or "true", most frequent first
truthful_rows = data3[liar_label.isin(["mostly-true", "true"])]
counts_true = dict(collections.Counter(truthful_rows["speaker"]).most_common())

# Statements per speaker rated "false" or "pants-fire", most frequent first
untruthful_rows = data3[liar_label.isin(["false", "pants-fire"])]
counts_false = dict(collections.Counter(untruthful_rows["speaker"]).most_common())

In [None]:
# For each speaker with false statements: false / (false + true) counts.
false_percent = {
    name: false_count / (false_count + counts_true.get(name, 0))
    for name, false_count in counts_false.items()
}

In [None]:
# For each speaker with true statements: true / (true + false) counts.
true_percent = {
    name: true_count / (true_count + counts_false.get(name, 0))
    for name, true_count in counts_true.items()
}

In [None]:
@labeling_function()
def speaker(row):
    """Label a row from its speaker's track record in the Liar dataset.

    REAL when more than 60% of the speaker's counted statements were true,
    FAKE when more than 60% were false, ABSTAIN otherwise (including speakers
    not present in either table).
    """
    name = row["speaker"]
    if true_percent.get(name, 0) > 0.6:
        return REAL
    if false_percent.get(name, 0) > 0.6:
        return FAKE
    return ABSTAIN

## Setup required for training the Snorkel model

In [None]:
# Compute the sizes for an 80% / 20% split between train and test
data_size = data.shape[0]
train_data_size = int(data_size*0.8)

print("Data shape",data.shape)
print("Training data size",train_data_size)

# Shuffle all rows with a fixed seed; the positional split happens later
data = data.sample(frac=1, random_state=1)

In [None]:
# data[['www.twitter.com','www.politifact.com','www.snopes.com','www.factcheck.org','factcheck.afp.com','www.washingtonpost.com/news/fact-checker/','www.realclearpolitics.com','www.glennbeck.com']] = data[['www.twitter.com','www.politifact.com','www.snopes.com','www.factcheck.org','factcheck.afp.com','www.washingtonpost.com/news/fact-checker/','www.realclearpolitics.com','www.glennbeck.com']].astype(str)

## Generating agreement from weak classifiers

In [None]:
# Labeling functions, in the order of the columns of the label matrix
lfs = [
        label_rp,
        label_wp, 
        label_snopes,
        label_politifact,
        factcheckqa_sentiment,
        factcheckafpqa_sentiment,
        speaker
      ]

# Need to convert pandas DFs to Spark DFs and then to RDDs.
# The full-dataset RDD is kept available for other cells.
rdd_data = spark.createDataFrame(data).rdd

# Split the data in an 80-20 ratio for train and test
df_train = data[:train_data_size]
df_train = df_train.where((pd.notnull(df_train)), None)

df_valid = data[train_data_size:]
df_valid = df_valid.where((pd.notnull(df_valid)), None)

# Invoke Snorkel on the training split only. L_train used to be built from
# rdd_data (the full dataset), so it also contained the validation rows and
# did not line up with df_train.
applier = SparkLFApplier(lfs=lfs)
rdd_train = spark.createDataFrame(df_train).rdd
L_train = applier.apply(rdd_train)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
label_rp,0,"[0, 1]",0.007889,0.006574,0.001906
label_wp,1,"[0, 1]",0.010387,0.0094,0.003353
label_snopes,2,[0],0.027807,0.026887,0.003024
label_politifact,3,"[0, 1]",0.126413,0.093479,0.037142
factcheckqa_sentiment,4,"[0, 1]",0.020379,0.018998,0.009663
factcheckafpqa_sentiment,5,"[0, 1]",0.000986,0.000986,0.000592
speaker,6,"[0, 1]",0.71792,0.134236,0.044833


## Clean up datatypes for spark runtime

In [None]:
# # Training dataset
# df_train[['www.twitter.com','www.politifact.com','www.snopes.com','www.factcheck.org','factcheck.afp.com','www.washingtonpost.com/news/fact-checker/','www.realclearpolitics.com','www.glennbeck.com']] = df_train[['www.twitter.com','www.politifact.com','www.snopes.com','www.factcheck.org','factcheck.afp.com','www.washingtonpost.com/news/fact-checker/','www.realclearpolitics.com','www.glennbeck.com']].astype(str)

# # Validation dataset
# df_valid[['www.twitter.com','www.politifact.com','www.snopes.com','www.factcheck.org','factcheck.afp.com','www.washingtonpost.com/news/fact-checker/','www.realclearpolitics.com','www.glennbeck.com']] = df_valid[['www.twitter.com','www.politifact.com','www.snopes.com','www.factcheck.org','factcheck.afp.com','www.washingtonpost.com/news/fact-checker/','www.realclearpolitics.com','www.glennbeck.com']].astype(str)


## Convert Pandas dataframe to Spark dataframe

In [None]:
# Need to convert pandas DFs to Spark DFs and then to RDDs, because the
# SparkLFApplier is applied to RDDs.

# Training dataset conversion
rdd_train = spark.createDataFrame(df_train).rdd

# Validation dataset conversion
rdd_valid = spark.createDataFrame(df_valid).rdd

In [None]:
# Baseline: majority vote over the labeling-function outputs
majority_model = MajorityLabelVoter()
preds_train_majority = majority_model.predict(L=L_train)
# Apply the labeling functions to the validation split
L_valid = applier.apply(rdd_valid)

# Ground-truth labels of the validation split, used for the accuracy columns
Y_valid = df_valid["label_numeric"].values
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
label_rp,0,"[0, 1]",0.007558,0.005587,0.001972,23,0,1.0
label_wp,1,"[0, 1]",0.012488,0.012159,0.004601,38,0,1.0
label_snopes,2,[0],0.028262,0.026947,0.002629,77,9,0.895349
label_politifact,3,"[0, 1]",0.120933,0.086756,0.03352,224,144,0.608696
factcheckqa_sentiment,4,"[0, 1]",0.019717,0.017746,0.009201,37,23,0.616667
factcheckafpqa_sentiment,5,"[0, 1]",0.001643,0.001643,0.000986,2,3,0.4
speaker,6,"[0, 1]",0.704568,0.129149,0.041735,1645,499,0.767257


## Training the model

In [None]:
# Fit Snorkel's label model on the training label matrix, then predict labels
# for both splits.
label_model = LabelModel()
label_model.fit(L_train=L_train, n_epochs=200, log_freq=100, seed=42)
preds_train_label = label_model.predict(L=L_train)
preds_valid_label = label_model.predict(L=L_valid)
# L_valid is reused from the previous cell. Re-applying the labeling functions
# to rdd_valid here was a redundant Spark job: it ran after L_valid had
# already been used for the prediction above.

Y_valid = df_valid["label_numeric"].values
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
label_rp,0,"[0, 1]",0.007558,0.005587,0.001972,23,0,1.0
label_wp,1,"[0, 1]",0.012488,0.012159,0.004601,38,0,1.0
label_snopes,2,[0],0.028262,0.026947,0.002629,77,9,0.895349
label_politifact,3,"[0, 1]",0.120933,0.086756,0.03352,224,144,0.608696
factcheckqa_sentiment,4,"[0, 1]",0.019717,0.017746,0.009201,37,23,0.616667
factcheckafpqa_sentiment,5,"[0, 1]",0.001643,0.001643,0.000986,2,3,0.4
speaker,6,"[0, 1]",0.704568,0.129149,0.041735,1645,499,0.767257


## Check model quality

In [None]:
# Score the label model on the validation split, one metric per call.
f1_micro, accuracy, recall, precision = (
    label_model.score(L_valid, Y_valid, metrics=[metric_name])
    for metric_name in ("f1_micro", "accuracy", "recall", "precision")
)

# One result per line, in the order f1_micro, accuracy, recall, precision
print(f1_micro, accuracy, recall, precision, sep="\n")

## Clean up

In [None]:
# Concatenate the label-model predictions: training rows first, then validation.
# NOTE(review): this only lines up row-for-row with `data` if L_train was
# built from the training split alone - confirm before assigning it back.
snorkel_predictions = np.concatenate((preds_train_label,preds_valid_label))
snorkel_predictions.shape

In [None]:
#data["snorkel_labels"] = snorkel_predictions

# Peek at the data to validate.
# NOTE(review): this previews data3 (the Liar frame), not `data` - confirm
# which frame was intended.
data3.head(3)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label-liar
0,18178,2020-03-18T13:26:42-04:00,Instagram posts,"""COVID-19 started because we eat animals.""",['https://www.cdc.gov/coronavirus/2019-ncov/ca...,['Vegan Instagram users are pinning the 2019 c...,Vegan Instagram users are pinning the 2019 cor...,barely-true
1,3350,2011-03-04T09:12:59-05:00,Glenn Beck,Says Michelle Obama has 43 people on her staff...,['http://www.glennbeck.com/2011/02/25/while-wo...,['Glenn Beck rekindled a falsehood about the s...,Glenn Beck rekindled a falsehood about the siz...,pants-fire
2,14343,2017-07-21T11:52:44-04:00,Mike Pence,"Says President Donald Trump ""has signed more l...",['https://nrf.com/events/retail-advocates-summ...,['Vice President Mike Pence says that when it ...,Vice President Mike Pence says that when it co...,half-true


In [None]:
# Save the enriched frame to disk as CSV
data.to_csv("/dbfs/FileStore/test/data_nlp.csv")