# Data Preparation for Authenticity Model

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Function to scrape one page of fact-checks
def scrape_fact_checks(page_number):
    url = f"https://www.politifact.com/factchecks/?page={page_number}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # List to store the fact-check data
    data = []

    # Find all fact-check articles on the page
    fact_checks = soup.find_all('article', class_='m-statement')

    for fact in fact_checks:
        # Extract Author/Speaker
        author = fact.find('a', class_='m-statement__name').text.strip()

        # Extract the Date of the statement
        date_string = fact.find('div', class_='m-statement__desc').text.strip()

        # Use a regular expression to extract only the date portion (e.g., October 8, 2024)
        date_match = re.search(r'([A-Za-z]+ \d{1,2}, \d{4})', date_string)
        date = date_match.group(0) if date_match else "No date found"

        # Extract the Claim (statement being fact-checked)
        claim = fact.find('div', class_='m-statement__quote').find('a').text.strip()

        # Extract the URL to the full fact-check article
        link = "https://www.politifact.com" + fact.find('div', class_='m-statement__quote').find('a')['href']

        # Extract the Rating (e.g., False, Pants on Fire)
        rating = fact.find('div', class_='m-statement__meter').find('img')['alt'].strip()

        # Append the extracted information to the list
        data.append({
            'Author/Speaker': author,
            'Date': date,
            'Claim': claim,
            'Rating': rating,
            'Link to Full Article': link
        })

    return data

In [2]:
import time

# Loop through multiple pages and collect data
def scrape_multiple_pages(start_page, end_page):
    all_data = []
    for page_number in range(start_page, end_page + 1):
        print(f"Scraping page {page_number}...")
        page_data = scrape_fact_checks(page_number)
        all_data.extend(page_data)
        time.sleep(2)  # Sleep for 2 seconds between each page request

    return all_data

# Scrape data from page 1 to 2
data = scrape_multiple_pages(1, 2)
politifact_data = pd.DataFrame(data)
test_link = politifact_data['Link to Full Article'].iloc[0]

test_response = requests.get(test_link)
soup = BeautifulSoup(test_response.text, 'html.parser')

In [6]:
!pip install pandas 



In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Liar Plus dataset
data = pd.read_csv('../data/train2.tsv', sep='\t', header=None)

# Assign column names
column_names = ['index','id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info',
                'party_affiliation', 'barely_true', 'false', 'half_true', 'mostly_true',
                'pants_on_fire', 'context', 'justification']
data.columns = column_names

# Data Cleaning
def clean_data(df):
    # Check for NaN values
    # print("Columns with NaN values:")
    # print(df.isnull().sum())

    # Handle NaN values
    # For numerical columns, fill NaN with median
    numerical_columns = ['barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire']
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].median())

    # For categorical columns, fill NaN with a new category 'Unknown'
    categorical_columns = ['subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'context']
    df[categorical_columns] = df[categorical_columns].fillna('Unknown')

    # For text columns, fill NaN with an empty string
    text_columns = ['statement', 'justification']
    df[text_columns] = df[text_columns].fillna('')

    # Handle NaN in the label column
    if 'label' in df.columns:
        # If there are NaN values in the label, we'll drop those rows
        df = df.dropna(subset=['label'])

    # Check if all NaN values have been handled
    # print("\nRemaining NaN values after cleaning:")
    # print(df.isnull().sum())

    return df

# Clean the data
data = clean_data(data)

# Define features and target
X = data.drop(['index','id', 'label', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire'], axis=1)
y = data['label']



ModuleNotFoundError: No module named 'pandas'

In [None]:
data

Unnamed: 0,index,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true,false,half_true,mostly_true,pants_on_fire,context,justification
0,0.0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1,1.0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe..."
2,2.0,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...
3,3.0,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,Unknown,Unknown,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...
4,4.0,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,Unknown,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10237,10264.0,5473.json,mostly-true,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,Unknown,Florida,none,0.0,1.0,1.0,1.0,0.0,"interview on ""The Colbert Report""",They compounded their error by combining full ...
10238,10265.0,3408.json,mostly-true,Democrats have now become the party of the [At...,elections,alan-powell,Unknown,Georgia,republican,0.0,0.0,0.0,1.0,0.0,an interview,"Romney said that ""Obamacare means that for up..."
10239,10266.0,3959.json,half-true,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,Unknown,Georgia,republican,4.0,11.0,5.0,3.0,3.0,a Republican presidential debate,But that it leaves out important details and t...
10240,10267.0,2253.json,false,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,Unknown,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...,"We checked the research and, quite frankly, fi..."


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Get the 'statement' column from the Liar Plus dataset and 'Claim' from PolitiFact
liar_statements = X['statement']
politifact_claims = politifact_data['Claim']

# Use TF-IDF Vectorizer to convert text to numerical features
vectorizer = TfidfVectorizer(stop_words='english')

# Combine both statements and claims for vectorization
combined_text = pd.concat([liar_statements, politifact_claims], axis=0)

# Fit and transform the combined text using TF-IDF
tfidf_matrix = vectorizer.fit_transform(combined_text)

# Split the transformed matrix into two parts: one for Liar Plus, one for PolitiFact
liar_tfidf = tfidf_matrix[:len(liar_statements)]
politifact_tfidf = tfidf_matrix[len(liar_statements):]

# Compute cosine similarity between every statement in Liar Plus and every claim in PolitiFact
similarity_matrix = cosine_similarity(liar_tfidf, politifact_tfidf)

# Find the highest similarity score for each Liar Plus statement
max_similarity = similarity_matrix.max(axis=1)

# Set a threshold for similarity (e.g., 0.8) to define a "cross-referenced" statement
threshold = 0.8
X['cross_referenced'] = (max_similarity >= threshold).astype(int)

# Now the Liar Plus dataset contains a new binary feature 'cross_referenced'


In [None]:
# Define a credibility score based on job title (you can customize this based on your data)
def assign_credibility_score(job_title):
    if "scientist" in job_title.lower() or "doctor" in job_title.lower():
        return 3  # High credibility
    elif "senator" in job_title.lower() or "president" in job_title.lower():
        return 2  # Medium credibility
    else:
        return 1  # Low credibility

# Apply the credibility score based on job title
X['credibility_score'] = X['job_title'].apply(assign_credibility_score)

# Encode party affiliation as a feature (OneHotEncode for categorical variables)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore')
party_encoded = encoder.fit_transform(X[['party_affiliation']]).toarray()
party_columns = encoder.get_feature_names_out(['party_affiliation'])
party_df = pd.DataFrame(party_encoded, columns=party_columns)

# Add party affiliation back to the dataset
X = pd.concat([X.reset_index(drop=True), party_df.reset_index(drop=True)], axis=1)

In [None]:
# Function to detect if justification contains references to studies or data
def contains_cited_data(justification):
    keywords = ['according to', 'research', 'study', 'data', 'shown by', 'reported']
    for keyword in keywords:
        if keyword in justification.lower():
            return 1  # Cited data present
    return 0  # No cited data

# Apply the function to the justification column
X['cited_data'] = X['justification'].apply(contains_cited_data)

In [None]:
# Combine all the engineered features
X['cross_referenced'] = (max_similarity >= threshold).astype(int)  # Cross-referencing feature
X['credibility_score'] = X['job_title'].apply(assign_credibility_score)  # Author credentials feature
X['cited_data'] = X['justification'].apply(contains_cited_data)  # Cited data verification feature

# Drop columns that have been transformed into features
X.drop(['statement','subject',	'speaker',	'state_info','context','job_title', 'party_affiliation', 'justification'], axis=1, inplace=True)

# Add the OneHotEncoded party affiliation back to the dataset
X = pd.concat([X.reset_index(drop=True), party_df.reset_index(drop=True)], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define features and target
y = data['label']  # Assuming 'label' is your target for Authenticity classification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(report)


Accuracy: 0.3887
              precision    recall  f1-score   support

 barely-true       0.46      0.37      0.41       339
       false       0.37      0.46      0.41       401
   half-true       0.36      0.41      0.38       438
 mostly-true       0.40      0.39      0.40       382
  pants-fire       0.44      0.39      0.41       148
        true       0.36      0.31      0.33       340

    accuracy                           0.39      2048
   macro avg       0.40      0.39      0.39      2048
weighted avg       0.39      0.39      0.39      2048



In [None]:
# Feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))  # Print top 10 most important features

                         feature  importance
3                    mostly_true    0.201231
2                      half_true    0.195256
1                          false    0.185748
0                    barely_true    0.169383
4                  pants_on_fire    0.115252
31                    cited_data    0.058779
6              credibility_score    0.020338
13    party_affiliation_democrat    0.006433
52  party_affiliation_republican    0.005825
49        party_affiliation_none    0.005693
