In [1]:
import pandas as pd
import numpy as np
import string

import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

Plans for Next Week: 

1. Build compound factor
2. Build pipeline for veracity verification
3. Update doc

1. Balance dataset SMOTE
2. Ensemble method
3. Google search API
- Google Search API
- Coloab Enterprise
4. Maybe try ranking the features
5. Give higher weight for higher accuracy model

### Data Cleaning

In [7]:
def read_dataset(csv):
    df = pd.read_csv(csv)
    df = df.drop(columns=["percentages", "check_nums"]).drop_duplicates().dropna()
    
    mapping = {
        "true": 0,
        "mostly-true": 1,
        "half-true": 2,
        "barely-true": 3,
        "false": 4,
        "pants-fire": 5
    }
    
    df["label"] = df["label"].map(mapping)
    
    df = df[pd.to_numeric(df["label"], errors="coerce").notna()]
    df = df[["content","article","summaries","label"]]
    df["content"] = df["content"].str.replace(r'[“\”]', '', regex=True)
    df["summaries"] = df["summaries"].str.replace(r'[\[\]\'"]', '', regex=True)
    df.columns = ["title", "article", "summary", "label"]

    return df

df = read_dataset("politifact_data_combined.csv")
df = df = df[df['summary'] != '']
df.head(2)

Unnamed: 0,title,article,summary,label
0,Haaretz investigation reveals discrepancies in...,A viral Oct. 28 social media post claimed that...,"Haaretz, an Israeli newspaper, said on X that ...",4.0
1,Wisconsin has historically … and I think large...,"In 2016, Wisconsin helped to swing the preside...",Although Wisconsin has voted for more Democrat...,3.0


In [4]:
for i in range(0,6):
    print(len(df[df["label"]==i]))

178
314
440
720
3159
1062


In [5]:
# from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'

### Feature 1: ClickBait (Cosine Similarity Between Title and Article)

In [4]:
# 1. Calculate the TF-IDF for title and article

tfidf_vectorizer = TfidfVectorizer()

tfidf_title = tfidf_vectorizer.fit_transform(df["title"])
tfidf_article = tfidf_vectorizer.transform(df["article"])


# 2. Cosine Similarity

cosine = cosine_similarity(tfidf_title, tfidf_article)
cosine_sim = cosine.diagonal()

df["similarity"] = cosine_sim

### Feature 2: Sentiment Analysis  (pos=1, neg=-1, neu=0)

In [5]:
# 1. Sentiment Analysis Using NLTK

analyzer = SentimentIntensityAnalyzer()
df["sentiment"] = df["article"].apply(lambda x: analyzer.polarity_scores(x)["compound"])

### Feature 3: Quality of Writing (Type-Token Ratio (TTR))

In [6]:
# 1. Remove stopwords and punctuation & Make lowercase

punctuation = set(string.punctuation)
stopwords = set(stopwords.words("english"))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [w for w in words if w not in stopwords]
    return " ".join(filtered_words)

def remove_punctuation(text):
    cleaned_text = ''.join([char for char in text if char not in punctuation])
    return cleaned_text

df["article"] = df["article"].apply(lambda x: x.lower())
df["article"] = df["article"].apply(remove_punctuation)
df["article"] = df["article"].apply(remove_stopwords)

# 2. TTR = unique_words/total_words

df['ttr'] = df['article'].apply(lambda x: x.split()).apply(lambda words: len(set(words)) / len(words))

### Feature 4: Expressiveness (Adjectives)

In [7]:
# 1. Open List of Adjectives (Link: https://gist.github.com/hugsy/8910dc78d208e40de42deb29e62df913)
    ### Additional Sources: https://github.com/taikuukaits/SimpleWordlists/tree/master

with open("adjectives.txt", "r") as file:
    adjectives = [line.strip() for line in file]
    
# 2. Count adjectives

def count_adjectives(text):
    words = text.split()
    adjective_count = sum(1 for word in words if word.lower() in adjectives) / len(words)
    return adjective_count

df["adjectives"] = df["article"].apply(count_adjectives)

### New DataFrame

In [8]:
df_binary = df.copy()

def binary_map(val):
    if val in [0, 1, 2]:
        return 0
    elif val in [3, 4, 5]:
        return 1

df_binary['label'] = df_binary['label'].apply(binary_map)

df_binary.head(2)

Unnamed: 0,title,article,summary,label,similarity,sentiment,ttr,adjectives
0,Haaretz investigation reveals discrepancies in...,viral oct 28 social media post claimed israel ...,"Haaretz, an Israeli newspaper, said on X that ...",1,0.457559,-0.9994,0.593137,0.031863
1,Wisconsin has historically … and I think large...,2016 wisconsin helped swing presidential vote ...,Although Wisconsin has voted for more Democrat...,1,0.358756,0.9919,0.640472,0.098232


### Predictions (One vs Rest)

In [28]:
df.head(2)

Unnamed: 0,title,article,summary,label,similarity,sentiment,ttr,adjectives
0,Haaretz investigation reveals discrepancies in...,viral oct 28 social media post claimed israel ...,"Haaretz, an Israeli newspaper, said on X that ...",4.0,0.457559,-0.9994,0.593137,0.031863
1,Wisconsin has historically … and I think large...,2016 wisconsin helped swing presidential vote ...,Although Wisconsin has voted for more Democrat...,3.0,0.358756,0.9919,0.640472,0.098232


In [10]:
X = df.drop(columns=["title","article","summary","label"])
y = df["label"]

X_train, X_test, y_train, y_test_multi = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

for classifier in classifiers:
    clf = classifier.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(accuracy_score(y_test_multi, predictions))

0.37106382978723407
0.534468085106383
0.534468085106383
0.5395744680851063
0.5319148936170213
0.5336170212765957
0.512340425531915
0.534468085106383
0.5302127659574468


In [12]:
classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

for classifier in classifiers:
    clf = OneVsOneClassifier(classifier).fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(accuracy_score(y_test_multi, predictions))

0.38042553191489364
0.534468085106383
0.534468085106383
0.5293617021276595
0.5293617021276595
0.534468085106383
0.5285106382978724
0.534468085106383
0.5302127659574468


In [13]:
classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

for classifier in classifiers:
    clf = OneVsRestClassifier(classifier).fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(accuracy_score(y_test_multi, predictions))

0.39574468085106385
0.5285106382978724
0.5191489361702127
0.5251063829787234
0.531063829787234
0.534468085106383
0.531063829787234
0.5353191489361702
0.5293617021276595


### Predictions (Binary)

In [30]:
X = df_binary.drop(columns=["title","article","summary","label"])
y = df_binary["label"]

X_train, X_test, y_train, y_test_binary = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

max_score = 0.0
max_class = ''

for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = 100.0 * clf.score(X_test, y_test_binary)

    if score > max_score:
        clf_best = clf
        max_score = score
        max_class = name

print(80*'-' )
print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %(max_class, max_score))

--------------------------------------------------------------------------------
Best --> Classifier = Linear SVM, Score (test, accuracy) = 83.15


In [37]:
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
predictions = clf.predict(df_binary.drop(columns=["title","article","summary","label"]))
df_binary["predictions"] = predictions

print(accuracy_score(y_test_binary, clf.predict(X_test)))

df_binary[df_binary["predictions"]==0]

0.8212765957446808


Unnamed: 0,title,article,summary,label,similarity,sentiment,ttr,adjectives,predictions
11,The Milwaukee Brewers are complaining about a ...,state legislature searching way pay stadium up...,"Under an early stadium funding proposal, $135 ...",1,0.605329,0.9092,0.601653,0.047934,0
120,New York Attorney General Letitia James and Ju...,new york judge ruled last month fraud lawsuit ...,New York officials did not determine the value...,1,0.504848,0.9966,0.562823,0.075731,0
123,"Under the Obama-Biden administration, we inves...",frozen bottles water slushy popsicles melting ...,Burying power lines can help protect against o...,1,0.578070,0.9899,0.575758,0.067821,0
129,"When I took office, the auto industry was on i...",united auto workers strike leading us automake...,"In 2008 and 2009, interventions by the Bush an...",1,0.591055,0.9849,0.576196,0.068966,0
136,"Just in the last four years, $80 billion in fe...",2024 sen joe manchin dwva reelection manchin s...,"Sen. Joe Manchin, D-W.Va., actually understate...",0,0.639529,0.9742,0.599548,0.067873,0
...,...,...,...,...,...,...,...,...,...
6728,State Forces Citizens to Pay for ‘Stargazing P...,update feb 13 2020 free thought project quick ...,Certain state parks on Long Island require vis...,1,0.600434,0.9949,0.604869,0.078652,0
6739,Very close to 160 million people are now worki...,president donald trump loves brag economytrump...,Trump cited raw employment data in a Milwaukee...,0,0.415663,0.9964,0.575150,0.080160,0
6742,"Under my administration, 7 million Americans h...",third state union address president trump clai...,Data shows about 7 million fewer Americans par...,0,0.450584,0.9679,0.609407,0.073620,0
6758,"Traditionally, the Speaker says: ‘Members of C...",following president donald trump’s 2020 state ...,"Between the 2007 and 2018\xa0addresses, every ...",0,0.871262,0.9996,0.464752,0.169713,0


In [38]:
X = df_binary.drop(columns=["title","article","summary","label"])
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(accuracy_score(y_test_multi, predictions))

0.534468085106383
