In [1]:
import pandas as pd
import numpy as np
import string

import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

### Data Cleaning

In [2]:
### Read & clean the original dataset 

def read_dataset(csv):
    df = pd.read_csv(csv)
    df = df.drop(columns=["percentages", "check_nums"]).drop_duplicates().dropna()
    mapping = {
        "false": 4,
        "half-true": 2,
        "mostly-true": 1,
        "true": 0,
        "barely-true": 3,
        "pants-fire": 5
    }
    
    df["label"] = df["label"].map(mapping)
    df = df[pd.to_numeric(df["label"], errors="coerce").notna()]
    df = df[["content","article","summaries","label"]]
    df["content"] = df["content"].str.replace(r'[“\”]', '', regex=True)
    df["summaries"] = df["summaries"].str.replace(r'[\[\]\'"]', '', regex=True)
    df.columns = ["title", "article", "summary", "label"]
    return df

df = read_dataset("politifact_data_combined.csv")
df = df = df[df['summary'] != '']
df.head(2)

Unnamed: 0,title,article,summary,label
0,Haaretz investigation reveals discrepancies in...,A viral Oct. 28 social media post claimed that...,"Haaretz, an Israeli newspaper, said on X that ...",0.0
1,Wisconsin has historically … and I think large...,"In 2016, Wisconsin helped to swing the preside...",Although Wisconsin has voted for more Democrat...,4.0


### Feature 1: ClickBait (Cosine Similarity Between Title and Article)

In [5]:
# 1. Calculate the TF-IDF for title and article

tfidf_vectorizer = TfidfVectorizer()

tfidf_title = tfidf_vectorizer.fit_transform(df["title"])
tfidf_article = tfidf_vectorizer.transform(df["article"])


# 2. Cosine Similarity

cosine = cosine_similarity(tfidf_title, tfidf_article)
cosine_sim = cosine.diagonal()

df["similarity"] = cosine_sim

### Feature 2: Sentiment Analysis  (pos=1, neg=-1, neu=0)

In [6]:
# 1. Sentiment Analysis Using NLTK

analyzer = SentimentIntensityAnalyzer()
df["sentiment"] = df["article"].apply(lambda x: analyzer.polarity_scores(x)["compound"])

### Feature 3: Quality of Writing (Type-Token Ratio (TTR))

In [14]:
# 1. Remove stopwords and punctuation & Make lowercase

punctuation = set(string.punctuation)
stopwords = set(stopwords.words("english"))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [w for w in words if w not in stopwords]
    return " ".join(filtered_words)

def remove_punctuation(text):
    cleaned_text = ''.join([char for char in text if char not in punctuation])
    return cleaned_text

df["article"] = df["article"].apply(lambda x: x.lower())
df["article"] = df["article"].apply(remove_punctuation)
df["article"] = df["article"].apply(remove_stopwords)

# 2. TTR = unique_words/total_words

df['ttr'] = df['article'].apply(lambda x: x.split()).apply(lambda words: len(set(words)) / len(words))

### Feature 4: Expressiveness (Adjectives)

In [17]:
# 1. Open List of Adjectives (Link: https://gist.github.com/hugsy/8910dc78d208e40de42deb29e62df913)
    ### Additional Sources: https://github.com/taikuukaits/SimpleWordlists/tree/master

with open("adjectives.txt", "r") as file:
    adjectives = [line.strip() for line in file]
    
# 2. Count adjectives

def count_adjectives(text):
    words = text.split()
    adjective_count = sum(1 for word in words if word.lower() in adjectives) / len(words)
    return adjective_count

df["adjectives"] = df["article"].apply(count_adjectives)

### Predictions

In [21]:
df.head(2)

Unnamed: 0,title,article,summary,label,similarity,sentiment,ttr,adjectives
0,Haaretz investigation reveals discrepancies in...,viral oct 28 social media post claimed israel ...,"Haaretz, an Israeli newspaper, said on X that ...",0.0,0.457559,-0.9994,0.593137,0.031863
1,Wisconsin has historically … and I think large...,2016 wisconsin helped swing presidential vote ...,Although Wisconsin has voted for more Democrat...,4.0,0.358756,0.9919,0.640472,0.098232


In [22]:
X = df.drop(columns=["title","article","summary","label"])
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [23]:
classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

max_score = 0.0
max_class = ''

for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = 100.0 * clf.score(X_test, y_test)

    if score > max_score:
        clf_best = clf
        max_score = score
        max_class = name

print(80*'-' )
print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %(max_class, max_score))

--------------------------------------------------------------------------------
Best --> Classifier = Decision Tree, Score (test, accuracy) = 53.96
