In [1]:
import pandas as pd
import numpy as np
import string

from imblearn.over_sampling import SMOTE

import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### Data Cleaning

In [3]:
def read_dataset(csv):
    df = pd.read_csv(csv)
    df = df.drop(columns=["percentages", "check_nums"]).drop_duplicates().dropna()
    
    mapping = {
        "true": 0,
        "mostly-true": 1,
        "half-true": 2,
        "barely-true": 3,
        "false": 4,
        "pants-fire": 5
    }
    
    df["label"] = df["label"].map(mapping)
    
    df = df[pd.to_numeric(df["label"], errors="coerce").notna()]
    df = df[["content","article","summaries","label"]]
    df["content"] = df["content"].str.replace(r'[“\”]', '', regex=True)
    df["summaries"] = df["summaries"].str.replace(r'[\[\]\'"]', '', regex=True)
    df.columns = ["title", "article", "summary", "label"]

    return df

df = read_dataset("politifact_data_combined.csv")
df = df = df[df['summary'] != '']
df.head(2)

Unnamed: 0,title,article,summary,label
0,Haaretz investigation reveals discrepancies in...,A viral Oct. 28 social media post claimed that...,"Haaretz, an Israeli newspaper, said on X that ...",4.0
1,Wisconsin has historically … and I think large...,"In 2016, Wisconsin helped to swing the preside...",Although Wisconsin has voted for more Democrat...,3.0


### Feature 1: ClickBait (Cosine Similarity Between Title and Article)

In [4]:
# 1. Calculate the TF-IDF for title and article

tfidf_vectorizer = TfidfVectorizer()

tfidf_title = tfidf_vectorizer.fit_transform(df["title"])
tfidf_article = tfidf_vectorizer.transform(df["article"])


# 2. Cosine Similarity

cosine = cosine_similarity(tfidf_title, tfidf_article)
cosine_sim = cosine.diagonal()

df["similarity"] = cosine_sim

In [5]:
# Create custom labels

def feature_mapping(value):
    if value <= (min_val + interval):
        return 0
    elif value <= (min_val + 2 * interval):
        return 1
    elif value <= (min_val + 3 * interval):
        return 2
    elif value <= (min_val + 4 * interval):
        return 3
    elif value <= (min_val + 5 * interval):
        return 4
    else:
        return 5

# min_val, max_val = df["similarity"].min(), df["similarity"].max()
# interval = (max_val - min_val) / 6

# df["similarity"] = df["similarity"].apply(feature_mapping)

### Feature 2: Sentiment Analysis  (pos=1, neg=-1, neu=0)

In [6]:
# 1. Sentiment Analysis Using NLTK

analyzer = SentimentIntensityAnalyzer()
df["sentiment"] = df["article"].apply(lambda x: analyzer.polarity_scores(x)["compound"])

In [7]:
# Create custom labels

# min_val, max_val = df['sentiment'].min(), df['sentiment'].max()
# interval = (max_val - min_val) / 6

# df['sentiment'] = df['sentiment'].apply(feature_mapping)

### Feature 3: Quality of Writing (Type-Token Ratio (TTR))

In [8]:
# 1. Remove stopwords and punctuation & Make lowercase

punctuation = set(string.punctuation)
stopwords = set(stopwords.words("english"))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [w for w in words if w not in stopwords]
    return " ".join(filtered_words)

def remove_punctuation(text):
    cleaned_text = ''.join([char for char in text if char not in punctuation])
    return cleaned_text

df["article"] = df["article"].apply(lambda x: x.lower())
df["article"] = df["article"].apply(remove_punctuation)
df["article"] = df["article"].apply(remove_stopwords)

# 2. TTR = unique_words/total_words

df['ttr'] = df['article'].apply(lambda x: x.split()).apply(lambda words: len(set(words)) / len(words))

In [9]:
# min_val, max_val = df['ttr'].min(), df['ttr'].max()
# interval = (max_val - min_val) / 6

# df['ttr'] = df['ttr'].apply(feature_mapping)

### Feature 4: Expressiveness (Adjectives)

In [10]:
# 1. Open List of Adjectives (Link: https://gist.github.com/hugsy/8910dc78d208e40de42deb29e62df913)
    ### Additional Sources: https://github.com/taikuukaits/SimpleWordlists/tree/master

with open("adjectives.txt", "r") as file:
    adjectives = [line.strip() for line in file]
    
# 2. Count adjectives

def count_adjectives(text):
    words = text.split()
    adjective_count = sum(1 for word in words if word.lower() in adjectives) / len(words)
    return adjective_count

df["adjectives"] = df["article"].apply(count_adjectives)

In [11]:
# min_val, max_val = df["adjectives"].min(), df["adjectives"].max()
# interval = (max_val - min_val) / 6

# df["adjectives"] = df["adjectives"].apply(feature_mapping)

### SMOTE + Modeling

In [12]:
df.head(2)

Unnamed: 0,title,article,summary,label,similarity,sentiment,ttr,adjectives
0,Haaretz investigation reveals discrepancies in...,viral oct 28 social media post claimed israel ...,"Haaretz, an Israeli newspaper, said on X that ...",4.0,0.457559,-0.9994,0.593137,0.031863
1,Wisconsin has historically … and I think large...,2016 wisconsin helped swing presidential vote ...,Although Wisconsin has voted for more Democrat...,3.0,0.358756,0.9919,0.640472,0.098232


In [13]:
### Distribution of labels -> Requires Oversampling later

for i in range(0,6):
    print("Number of Data Points for Label " + str(i) + " is "+ str(len(df[df["label"]==i])))
    print("This is " + str(len(df[df["label"]==i]) / len(df)) + " of the entire dataset \n")

Number of Data Points for Label 0 is 178
This is 0.030308190022135195 of the entire dataset 

Number of Data Points for Label 1 is 314
This is 0.053465009364890174 of the entire dataset 

Number of Data Points for Label 2 is 440
This is 0.07491912140303082 of the entire dataset 

Number of Data Points for Label 3 is 720
This is 0.12259492593223224 of the entire dataset 

Number of Data Points for Label 4 is 3159
This is 0.537885237527669 of the entire dataset 

Number of Data Points for Label 5 is 1062
This is 0.18082751575004258 of the entire dataset 



In [14]:
# oversampling

X = df.drop(columns=["title","article","summary","label"])
y = df["label"]

smote = SMOTE(sampling_strategy={0: 400, 1: 500, 2: 500, 3: 800, 4: 3159, 5: 1062}, random_state=42)

X_resampled, y_resampled = smote.fit_resample(X, y)

In [15]:
# train-test split

X_train, X_test, y_train, y_test_multi = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [16]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

for classifier in classifiers:
    clf = classifier.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(classification_report(y_test_multi, predictions))

              precision    recall  f1-score   support

         0.0       0.39      0.73      0.51        82
         1.0       0.22      0.37      0.27        94
         2.0       0.23      0.27      0.25        91
         3.0       0.24      0.21      0.22       145
         4.0       0.59      0.56      0.57       651
         5.0       0.34      0.19      0.24       222

    accuracy                           0.43      1285
   macro avg       0.33      0.39      0.34      1285
weighted avg       0.44      0.43      0.43      1285

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        82
         1.0       0.00      0.00      0.00        94
         2.0       0.00      0.00      0.00        91
         3.0       0.00      0.00      0.00       145
         4.0       0.51      1.00      0.67       651
         5.0       0.00      0.00      0.00       222

    accuracy                           0.51      1285
   macro avg       0.08

#### Predictions (One vs One)

In [17]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

for classifier in classifiers:
    clf = classifier.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(classification_report(y_test_multi, predictions))

              precision    recall  f1-score   support

         0.0       0.39      0.73      0.51        82
         1.0       0.22      0.37      0.27        94
         2.0       0.23      0.27      0.25        91
         3.0       0.24      0.21      0.22       145
         4.0       0.59      0.56      0.57       651
         5.0       0.34      0.19      0.24       222

    accuracy                           0.43      1285
   macro avg       0.33      0.39      0.34      1285
weighted avg       0.44      0.43      0.43      1285

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        82
         1.0       0.00      0.00      0.00        94
         2.0       0.00      0.00      0.00        91
         3.0       0.00      0.00      0.00       145
         4.0       0.51      1.00      0.67       651
         5.0       0.00      0.00      0.00       222

    accuracy                           0.51      1285
   macro avg       0.08

#### Predictions (One vs Rest)

In [18]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

for classifier in classifiers:
    clf = classifier.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(classification_report(y_test_multi, predictions))

              precision    recall  f1-score   support

         0.0       0.39      0.73      0.51        82
         1.0       0.22      0.37      0.27        94
         2.0       0.23      0.27      0.25        91
         3.0       0.24      0.21      0.22       145
         4.0       0.59      0.56      0.57       651
         5.0       0.34      0.19      0.24       222

    accuracy                           0.43      1285
   macro avg       0.33      0.39      0.34      1285
weighted avg       0.44      0.43      0.43      1285

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        82
         1.0       0.00      0.00      0.00        94
         2.0       0.00      0.00      0.00        91
         3.0       0.00      0.00      0.00       145
         4.0       0.51      1.00      0.67       651
         5.0       0.00      0.00      0.00       222

    accuracy                           0.51      1285
   macro avg       0.08

### Best Models

In [19]:
# KNN & increased samples in SMOTE for Minority Class

X = df.drop(columns=["title","article","summary","label"])
y = df["label"]

smote = SMOTE(sampling_strategy={0: 3000, 1: 3000, 2: 3000, 3: 3000, 4: 3159, 5: 3000}, random_state=42)

X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test_multi = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(3).fit(X_train, y_train)
predictions = knn.predict(X_test)
print(classification_report(y_test_multi, predictions))

              precision    recall  f1-score   support

         0.0       0.79      0.91      0.84       608
         1.0       0.77      0.87      0.81       632
         2.0       0.73      0.86      0.79       588
         3.0       0.72      0.76      0.74       592
         4.0       0.69      0.29      0.40       625
         5.0       0.69      0.75      0.72       587

    accuracy                           0.74      3632
   macro avg       0.73      0.74      0.72      3632
weighted avg       0.73      0.74      0.72      3632



### Predictions (Binary)

In [20]:
df_binary = df.copy()

def binary_map(val):
    if val in [0, 1, 2]:
        return 0
    elif val in [3, 4, 5]:
        return 1

df_binary['label'] = df_binary['label'].apply(binary_map)

df_binary.head(2)

Unnamed: 0,title,article,summary,label,similarity,sentiment,ttr,adjectives
0,Haaretz investigation reveals discrepancies in...,viral oct 28 social media post claimed israel ...,"Haaretz, an Israeli newspaper, said on X that ...",1,0.457559,-0.9994,0.593137,0.031863
1,Wisconsin has historically … and I think large...,2016 wisconsin helped swing presidential vote ...,Although Wisconsin has voted for more Democrat...,1,0.358756,0.9919,0.640472,0.098232


In [21]:
X = df_binary.drop(columns=["title","article","summary","label"])
y = df_binary["label"]

X_train, X_test, y_train, y_test_binary = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier()]

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

max_score = 0.0
max_class = ''

for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = 100.0 * clf.score(X_test, y_test_binary)

    if score > max_score:
        clf_best = clf
        max_score = score
        max_class = name

print(80*'-' )
print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %(max_class, max_score))

--------------------------------------------------------------------------------
Best --> Classifier = Random Forest, Score (test, accuracy) = 83.40
