# AI Final Project: multi-class KNN fake news identifier

*Team Members: Josh Yiznitsky, Olivia Fountain, Luke Kaplan*

This notebook builds multi-class and binary k-nearest-neighbors (KNN) classifiers for detecting fake news, using spaCy document vectors of the statements in the LIAR dataset as features.

In [1]:
#import data and add headers
import pandas as pd

# The TSV files are read with header=None, so the column names are supplied
# here once and shared by both splits.
liar_columns = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job_title',
    'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
    'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context',
]
read_options = dict(delimiter='\t', header=None, names=liar_columns)

multi_train = pd.read_csv('liar_dataset/train.tsv', **read_options)
multi_test = pd.read_csv('liar_dataset/test.tsv', **read_options)

# Show how many statements fall under each label in each split.
print("training data: ", multi_train['label'].value_counts())
print("test data: ", multi_test['label'].value_counts())

training data:  label
half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64
test data:  label
half-true      265
false          249
mostly-true    241
barely-true    212
true           208
pants-fire      92
Name: count, dtype: int64


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd

# Fetch the NLTK resources used by the preprocessing step
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

def preprocess_dataset(dataset, stop_words=None, lemmatizer=None):
    """Reduce a dataset to cleaned ``label`` / ``statement`` columns.

    Keeps only the ``label`` and ``statement`` columns, drops rows with a
    missing value in either of them, and normalises every statement:
    lowercase, non-word characters replaced by spaces, isolated single
    letters removed, stop words dropped and the remaining tokens lemmatized.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing at least ``label`` and ``statement`` columns.
        It is not modified.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list.
    lemmatizer : object, optional
        Any object with a ``lemmatize(word)`` method. Defaults to a new
        ``WordNetLemmatizer``.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame; surviving rows keep their original index.
    """
    # Build the stop-word set and the lemmatizer once per call. Previously the
    # set was rebuilt for every single token and the lemmatizer for every row.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    def preprocess_text(text):
        # Non-string cells are passed through untouched.
        if not isinstance(text, str):
            return text
        # Lowercasing
        text = text.lower()
        # Replace every non-word character with a space
        text = re.sub(r'\W', ' ', text)
        # Remove single letters that have whitespace on both sides (a lone
        # letter at the very start of the text is therefore kept)
        text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text)

        # Tokenize, drop stop words, lemmatize what is left
        tokens = text.split()
        return ' '.join(
            lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
        )

    # Keep only label and statement, drop incomplete rows, and take an explicit
    # copy so the assignment below does not write into a slice of the input
    # (which raises SettingWithCopyWarning).
    cleaned = dataset[['label', 'statement']].dropna().copy()

    # Apply preprocessing to the statement column
    cleaned['statement'] = cleaned['statement'].apply(preprocess_text)

    return cleaned

#preprocess the training and test data; work on copies from here on
multi_train = preprocess_dataset(multi_train).copy()
multi_test = preprocess_dataset(multi_test).copy()

#multi-class set: numeric id for each of the six labels
multi_label_ids = {'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5}
multi_train['label_num'] = multi_train['label'].map(multi_label_ids)
multi_test['label_num'] = multi_test['label'].map(multi_label_ids)

#binary set: collapse the six labels into 'false' / 'true', then number them
binary_label_names = {'pants-fire': 'false', 'false': 'false', 'barely-true': 'false', 'half-true': 'true', 'mostly-true': 'true', 'true': 'true'}
binary_label_ids = {'false': 0, 'true': 1}

binary_train = multi_train.copy()
binary_train['label'] = binary_train['label'].map(binary_label_names)
binary_train['label_num'] = binary_train['label'].map(binary_label_ids)

binary_test = multi_test.copy()
binary_test['label'] = binary_test['label'].map(binary_label_names)
binary_test['label_num'] = binary_test['label'].map(binary_label_ids)







[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshyiz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/joshyiz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# First five rows of the preprocessed multi-class training frame
print("mult_class training data: ")
multi_train.head()

mult_class training data: 


Unnamed: 0,label,statement,label_num
0,false,say annies list political group support third ...,1
1,half-true,decline coal start started natural gas took st...,3
2,mostly-true,hillary clinton agrees john mccain voting give...,4
3,false,health care reform legislation likely mandate ...,1
4,half-true,economic turnaround started end term,3


In [4]:
# First five rows of the preprocessed multi-class test frame
print("test data: ")
multi_test.head()

test data: 


Unnamed: 0,label,statement,label_num
0,true,building wall mexico border take literally year,5
1,false,wisconsin pace double number layoff year,1
2,false,say john mccain done nothing help vet,1
3,half-true,suzanne bonamici support plan cut choice medic...,3
4,pants-fire,asked reporter whether he center criminal sche...,0


In [5]:
# First five rows of the binary-label training frame
print("binary training data: ")
binary_train.head()

binary training data: 


Unnamed: 0,label,statement,label_num
0,False,say annies list political group support third ...,0
1,True,decline coal start started natural gas took st...,1
2,True,hillary clinton agrees john mccain voting give...,1
3,False,health care reform legislation likely mandate ...,0
4,True,economic turnaround started end term,1


In [6]:
# First five rows of the binary-label test frame
print("test data: ")
binary_test.head()

test data: 


Unnamed: 0,label,statement,label_num
0,True,building wall mexico border take literally year,1
1,False,wisconsin pace double number layoff year,0
2,False,say john mccain done nothing help vet,0
3,True,suzanne bonamici support plan cut choice medic...,1
4,False,asked reporter whether he center criminal sche...,0


In [7]:
import spacy

# spaCy pipeline used to turn each statement into a document vector
spacy_model_name = "en_core_web_lg"
nlp = spacy.load(spacy_model_name)


In [8]:
def add_vectors(dataset):
    """Add a ``vector`` column holding the spaCy document vector of each statement.

    The column is written onto ``dataset`` itself (in place), and the same
    frame is returned. Relies on the module-level ``nlp`` pipeline.
    """
    def embed(text):
        # Run the statement through the pipeline and take the document vector
        return nlp(text).vector

    dataset['vector'] = dataset['statement'].apply(embed)
    return dataset

# Embed each split once. The binary frames were created as copies of the
# multi-class frames with only the label columns remapped, so their statements
# (and therefore their vectors) are the same. Reusing the vectors avoids
# running the spaCy pipeline over every statement a second time.
multi_train = add_vectors(multi_train)
multi_test = add_vectors(multi_test)

# Guard the reuse: fail loudly if the statements (or their index) ever diverge.
assert binary_train['statement'].equals(multi_train['statement'])
assert binary_test['statement'].equals(multi_test['statement'])

# The assigned cells reference the same arrays as the multi-class frames;
# nothing in this notebook modifies them in place.
binary_train['vector'] = multi_train['vector']
binary_test['vector'] = multi_test['vector']



In [9]:
# Preview the multi-class training frame, now including the 'vector' column
multi_train.head()

Unnamed: 0,label,statement,label_num,vector
0,false,say annies list political group support third ...,1,"[-0.09125201, -0.2517279, -0.13689697, 1.06777..."
1,half-true,decline coal start started natural gas took st...,3,"[-0.7686892, 0.5762692, -2.3380945, 0.43318465..."
2,mostly-true,hillary clinton agrees john mccain voting give...,4,"[-0.42389503, 0.18668796, -1.7479826, 0.480140..."
3,false,health care reform legislation likely mandate ...,1,"[-0.25652453, 1.6648048, -2.437448, 0.03223201..."
4,half-true,economic turnaround started end term,3,"[-2.139714, 1.5611031, -0.645526, 1.4543259, 3..."


In [10]:
# Preview the multi-class test frame, now including the 'vector' column
multi_test.head()

Unnamed: 0,label,statement,label_num,vector
0,true,building wall mexico border take literally year,5,"[-0.30884856, -0.40162143, -2.9716258, 0.24982..."
1,false,wisconsin pace double number layoff year,1,"[-0.65460676, 1.1971833, 0.6555855, 1.644395, ..."
2,false,say john mccain done nothing help vet,1,"[1.9692186, 0.9003071, -2.135753, -0.63334715,..."
3,half-true,suzanne bonamici support plan cut choice medic...,3,"[-1.3098322, 1.0344467, -2.7497778, 0.15368445..."
4,pants-fire,asked reporter whether he center criminal sche...,0,"[-0.55129397, 0.8915434, -1.4848228, 0.9362866..."


In [11]:
# Preview the binary training frame, now including the 'vector' column
binary_train.head()

Unnamed: 0,label,statement,label_num,vector
0,False,say annies list political group support third ...,0,"[-0.09125201, -0.2517279, -0.13689697, 1.06777..."
1,True,decline coal start started natural gas took st...,1,"[-0.7686892, 0.5762692, -2.3380945, 0.43318465..."
2,True,hillary clinton agrees john mccain voting give...,1,"[-0.42389503, 0.18668796, -1.7479826, 0.480140..."
3,False,health care reform legislation likely mandate ...,0,"[-0.25652453, 1.6648048, -2.437448, 0.03223201..."
4,True,economic turnaround started end term,1,"[-2.139714, 1.5611031, -0.645526, 1.4543259, 3..."


In [12]:
# Preview the binary test frame, now including the 'vector' column
binary_test.head()

Unnamed: 0,label,statement,label_num,vector
0,True,building wall mexico border take literally year,1,"[-0.30884856, -0.40162143, -2.9716258, 0.24982..."
1,False,wisconsin pace double number layoff year,0,"[-0.65460676, 1.1971833, 0.6555855, 1.644395, ..."
2,False,say john mccain done nothing help vet,0,"[1.9692186, 0.9003071, -2.135753, -0.63334715,..."
3,True,suzanne bonamici support plan cut choice medic...,1,"[-1.3098322, 1.0344467, -2.7497778, 0.15368445..."
4,False,asked reporter whether he center criminal sche...,0,"[-0.55129397, 0.8915434, -1.4848228, 0.9362866..."


In [13]:
# Multi-class model inputs: a list of statement vectors per split,
# and the numeric label column as the target
Xm_train, Xm_test = (split['vector'].to_list() for split in (multi_train, multi_test))
Ym_train, Ym_test = (split['label_num'] for split in (multi_train, multi_test))

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Multi-class KNN: 3 neighbours, Euclidean distance, fitted on the training vectors
MultiKnn = KNeighborsClassifier(n_neighbors=3, metric='euclidean').fit(Xm_train, Ym_train)

# Predicted label ids for the test vectors
Multipredictions = MultiKnn.predict(Xm_test)

In [15]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the multi-class predictions
multi_report = classification_report(Ym_test, Multipredictions)
print(multi_report)

              precision    recall  f1-score   support

           0       0.12      0.26      0.16        92
           1       0.20      0.33      0.25       249
           2       0.16      0.19      0.18       212
           3       0.21      0.15      0.18       265
           4       0.20      0.11      0.14       241
           5       0.27      0.10      0.14       208

    accuracy                           0.19      1267
   macro avg       0.19      0.19      0.18      1267
weighted avg       0.20      0.19      0.18      1267



In [16]:
# Binary model inputs: a list of statement vectors per split,
# and the 0/1 label column as the target
Xb_train, Xb_test = (split['vector'].to_list() for split in (binary_train, binary_test))
Yb_train, Yb_test = (split['label_num'] for split in (binary_train, binary_test))

In [34]:
# Binary KNN: 7 neighbours, Euclidean distance, fitted on the training vectors
BinaryKnn = KNeighborsClassifier(n_neighbors=7, metric='euclidean').fit(Xb_train, Yb_train)

# Predicted 0/1 labels for the test vectors
BinaryPredictions = BinaryKnn.predict(Xb_test)

In [35]:
# Per-class precision / recall / F1 for the binary predictions
binary_report = classification_report(Yb_test, BinaryPredictions)
print(binary_report)

              precision    recall  f1-score   support

           0       0.51      0.52      0.51       553
           1       0.62      0.61      0.62       714

    accuracy                           0.57      1267
   macro avg       0.56      0.57      0.57      1267
weighted avg       0.57      0.57      0.57      1267

