# AI Final Project: multi-class KNN fake news identifier

*Team Members: Josh Yiznitsky, Olivia Fountain, Luke Kaplan*

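This notebook contains code for building a multi-class k-nearest-neighbors (KNN) model that detects fake news by classifying each statement in the LIAR dataset into one of six truthfulness labels.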

In [8]:
#import data and add headers
import pandas as pd

# Column names for the header-less TSV file, listed in file order.
column_names = [
    'id',
    'label',
    'statement',
    'subject',
    'speaker',
    'job_title',
    'state_info',
    'party_affiliation',
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
    'context',
]

# Read the tab-separated training file; it has no header row, so the
# names above are attached explicitly.
training_data = pd.read_csv(
    'liar_dataset/train.tsv',
    delimiter='\t',
    header=None,
    names=column_names,
)

# Show how many rows carry each label.
label_counts = training_data['label'].value_counts()
print(label_counts)

label
half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64


In [26]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK corpora backing the stop-word list and the WordNet lemmatizer
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

# Lazily-filled cache so the stop-word set and lemmatizer are built once,
# not once per token / per call.
_PREPROCESS_CACHE = {}


def _load_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize a piece of text for downstream feature extraction.

    Steps: lowercase, replace non-word characters with spaces, drop
    stand-alone single ASCII letters, split on whitespace, remove English
    stop words, and lemmatize the remaining tokens with WordNet.

    Parameters
    ----------
    text : object
        The value to clean. Anything that is not a ``str`` is returned
        unchanged.

    Returns
    -------
    object
        For ``str`` input, the surviving lemmatized tokens joined by single
        spaces (``''`` if nothing survives); otherwise ``text`` itself.
    """
    if not isinstance(text, str):
        return text

    # Lowercasing
    text = text.lower()
    # Replace every non-word character (punctuation, whitespace, ...) with a space
    text = re.sub(r'\W', ' ', text)
    # Remove all single characters. Word boundaries are used so letters at the
    # start/end of the string and runs of adjacent single letters are removed
    # too; a pattern requiring whitespace on both sides misses those because
    # each match consumes the whitespace its neighbour would need.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Tokenization; str.split() already collapses runs of whitespace
    tokens = text.split()
    if not tokens:
        # Nothing left to filter or lemmatize, so skip loading NLTK resources.
        return ''

    # Removal of Stop Words and Lemmatization
    stop_words, lem = _load_preprocess_resources()
    return ' '.join(lem.lemmatize(word) for word in tokens if word not in stop_words)


# Identify the text columns (object/string dtype)
text_columns = training_data.select_dtypes(include=['object', 'string']).columns
# Disregard the id and label columns so they are not rewritten
text_columns = text_columns.drop(['id','label'])
print("Text columns identified:", text_columns)
# Fill missing values with empty string so every cell handed to
# preprocess_text is a str
training_data[text_columns] = training_data[text_columns].fillna('')

# Clean each text column element-wise. A per-column Series.map is used
# instead of DataFrame.applymap, which is deprecated in recent pandas and
# emits a FutureWarning; this form works on both old and new versions.
training_data[text_columns] = training_data[text_columns].apply(
    lambda column: column.map(preprocess_text)
)

training_data.head(5)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshyiz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/joshyiz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  training_data[text_columns] = training_data[text_columns].applymap(preprocess_text)


Text columns identified: Index(['statement', 'subject', 'speaker', 'job_title', 'state_info',
       'party_affiliation', 'context'],
      dtype='object')


Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,false,say annies list political group support third ...,abortion,dwayne bohac,state representative,texas,republican,0.0,1.0,0.0,0.0,0.0,mailer
1,10540.json,half-true,decline coal start started natural gas took st...,energy history job accomplishment,scott surovell,state delegate,virginia,democrat,0.0,0.0,1.0,1.0,0.0,floor speech
2,324.json,mostly-true,hillary clinton agrees john mccain voting give...,foreign policy,barack obama,president,illinois,democrat,70.0,71.0,160.0,163.0,9.0,denver
3,1123.json,false,health care reform legislation likely mandate ...,health care,blog posting,,,none,7.0,19.0,3.0,5.0,44.0,news release
4,9028.json,half-true,economic turnaround started end term,economy job,charlie crist,,florida,democrat,15.0,9.0,20.0,19.0,2.0,interview cnn
