# AI Final Project: multi-class KNN fake news identifier

*Team Members: Josh Yiznitsky, Olivia Fountain, Luke Kaplan*

This notebook contains code for designing a multi-class KNN model for determining fake news.

In [1]:
#import data and add headers
import pandas as pd

training_data = pd.read_csv('liar_dataset/train.tsv', 
                            delimiter='\t',
                            header=None,
                            names=['id','label','statement','subject','speaker','job_title','state_info','party_affiliation','barely_true_counts','false_counts','half_true_counts','mostly_true_counts','pants_on_fire_counts','context'])

test_data = pd.read_csv('liar_dataset/test.tsv', 
                            delimiter='\t',
                            header=None,
                            names=['id','label','statement','subject','speaker','job_title','state_info','party_affiliation','barely_true_counts','false_counts','half_true_counts','mostly_true_counts','pants_on_fire_counts','context'])

print("training data: ", training_data['label'].value_counts())
print("test data: ", test_data['label'].value_counts())

training data:  label
half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64
test data:  label
half-true      265
false          249
mostly-true    241
barely-true    212
true           208
pants-fire      92
Name: count, dtype: int64


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_dataset(dataset):

    def preprocess_text(text):
        if not isinstance(text, str):
            return text
        # Lowercasing
        text = text.lower()
        # Remove non-alphanumeric characters
        text = re.sub(r'\W', ' ', text)
        # Remove all single characters
        text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
        # Substituting multiple spaces with single space
        text = re.sub(r'\s+', ' ', text, flags=re.I)

        # Tokenization
        tokens = text.split()

        # Removal of Stop Words and Lemmatization
        lem = WordNetLemmatizer()
        tokens = [lem.lemmatize(word) for word in tokens if word not in set(stopwords.words('english'))]
        return ' '.join(tokens)

    # Identify columns with text
    text_columns = dataset.select_dtypes(include=['object', 'string']).columns
    # Disregard id and label columns
    text_columns = text_columns.drop(['id', 'label'])

    # Fill missing values with empty string
    dataset[text_columns] = dataset[text_columns].fillna('')

    # Apply preprocessing to text columns
    dataset[text_columns] = dataset[text_columns].applymap(preprocess_text)

    #remove id column
    dataset = dataset.drop(columns=['id'])

    #add a column next to label with the number of the label
    
    dataset['label_num'] = dataset['label'].map({'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5})

    return dataset

#preprocess data training and test data
training_data = preprocess_dataset(training_data)
test_data = preprocess_dataset(test_data)



KeyboardInterrupt: 

In [20]:
print("training data: ")
training_data.head(5)

training data: 


Unnamed: 0,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,label_num
0,false,say annies list political group support third ...,abortion,dwayne bohac,state representative,texas,republican,0.0,1.0,0.0,0.0,0.0,mailer,1
1,half-true,decline coal start started natural gas took st...,energy history job accomplishment,scott surovell,state delegate,virginia,democrat,0.0,0.0,1.0,1.0,0.0,floor speech,3
2,mostly-true,hillary clinton agrees john mccain voting give...,foreign policy,barack obama,president,illinois,democrat,70.0,71.0,160.0,163.0,9.0,denver,4
3,false,health care reform legislation likely mandate ...,health care,blog posting,,,none,7.0,19.0,3.0,5.0,44.0,news release,1
4,half-true,economic turnaround started end term,economy job,charlie crist,,florida,democrat,15.0,9.0,20.0,19.0,2.0,interview cnn,3


In [14]:
print("test data: ")
test_data.head(5)

test data: 


Unnamed: 0,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,true,building wall mexico border take literally year,immigration,rick perry,governor,texas,republican,30,30,42,23,18,radio interview
1,false,wisconsin pace double number layoff year,job,katrina shankland,state representative,wisconsin,democrat,2,1,0,0,0,news conference
2,false,say john mccain done nothing help vet,military veteran voting record,donald trump,president elect,new york,republican,63,114,51,37,61,comment abc week
3,half-true,suzanne bonamici support plan cut choice medic...,medicare message machine 2012 campaign adverti...,rob cornilles,consultant,oregon,republican,1,1,3,1,1,radio show
4,pants-fire,asked reporter whether center criminal scheme ...,campaign finance legal issue campaign advertising,state democratic party wisconsin,,wisconsin,democrat,5,7,2,2,7,web video


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

def convert_strings_to_vectors(dataset, column_number):
    # Get the column name based on the column number
    column_name = dataset.columns[column_number]

    # Create an instance of CountVectorizer
    vectorizer = CountVectorizer()

    # Fit and transform the specified column into vectors
    vectors = vectorizer.fit_transform(dataset[column_name])

    # Replace the column with the vectorized values
    dataset[column_name] = vectors

    return dataset



AttributeError: 'csr_matrix' object has no attribute 'head'

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Split the preprocessed data into features (X) and labels (y)
X_train = training_data.drop('label', axis=1)
y_train = training_data['label']
x_test = test_data.drop('label', axis=1)
y_test = test_data['label']

# Create an instance of the KNeighborsClassifier class with the desired number of neighbors
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the model to the training data
knn.fit(X_train, y_train)

# Predict the labels for the test data
y_pred = knn.predict(X_test)

# Evaluate the accuracy of the model
accuracy = (y_pred == y_test).mean()
print("Accuracy:", accuracy)



ValueError: could not convert string to float: 'say annies list political group support third trimester abortion demand'