Preprocessing the 3 files: test, train, valid.
Only including rows that are true, mostly true, false, and pants-fire.
Converting those names to the standard True and False values.

In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('liar_dataset/train.tsv', 
                            delimiter='\t',
                            header=None,
                            names=['id','label','statement','subject','speaker','job_title','state_info','party_affiliation','barely_true_counts','false_counts','half_true_counts','mostly_true_counts','pants_on_fire_counts','context'])
test = pd.read_csv('liar_dataset/test.tsv', 
                            delimiter='\t',
                            header=None,
                            names=['id','label','statement','subject','speaker','job_title','state_info','party_affiliation','barely_true_counts','false_counts','half_true_counts','mostly_true_counts','pants_on_fire_counts','context'])
valid = pd.read_csv('liar_dataset/valid.tsv', 
                            delimiter='\t',
                            header=None,
                            names=['id','label','statement','subject','speaker','job_title','state_info','party_affiliation','barely_true_counts','false_counts','half_true_counts','mostly_true_counts','pants_on_fire_counts','context'])

bools = ['true', 'false']

train['label'] = np.where(train['label'] == 'mostly-true', 'true', train['label'])
train['label'] = np.where(train['label'] == 'pants-fire', 'false', train['label'])
train = train.loc[train['label'].isin(bools)]

test['label'] = np.where(test['label'] == 'mostly-true', 'true', test['label'])
test['label'] = np.where(test['label'] == 'pants-fire', 'false', test['label'])
test = test.loc[test['label'].isin(bools)]

valid['label'] = np.where(valid['label'] == 'mostly-true', 'true', valid['label'])
valid['label'] = np.where(valid['label'] == 'pants-fire', 'false', valid['label'])
valid = valid.loc[valid['label'].isin(bools)]

print("training data: ", train['label'].value_counts())
print("test data: ", test['label'].value_counts())
print("validation data: ", valid['label'].value_counts())

training data:  label
true     3638
false    2834
Name: count, dtype: int64
test data:  label
true     449
false    341
Name: count, dtype: int64
validation data:  label
true     420
false    379
Name: count, dtype: int64


More preprocessing

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

def clean_text(text):
    if not isinstance(text, str):
        return text
    # Lowercasing
    text = text.lower()
    # Remove non-alphanumeric characters
    text = re.sub(r'\W', ' ', text)
    # Remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    # Tokenization
    tokens = text.split()

    # Removal of Stop Words and Lemmatization
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(word) for word in tokens if word not in set(stopwords.words('english'))]
    return ' '.join(tokens)

train['statement'] = train['statement'].apply(clean_text)
valid['statement'] = valid['statement'].apply(clean_text)
test['statement'] = test['statement'].apply(clean_text)

print("training_data: ")
train.head(5)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshyiz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/joshyiz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


training_data: 


Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,False,say annies list political group support third ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
2,324.json,True,hillary clinton agrees john mccain voting give...,foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,False,health care reform legislation likely mandate ...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
5,12465.json,True,chicago bear starting quarterback last 10 year...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece
9,9741.json,True,say gop primary opponent glenn grothman joe le...,"energy,message-machine-2014,voting-record",duey-stroebel,State representative,Wisconsin,republican,0.0,0.0,0.0,1.0,0.0,an online video


TF-IDF Vectorization - converts the string into quantitative data

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)  # Number of features variable
X_train = vectorizer.fit_transform(train['statement'])
X_valid = vectorizer.transform(valid['statement'])
X_test = vectorizer.transform(test['statement'])


Converts the output to binary form

In [4]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train = encoder.fit_transform(train['statement'])
y_valid = encoder.transform(valid['statement'])
y_test = encoder.transform(test['statement'])

ValueError: y contains previously unseen labels: 'obama sworn office use holy bible instead kuran equivalency bible different belief'

Train the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train, y_train)

Validate the model

In [None]:
y_valid_pred = model.predict(X_valid)
print("Validation Accuracy:", accuracy_score(y_valid, y_valid_pred))
print(classification_report(y_valid, y_valid_pred))


Test the model 

In [None]:
y_test_pred = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))