Preprocessing the three dataset files: train, test, and valid.
Only rows labeled true, mostly-true, false, or pants-fire are kept.
Those four labels are then collapsed into two string classes: mostly-true becomes 'true' and pants-fire becomes 'false'.

In [1]:
import pandas as pd
import numpy as np

# Column names for the header-less TSV files, listed in file order.
LIAR_COLUMNS = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job_title',
    'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
    'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts',
    'context',
]


def _read_split(tsv_path):
    """Read one tab-separated, header-less split file into a DataFrame.

    Args:
        tsv_path: Path to the .tsv file.

    Returns:
        A DataFrame whose columns are named according to LIAR_COLUMNS.
    """
    return pd.read_csv(tsv_path, sep='\t', header=None, names=LIAR_COLUMNS)


train = _read_split('liar_dataset/train.tsv')
test = _read_split('liar_dataset/test.tsv')
valid = _read_split('liar_dataset/valid.tsv')

# The two classes that survive the filtering below.
bools = ['true', 'false']

# Fine-grained labels that are folded into one of the two classes.
label_map = {'mostly-true': 'true', 'pants-fire': 'false'}


def binarize_labels(frame):
    """Collapse the labels of one split to 'true'/'false' and drop other rows.

    'mostly-true' is rewritten to 'true' and 'pants-fire' to 'false'; rows
    whose label is then neither 'true' nor 'false' are removed.

    Args:
        frame: DataFrame with a 'label' column.

    Returns:
        A new, independent DataFrame (the input is not modified). Returning
        an explicit copy means later column assignments on the result do not
        raise SettingWithCopyWarning.
    """
    relabeled = frame.assign(label=frame['label'].replace(label_map))
    return relabeled.loc[relabeled['label'].isin(bools)].copy()


train = binarize_labels(train)
test = binarize_labels(test)
valid = binarize_labels(valid)

After subsetting the data for binary T/F values, the number of rows in each table is as follows:
- valid: 799
- test: 790
- train: 6472

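Text preprocessing: lowercase each statement, strip punctuation and single-letter tokens, remove English stop words, and lemmatize the remaining words.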

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK corpora used below: the stop-word list and WordNet
# (needed by the lemmatizer).
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

# Lazily-filled cache so the stop-word list is read and the lemmatizer is
# constructed once, not once per token / per call.
_NLP_RESOURCES = {}


def _nlp_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize one statement for vectorization.

    Steps: lowercase, replace every non-word character with a space, drop
    stand-alone single letters, split on whitespace, remove English stop
    words, and lemmatize what remains.

    Args:
        text: The raw statement. Non-string values (e.g. NaN for a missing
            cell) are returned unchanged.

    Returns:
        The cleaned tokens joined by single spaces, or the input itself when
        it is not a string.
    """
    if not isinstance(text, str):
        return text
    # Lowercasing
    text = text.lower()
    # Replace every character that is not a letter, digit or underscore
    text = re.sub(r'\W', ' ', text)
    # Remove all single letters. Word boundaries are used instead of
    # surrounding whitespace so letters at the start/end of the string and
    # runs of adjacent single letters are removed as well.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Tokenization (split() also collapses any run of whitespace)
    tokens = text.split()

    # Removal of Stop Words and Lemmatization
    stop_words, lem = _nlp_resources()
    tokens = [lem.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean the statement text of every split in place.
for split_frame in (train, valid, test):
    split_frame['statement'] = split_frame['statement'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lukekaplan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lukekaplan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TF-IDF vectorization: converts each cleaned text string into a numeric feature vector, with the vocabulary capped at 1,000 terms.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# max_features caps the vocabulary size (number of TF-IDF features).
vectorizer = TfidfVectorizer(max_features=1000)  # Number of features variable
# The cleaned text lives in the 'statement' column; the frames are loaded
# without any 'title' column, so indexing by 'title' raises KeyError.
# The vocabulary and IDF weights are fitted on the training split only and
# then reused to transform the validation and test splits.
X_train = vectorizer.fit_transform(train['statement'])
X_valid = vectorizer.transform(valid['statement'])
X_test = vectorizer.transform(test['statement'])


ModuleNotFoundError: No module named 'sklearn'