In [1]:
import sys

In [2]:
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install seaborn
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install gensim
!{sys.executable} -m pip install textblob



Importing Libraries

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [6]:
from gensim.models import Word2Vec, KeyedVectors

In [7]:
from textblob import TextBlob

Preliminary Analysis

Importing the Dataset

In [8]:
cwd = os.getcwd()
dataset_dir = os.path.join(cwd,'Dataset')
test_df = pd.read_csv(os.path.join(dataset_dir,'test.tsv'), delimiter='\t')
train_df = pd.read_csv(os.path.join(dataset_dir,'train.tsv'), delimiter='\t')
valid_df = pd.read_csv(os.path.join(dataset_dir,'valid.tsv'), delimiter='\t')

In [9]:
print("Test dataset shape:", test_df.shape)
print("Train dataset shape:", train_df.shape)
print("Validation dataset shape:", valid_df.shape)

Test dataset shape: (1266, 14)
Train dataset shape: (10239, 14)
Validation dataset shape: (1283, 14)


In [10]:
test_df.columns = ['ID', 'Label', 'Statement', 'Subject', 'Speaker', 'JobTitle', 'State', 'Party', 
                   'BarelyTrueCounts', 'FalseCounts', 'HalfTrueCounts', 'MostlyTrueCounts', 'PantsOnFireCounts', 'Context']

train_df.columns = ['ID', 'Label', 'Statement', 'Subject', 'Speaker', 'JobTitle', 'State', 'Party', 
                    'BarelyTrueCounts', 'FalseCounts', 'HalfTrueCounts', 'MostlyTrueCounts', 'PantsOnFireCounts', 'Context']

valid_df.columns = ['ID', 'Label', 'Statement', 'Subject', 'Speaker', 'JobTitle', 'State', 'Party', 
                    'BarelyTrueCounts', 'FalseCounts', 'HalfTrueCounts', 'MostlyTrueCounts', 'PantsOnFireCounts', 'Context']

In [11]:
print('Test dataset null values:\n',test_df.isna().sum())

Test dataset null values:
 ID                     0
Label                  0
Statement              0
Subject                0
Speaker                0
JobTitle             325
State                262
Party                  0
BarelyTrueCounts       0
FalseCounts            0
HalfTrueCounts         0
MostlyTrueCounts       0
PantsOnFireCounts      0
Context               17
dtype: int64


In [12]:
print('Train dataset null values:\n',train_df.isna().sum())

Train dataset null values:
 ID                      0
Label                   0
Statement               0
Subject                 2
Speaker                 2
JobTitle             2897
State                2208
Party                   2
BarelyTrueCounts        2
FalseCounts             2
HalfTrueCounts          2
MostlyTrueCounts        2
PantsOnFireCounts       2
Context               102
dtype: int64


In [13]:
print('Valid dataset null values:\n',valid_df.isna().sum())

Valid dataset null values:
 ID                     0
Label                  0
Statement              0
Subject                0
Speaker                0
JobTitle             345
State                279
Party                  0
BarelyTrueCounts       0
FalseCounts            0
HalfTrueCounts         0
MostlyTrueCounts       0
PantsOnFireCounts      0
Context               12
dtype: int64


As there as no null values in 'Statement' feature which is our main feature we will proceed with preprocessing

In [14]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-

True

In [15]:
def clean_text(text):
    text = text.lower()

    tokens = word_tokenize(text)
    
    tokens = [token for token in tokens if token not in string.punctuation]
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

In [16]:
train_df['CleanedStatement'] = train_df['Statement'].apply(clean_text)
test_df['CleanedStatement'] = test_df['Statement'].apply(clean_text)
valid_df['CleanedStatement'] = valid_df['Statement'].apply(clean_text)

In [17]:
print("Sample cleaned statement:", train_df['CleanedStatement'].iloc[0])

Sample cleaned statement: decline coal start started natural gas took started begin president george w. bush administration


Using Bag-of-Words different text vectorization techniques (CountVectorizer and TfidfVectorizer)

In [52]:
count_vectorizer = CountVectorizer()

train_cvectors = count_vectorizer.fit_transform(train_df['CleanedStatement'])
print(len(count_vectorizer.get_feature_names()))
test_cvectors = count_vectorizer.transform(test_df['CleanedStatement'])
valid_cvectors = count_vectorizer.transform(valid_df['CleanedStatement'])

10680


In [19]:
print("Shape of training count vectors:", train_cvectors.shape)
print("Shape of test count vectors:", test_cvectors.shape)
print("Shape of validation count vectors:", valid_cvectors.shape)

Shape of training count vectors: (10239, 10680)
Shape of test count vectors: (1266, 10680)
Shape of validation count vectors: (1283, 10680)


In [20]:
termfreq_vectorizer = TfidfVectorizer()

train_tvectors = termfreq_vectorizer.fit_transform(train_df['CleanedStatement'])
print(len(termfreq_vectorizer.get_feature_names()))
test_tvectors = termfreq_vectorizer.transform(test_df['CleanedStatement'])
valid_tvectors = termfreq_vectorizer.transform(valid_df['CleanedStatement'])

10680


In [21]:
print("Shape of training term frequency vectors:", train_tvectors.shape)
print("Shape of test term frequency vectors:", test_tvectors.shape)
print("Shape of validation term frequency vectors:", valid_tvectors.shape)

Shape of training term frequency vectors: (10239, 10680)
Shape of test term frequency vectors: (1266, 10680)
Shape of validation term frequency vectors: (1283, 10680)


Making a word2vec model and word embedding for this dataset

In [22]:
train_sentences = train_df['CleanedStatement'].apply(word_tokenize)

model = Word2Vec(train_sentences, vector_size=100, window=5, min_count=1, workers=4)

model_dir = os.path.join(cwd,'Model')
model.wv.save_word2vec_format(os.path.join(model_dir,'word2vec_liar_model.bin'), binary=True)

In [23]:
word_embeddings = KeyedVectors.load_word2vec_format(os.path.join(model_dir,'word2vec_liar_model.bin'), binary=True)

embedding_dim = word_embeddings.vector_size
vocab_size = len(word_embeddings.key_to_index)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
word_to_index = {}

for word, index in word_embeddings.key_to_index.items():
    embedding_vector = word_embeddings.get_vector(word)
    embedding_matrix[index] = embedding_vector
    word_to_index[word] = index

def text_to_indices(text):
    indices = []
    for word in text.split():
        if word in word_to_index:
            indices.append(word_to_index[word])
    return indices

In [24]:
train_indices = train_df['CleanedStatement'].apply(text_to_indices)
test_indices = test_df['CleanedStatement'].apply(text_to_indices)
valid_indices = valid_df['CleanedStatement'].apply(text_to_indices)

Creating new features for classification task

In [25]:
# Length of the statement
train_df['StatementLength'] = train_df['CleanedStatement'].apply(lambda x: len(x.split()))
test_df['StatementLength'] = test_df['CleanedStatement'].apply(lambda x: len(x.split()))
valid_df['StatementLength'] = valid_df['CleanedStatement'].apply(lambda x: len(x.split()))

In [26]:
# Presence of specific keywords or phrases
keywords = ['fake', 'hoax']
for keyword in keywords:
    train_df[keyword] = train_df['CleanedStatement'].str.contains(keyword, case=False).astype(int)
    test_df[keyword] = test_df['CleanedStatement'].str.contains(keyword, case=False).astype(int)
    valid_df[keyword] = valid_df['CleanedStatement'].str.contains(keyword, case=False).astype(int)

In [27]:
# Sentiment scores
train_df['SentimentScore'] = train_df['CleanedStatement'].apply(lambda x: TextBlob(x).sentiment.polarity)
test_df['SentimentScore'] = test_df['CleanedStatement'].apply(lambda x: TextBlob(x).sentiment.polarity)
valid_df['SentimentScore'] = valid_df['CleanedStatement'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [28]:
# Linguistic features
train_df['PosTags'] = train_df['CleanedStatement'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))
test_df['PosTags'] = test_df['CleanedStatement'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))
valid_df['PosTags'] = valid_df['CleanedStatement'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))

In [29]:
print(list(train_df.columns))

['ID', 'Label', 'Statement', 'Subject', 'Speaker', 'JobTitle', 'State', 'Party', 'BarelyTrueCounts', 'FalseCounts', 'HalfTrueCounts', 'MostlyTrueCounts', 'PantsOnFireCounts', 'Context', 'CleanedStatement', 'StatementLength', 'fake', 'hoax', 'SentimentScore', 'PosTags']


In [30]:
train_df.describe()

Unnamed: 0,BarelyTrueCounts,FalseCounts,HalfTrueCounts,MostlyTrueCounts,PantsOnFireCounts,StatementLength,fake,hoax,SentimentScore
count,10237.0,10237.0,10237.0,10237.0,10237.0,10239.0,10239.0,10239.0,10239.0
mean,11.534336,13.287682,17.135391,16.43587,6.202012,11.258912,0.0,0.000195,0.021412
std,18.974349,24.113808,35.847862,36.153089,16.129599,6.47954,0.0,0.013975,0.198455
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0
25%,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0
50%,2.0,2.0,3.0,3.0,1.0,10.0,0.0,0.0,0.0
75%,12.0,12.0,13.0,11.0,5.0,14.0,0.0,0.0,0.05
max,70.0,114.0,160.0,163.0,105.0,368.0,0.0,1.0,1.0


In [31]:
test_df.describe()

Unnamed: 0,BarelyTrueCounts,FalseCounts,HalfTrueCounts,MostlyTrueCounts,PantsOnFireCounts,StatementLength,fake,hoax,SentimentScore
count,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0
mean,11.755924,13.452607,17.547393,16.907583,6.007109,11.589258,0.00079,0.00079,0.015841
std,18.981072,23.961261,36.117022,36.513901,15.062162,11.201158,0.028105,0.028105,0.197356
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0
25%,0.0,0.0,1.0,1.0,0.0,8.0,0.0,0.0,0.0
50%,3.0,3.0,3.0,3.0,1.0,10.0,0.0,0.0,0.0
75%,12.0,16.75,15.0,14.0,6.0,14.0,0.0,0.0,0.05
max,70.0,114.0,160.0,163.0,105.0,344.0,1.0,1.0,0.8


In [32]:
valid_df.describe()

Unnamed: 0,BarelyTrueCounts,FalseCounts,HalfTrueCounts,MostlyTrueCounts,PantsOnFireCounts,StatementLength,fake,hoax,SentimentScore
count,1283.0,1283.0,1283.0,1283.0,1283.0,1283.0,1283.0,1283.0,1283.0
mean,11.812938,13.843336,17.237724,16.60873,6.886984,11.242401,0.000779,0.000779,0.021658
std,19.02673,24.553414,35.633825,35.977736,17.603286,4.504255,0.027918,0.027918,0.199413
min,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,-1.0
25%,0.0,0.5,0.5,0.0,0.0,8.0,0.0,0.0,0.0
50%,3.0,3.0,3.0,3.0,1.0,11.0,0.0,0.0,0.0
75%,12.0,17.0,13.0,12.0,5.0,14.0,0.0,0.0,0.066667
max,70.0,114.0,160.0,163.0,105.0,32.0,1.0,1.0,1.0


Implementing models

In [33]:
#Selected features to be fed to models
selected_features = ['StatementLength', 'SentimentScore', 'fake', 'hoax', 'PosTags']

In [34]:
y_train = train_df['Label']
y_test = test_df['Label']
y_valid = valid_df['Label']

In [37]:
X_train = train_df[selected_features]
X_test = test_df[selected_features]
X_valid = valid_df[selected_features]

In [None]:
print("Shape of training feature matrix:", X_train.shape)
print("Shape of testing feature matrix:", X_test.shape)
print("Shape of validation feature matrix:", X_valid.shape)

In [None]:
model = MultinomialNB()

model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
y_pred_valid = model.predict(X_valid)

In [None]:
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_valid = accuracy_score(y_valid, y_pred_valid)

print("Training Accuracy:", accuracy_train)
print("Testing Accuracy:", accuracy_test)
print("Validation Accuracy:", accuracy_valid)

In [53]:
'''train_c_cos_sim = cosine_similarity(train_cvectors, train_cvectors)
test_c_cos_sim = cosine_similarity(test_cvectors, test_cvectors)
valid_c_cos_sim = cosine_similarity(valid_cvectors, valid_cvectors)

train_c_cos_sim_df = pd.DataFrame(train_c_cos_sim, columns=['CountVectorCosineSimilarity'])
test_c_cos_sim_df = pd.DataFrame(test_c_cos_sim, columns=['CountVectorCosineSimilarity'])
valid_c_cos_sim_df = pd.DataFrame(valid_c_cos_sim, columns=['CountVectorCosineSimilarity'])

X_train = pd.concat([X_train.reset_index(drop=True), train_c_cos_sim_df], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), test_c_cos_sim_df], axis=1)
X_valid = pd.concat([X_valid.reset_index(drop=True), valid_c_cos_sim_df], axis=1)'''

ValueError: Shape of passed values is (10239, 10239), indices imply (10239, 1)

In [36]:
'''train_t_cos_sim = cosine_similarity(train_tvectors, train_tvectors)
test_t_cos_sim = cosine_similarity(test_tvectors, test_tvectors)
valid_t_cos_sim = cosine_similarity(valid_tvectors, valid_tvectors)

train_t_cos_sim_df = pd.DataFrame(train_t_cos_sim, columns=['TermFreqVectorCosineSimilarity'])
test_t_cos_sim_df = pd.DataFrame(test_t_cos_sim, columns=['TermFreqVectorCosineSimilarity'])
valid_t_cos_sim_df = pd.DataFrame(valid_t_cos_sim, columns=['TermFreqVectorCosineSimilarity'])

X_train = pd.concat([X_train, train_t_cos_sim_df], axis=1)
X_test = pd.concat([X_test, test_t_cos_sim_df], axis=1)
X_valid = pd.concat([X_valid, valid_t_cos_sim_df], axis=1)'''

ValueError: Shape of passed values is (10239, 10239), indices imply (10239, 1)