In [1]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import re
import glob
import gensim
from gensim.models import Word2Vec
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

import sklearn
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sentence_transformers import SentenceTransformer, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

import warnings
from wordcloud import STOPWORDS, WordCloud
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Data Extraction and Preprocessing**

In [4]:
!unzip '/content/drive/MyDrive/Datasets Sem-6/IR Datasets/Project/liar_dataset.zip'

Archive:  /content/drive/MyDrive/Datasets Sem-6/IR Datasets/Project/liar_dataset.zip
  inflating: README                  
  inflating: test.tsv                
  inflating: train.tsv               
  inflating: valid.tsv               


In [5]:
train_df = pd.read_csv('/content/train.tsv',sep='\t', header = None)
test_df = pd.read_csv('/content/test.tsv',sep='\t', header = None)
val_df = pd.read_csv('/content/valid.tsv',sep='\t', header = None)

In [6]:
train_df = train_df.drop([0, 8, 9, 10, 11, 12], axis = 1)
test_df = test_df.drop([0, 8, 9, 10, 11, 12], axis = 1)
val_df = val_df.drop([0, 8, 9, 10, 11, 12], axis = 1)

In [7]:
train_df.columns = ['label', 'statement', 'subject', 'speaker', 'speaker job title', 'state info', 'party affilation', 'location of statement']
test_df.columns = ['label', 'statement', 'subject', 'speaker', 'speaker job title', 'state info', 'party affilation', 'location of statement']
val_df.columns = ['label', 'statement', 'subject', 'speaker', 'speaker job title', 'state info', 'party affilation', 'location of statement']

In [8]:
train_df = train_df.dropna()
train_df = train_df.reset_index(drop=True)
test_df = test_df.dropna()
test_df = test_df.reset_index(drop=True)
val_df = val_df.dropna()
val_df = val_df.reset_index(drop=True)

In [9]:
train_df = pd.concat([train_df, val_df])
train_df = train_df.reset_index()

In [10]:
print('Training Data Dimensions -', train_df.shape)
print('Testing Data Dimensions -', test_df.shape)

Training Data Dimensions - (7585, 9)
Testing Data Dimensions - (853, 8)


In [11]:
labels_dict = {'mostly-true':4,'barely-true':2,'half-true':3,'false':1, 'true':5,'pants-fire':0}
train_df['label'] = train_df['label'].apply(lambda x: labels_dict[x])
test_df['label'] = test_df['label'].apply(lambda x: labels_dict[x])

**Model Training and Evaluations**

In [12]:
def preprocess(text):
  text = text.lower() # lower - casing the text
  text = re.sub('<[^>]*>', ' ', text)
  text = re.sub('[\W]+', ' ', text)
  tokenizer = TreebankWordTokenizer()
  words = tokenizer.tokenize(text)
  text = ' '.join(words)
  stop_words = set(stopwords.words('english'))
  word_tokens = word_tokenize(text)
  filtered_sentence = [w for w in word_tokens if not w in stop_words] # removal of stopwords
  text = ' '.join(filtered_sentence)
  return text

**TFIDF Vectorizer using Pos-Taggings of the given text**

In [13]:
def get_XY(data):
  y = data['label'].values
  X = data['statement'].values
  for i in range(len(X)):
    X[i] = preprocess(X[i])
  return X,y

In [14]:
X_train, y_train = get_XY(train_df)
X_test, y_test = get_XY(test_df)

In [15]:
def getPosTags(X):
  for i in range(len(X)):
    text = X[i]
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    pos_taggings = []
    for word, pos_tag in pos_tags:
      pos_taggings.append(pos_tag)
    text = ' '.join(pos_taggings)
    X[i] = text
  return X

In [16]:
X_train = getPosTags(X_train)
X_test = getPosTags(X_test)

In [17]:
model1 = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,6))),
    ('norm', Binarizer()),
    ('clf', LogisticRegression(solver = 'liblinear')),
])

In [18]:
model1.fit(X_train, y_train)
test_preds = model1.predict(X_test)

In [19]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.21      0.12      0.15        43
           1       0.27      0.34      0.30       167
           2       0.18      0.14      0.16       139
           3       0.23      0.24      0.24       182
           4       0.20      0.21      0.21       173
           5       0.20      0.19      0.20       149

    accuracy                           0.22       853
   macro avg       0.22      0.21      0.21       853
weighted avg       0.22      0.22      0.22       853



In [28]:
model2 = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,6))),
    ('norm', Binarizer()),
    ('clf', SVC()),
])

In [29]:
model2.fit(X_train, y_train)
test_preds = model2.predict(X_test)

In [30]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        43
           1       0.27      0.53      0.36       167
           2       0.19      0.02      0.04       139
           3       0.27      0.33      0.30       182
           4       0.21      0.28      0.24       173
           5       0.21      0.09      0.13       149

    accuracy                           0.25       853
   macro avg       0.19      0.21      0.18       853
weighted avg       0.22      0.25      0.21       853



In [34]:
model3 = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,6))),
    ('norm', Binarizer()),
    ('clf', DecisionTreeClassifier()),
])

In [35]:
model3.fit(X_train, y_train)
test_preds = model3.predict(X_test)

In [36]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.11      0.19      0.14        43
           1       0.21      0.23      0.22       167
           2       0.12      0.10      0.11       139
           3       0.23      0.19      0.21       182
           4       0.22      0.24      0.23       173
           5       0.18      0.18      0.18       149

    accuracy                           0.19       853
   macro avg       0.18      0.19      0.18       853
weighted avg       0.19      0.19      0.19       853



In [37]:
model4 = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,6))),
    ('norm', Binarizer()),
    ('clf', MLPClassifier()),
])

In [38]:
model4.fit(X_train, y_train)
test_preds = model4.predict(X_test)

In [39]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.08      0.09      0.09        43
           1       0.23      0.28      0.25       167
           2       0.18      0.16      0.17       139
           3       0.24      0.26      0.25       182
           4       0.20      0.18      0.19       173
           5       0.20      0.18      0.19       149

    accuracy                           0.21       853
   macro avg       0.19      0.19      0.19       853
weighted avg       0.21      0.21      0.21       853



**Using Contextual Word Embeddings**

In [40]:
cont_model2 = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [41]:
def preprocess(text):
  text = text.lower() # lower - casing the text
  text = re.sub('<[^>]*>', ' ', text)
  text = re.sub('[\W]+', ' ', text)
  tokenizer = TreebankWordTokenizer()
  words = tokenizer.tokenize(text)
  text = ' '.join(words)
  return text

def getFeatures(df, model):
  X = list(df['statement'].values)
  for i in range(len(X)):
    text = X[i]
    text = preprocess(text)
    embeddings = model.encode(text)
    X[i] = embeddings
  return X

In [43]:
X_train = getFeatures(train_df, cont_model2)
y_train = list(train_df['label'].values)
X_test = getFeatures(test_df, cont_model2)
y_test = list(test_df['label'].values)

In [44]:
model1 = MLPClassifier(activation = 'tanh')
model1.fit(X_train, y_train)
test_preds = model1.predict(X_test)

In [45]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        43
           1       0.27      0.30      0.28       167
           2       0.18      0.10      0.13       139
           3       0.23      0.38      0.29       182
           4       0.24      0.22      0.23       173
           5       0.25      0.21      0.23       149

    accuracy                           0.24       853
   macro avg       0.19      0.20      0.19       853
weighted avg       0.22      0.24      0.22       853



In [46]:
model2 = LogisticRegression(solver = 'lbfgs')
model2.fit(X_train, y_train)
test_preds = model2.predict(X_test)

In [47]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        43
           1       0.28      0.46      0.34       167
           2       0.15      0.01      0.03       139
           3       0.25      0.41      0.31       182
           4       0.26      0.37      0.31       173
           5       0.30      0.05      0.09       149

    accuracy                           0.26       853
   macro avg       0.21      0.22      0.18       853
weighted avg       0.24      0.26      0.22       853



**Using Non-Contextual Word Embeddings**

In [48]:
glove_path = '/content/drive/MyDrive/glove.6B.100d.txt'
embeddings = {}
file1 = open(glove_path)
for i in file1:
  line = i.split()
  vector = np.asarray(line[1:], dtype = 'float32')
  embeddings[line[0]] = vector
file1.close()

In [49]:
def preprocess(text):
  text = text.lower() # lower - casing the text
  text = re.sub('<[^>]*>', ' ', text)
  text = re.sub('[\W]+', ' ', text)
  tokenizer = TreebankWordTokenizer()
  words = tokenizer.tokenize(text)
  text = ' '.join(words)
  ps = PorterStemmer()
  lemmatizer = WordNetLemmatizer()
  stemming = []
  lemmit = []
  for word in text.split():
    stemming.append(ps.stem(word))
  for word in stemming:
    lemmit.append(lemmatizer.lemmatize(word))
  text = ' '.join(lemmit)
  return text

def non_cont_embeddings(data_frame, non_cont_model):
  embeddings = []
  for i in range(len(data_frame)):
    text = data_frame['statement'][i]
    text = preprocess(text)
    tokens = word_tokenize(text)
    embedding = np.zeros((len(non_cont_model['and'])))
    cnt = 1
    for j in tokens:
      try:
        embedding += non_cont_model[j]
        cnt += 1
      except:
        pass
    embedding = embedding/cnt
    embeddings.append(embedding)
  
  return embeddings

In [50]:
X_train = non_cont_embeddings(train_df, embeddings)
y_train = list(train_df['label'].values)
X_test = non_cont_embeddings(test_df, embeddings)
y_test = list(test_df['label'].values)

In [51]:
model4 = MLPClassifier(activation = 'relu')
model4.fit(X_train, y_train)
test_preds = model4.predict(X_test)

In [52]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        43
           1       0.22      0.27      0.24       167
           2       0.25      0.04      0.07       139
           3       0.24      0.38      0.30       182
           4       0.23      0.40      0.29       173
           5       0.24      0.05      0.09       149

    accuracy                           0.23       853
   macro avg       0.20      0.19      0.17       853
weighted avg       0.23      0.23      0.20       853

