In [0]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics   
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import numpy as np
import re
import nltk


nltk.download('stopwords')
nltk.download('punkt')
!pip install jsonlines


import jsonlines

stop_words = set(stopwords.words('english'))
pst=PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting jsonlines
  Downloading https://files.pythonhosted.org/packages/4f/9a/ab96291470e305504aa4b7a2e0ec132e930da89eb3ca7a82fbe03167c131/jsonlines-1.2.0-py2.py3-none-any.whl
Installing collected packages: jsonlines
Successfully installed jsonlines-1.2.0


Preprocessing text

In [0]:
def text_clean(text):
  text=text.lower()
  texty=word_tokenize(text)
  texty=[word for word in texty if not word in stop_words]
  for i in range(len(texty)):
    texty[i]=pst.stem(texty[i])
  sen = (" ").join(texty)
  sen=re.sub('[^a-zA-Z]',' ', sen)
  sen = re.sub(r"\s+[a-zA-Z]\s+",' ', sen)
  sen = re.sub(r'\s+', ' ', sen)
  #stop_tokens=[word for words in ]
  return str(sen)

In [0]:
chstr=text_clean('A dark-haired drummer is playing his set with enthusiasm.')
print(chstr)
print(word_tokenize(chstr))

dark hair drummer play set enthusiasm 
['dark', 'hair', 'drummer', 'play', 'set', 'enthusiasm']


In [0]:
s1,s2,label=[],[],[]
no_entry=[]
Labels = ['contradiction','neutral','entailment','-']
def data_read(): 
  s1.clear()
  s2.clear()
  label.clear()
  no_entry.clear()
  str_dev='/content/drive/My Drive/Colab Notebooks/Datasets/proj3/snli_1.0_dev.jsonl'
  str_train='/content/drive/My Drive/Colab Notebooks/Datasets/proj3/snli_1.0_train.jsonl'
  co=0
  with jsonlines.open(str_train) as td:
    for line in td.iter():
      #print(line['sentence1']+'\n',line['sentence2']+'\n',line['gold_label'])
      co=co+1
      s1.append(text_clean(line['sentence1']))
      s2.append(text_clean(line['sentence2']))   
      if(line['gold_label']=='-'):
        no_entry.append(co)
      label.append(Labels.index(line['gold_label'].lower()))
      #label.append(line['gold_label'].lower())
data_read()
label=np.array(label)   

In [0]:
print("{} sentences do not belong to any class".format(len(no_entry)))

785 sentences do not belong to any class


In [0]:
xtext=[]
for i in range(len(s1)):
  fet=[]
  s1i,s2i=word_tokenize(s1[i]),word_tokenize(s2[i])
  for tokens in s1i:
    fet.append("s1_"+tokens)
  for tokens in s2i:
    fet.append("s2_"+tokens)
  xtext.append(" ".join(fet))
print(len(xtext))

550152


In [0]:
print(xtext[0])

s1_person s1_hors s1_jump s1_broken s1_airplan s2_person s2_train s2_hors s2_competit


TF-IDF usage

In [0]:
tfidf_vec=TfidfVectorizer(use_idf=True)
full_fit=tfidf_vec.fit(xtext)

In [0]:
feat1=full_fit.transform(xtext)

In [0]:
pickle.dump(full_fit, open("tfidf.pickle", "wb"))
feat1.shape

(550152, 31370)

Training data

In [0]:
model = LogisticRegression(verbose=1, solver='newton-cg',random_state=0, C=5, penalty='l2',max_iter=1000)
model.fit(feat1, label)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.7min finished


LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='newton-cg', tol=0.0001, verbose=1,
                   warm_start=False)

In [0]:
filename = 'logistic.sav'
pickle.dump(model, open(filename, 'wb'))

In [0]:
preds = model.predict(feat1)
print("Train accuracy : {}".format(model.score(feat1,label)))

In [0]:
confusion_matrix = confusion_matrix(label, preds)
print(confusion_matrix)

[[115406  29651  38130      0]
 [ 31994 112692  38078      0]
 [ 26402  23628 133386      0]
 [   197    261    327      0]]


In [0]:
print(classification_report(label, preds))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.66      0.63      0.65    183187
           1       0.68      0.62      0.65    182764
           2       0.64      0.73      0.68    183416
           3       0.00      0.00      0.00       785

    accuracy                           0.66    550152
   macro avg       0.49      0.49      0.49    550152
weighted avg       0.66      0.66      0.66    550152



Loading saved model and Testing

In [0]:
filename='logistic.sav'
sav_model=pickle.load(open(filename,'rb'))

In [0]:
Labels = ['contradiction','neutral','entailment','-']
str_test='/content/drive/My Drive/Colab Notebooks/Datasets/proj3/snli_1.0_test.jsonl'
s1_test,s2_test,label_test=[],[],[]
with jsonlines.open(str_test) as td:
    for line in td.iter():
      #print(line['sentence1']+'\n',line['sentence2']+'\n',line['gold_label'])
      s1_test.append(text_clean(line['sentence1']))
      s2_test.append(text_clean(line['sentence2']))
      label_test.append(Labels.index(line['gold_label'].lower()))
      #label.append(line['gold_label'].lower())
label_test=np.array(label_test)   

In [0]:
xtext_test=[]
for i in range(len(s1_test)):
  fet=[]
  s1i,s2i=word_tokenize(s1_test[i]),word_tokenize(s2_test[i])
  for tokens in s1i:
    fet.append("s1_"+tokens)
  for tokens in s2i:
    fet.append("s2_"+tokens)
  xtext_test.append(" ".join(fet))
print(len(xtext_test))

10000


In [0]:
tfidf_mod=pickle.load(open('tfidf.pickle','rb'))
feat_test=tfidf_mod.transform(xtext_test)
print((feat_test.shape))

(10000, 31370)


In [0]:
pred_test = sav_model.predict(feat_test)
print("Test accuracy : {}".format(sav_model.score(feat_test,label_test)))

Test accuracy : 0.6317


In [0]:
print(len(pred_test))

10000


In [0]:
fp=open("tfidf.txt","w")
for out in pred_test:
  fp.write(Labels[out]+"\n")

In [0]:
print(classification_report(label_test, pred_test))

              precision    recall  f1-score   support

           0       0.62      0.60      0.61      3237
           1       0.65      0.61      0.63      3219
           2       0.62      0.72      0.67      3368
           3       0.00      0.00      0.00       176

    accuracy                           0.63     10000
   macro avg       0.47      0.48      0.48     10000
weighted avg       0.62      0.63      0.63     10000



  _warn_prf(average, modifier, msg_start, len(result))
