# **Question Answer Classification**

Please run on Google colab and upload the training and testing files.

In [1]:
## Download model and dataset
import nltk
!python -m spacy download en_core_web_lg
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
## Import Libraries

import pandas as pd
import numpy as np
import spacy
import nltk
import en_core_web_lg
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.decomposition import PCA
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix,classification_report

stop_words=stopwords.words('english')
nlp=en_core_web_lg.load()

### Read Datasets

In [3]:
## Read the dataset
train_df=pd.read_csv("p2_train.csv")
test_df=pd.read_csv("p2_test.csv")
train_size=len(train_df)
## Concatenate the train and test dataset to preprocess and generate features
df=pd.concat([train_df,test_df],axis=0)
df=df[['precedent','subsequent','question','response','type']]
df.head()

Unnamed: 0,precedent,subsequent,question,response,type
0,The most dangerous <&quot;>neighborhood<&quot;...,At least 1/2 of these deaths are gang related....,Should all the blacks in the state move out?,I can't make decisions for other people. But I...,answered
1,Riiiiiiiight.,&gt;To claim that hip-hop only spreads a deli...,"So, correct me if I'm wrong, but what you're s...",\nGive me an example of another depiction.,attacked
2,Time? Money?,At one point you have to prioritize.,What about the all country?,And how is a critic going to help you do that ...,attacked
3,"&gt; And there is no connotation, look up sta...",By not including any other group that is stati...,"And men commit more crimes than women, ages 20...","It's like that, but citizens have an entitleme...",answered
4,What the fuck does that even mean?,Can you demonstrate where Quinn slept with so...,How is that Quinn's fault?,It's not Quinn's fault. But for defending Quin...,answered


## Preprocessing Features

In [4]:
## Preprocessing the questions and response
## Removes special symbols and lower cases 

def preprocessing(df,col):
 
  df[col]=df[col].replace('<&quot;>',"",regex=True) 
  df[col]=df[col].str.lower()
  df[col]=df[col].replace('\n','', regex=True)
  df[col]=df[col].str.strip()
  df[col]=df[col].str.replace('[^\w\s\?]','')
  df[col]=df[col].str.replace('$','')
  df[col]=df[col].str.replace('.','')
  return df

df=preprocessing(df,'question')
df=preprocessing(df,'response')
df.head()

Unnamed: 0,precedent,subsequent,question,response,type
0,The most dangerous <&quot;>neighborhood<&quot;...,At least 1/2 of these deaths are gang related....,should all the blacks in the state move out?,i cant make decisions for other people but i c...,answered
1,Riiiiiiiight.,&gt;To claim that hip-hop only spreads a deli...,so correct me if im wrong but what youre sayin...,give me an example of another depiction,attacked
2,Time? Money?,At one point you have to prioritize.,what about the all country?,and how is a critic going to help you do that ...,attacked
3,"&gt; And there is no connotation, look up sta...",By not including any other group that is stati...,and men commit more crimes than women ages 20 ...,its like that but citizens have an entitlement...,answered
4,What the fuck does that even mean?,Can you demonstrate where Quinn slept with so...,how is that quinns fault?,its not quinns fault but for defending quinn i...,answered


## Baseline Features

In [5]:
## Sentence Vector Representation 
'''
question_vector_rep - vector representation of question
response_vector_rep - vector representation of response
vector - vector represenation of both question and response
'''

def vector_representation(sentence):
    doc=nlp(sentence)
    vector=doc.vector
    return vector

def get_vectors(df,col):
    df[col+'_vector_rep']=df[col].apply(lambda x: vector_representation(x))
    return df

## Get vector represenation of question and response
vectors_df=get_vectors(df,'question')
vectors_df=get_vectors(df,'response')

## Concatenate the question and response vectors
vectors=[]
question_vectors=df['question_vector_rep'].tolist()
response_vectors=df['response_vector_rep'].tolist()

for i in range(len(question_vectors)):
  vector=np.concatenate((question_vectors[i],response_vectors[i]),axis=None)
  vectors.append(vector)


vectors_df['vector']=vectors

## Convert categorical labels into Integer representation
## {0: answered, 1: attacked ,2: irrelevant, 3: agreed}
vectors_df.type = pd.Categorical(pd.factorize(vectors_df.type)[0])

## The embeddings are in 600 dimensional after concatenating. We can reduce this high dimensions 
## into lower dimension by dimensionality reduction methods. We ensure that it captures 75-90% variance in the data

pca=PCA(n_components=60)
pca_res=pca.fit_transform(vectors)
l=[]
for i in pca_res:
  l.append(np.array(i)) 
vectors_df['vector']=l
vectors_df.head()

Unnamed: 0,precedent,subsequent,question,response,type,question_vector_rep,response_vector_rep,vector
0,The most dangerous <&quot;>neighborhood<&quot;...,At least 1/2 of these deaths are gang related....,should all the blacks in the state move out?,i cant make decisions for other people but i c...,0,"[0.04330626, 0.045562994, -0.016042003, -0.148...","[-0.083568744, 0.15506375, -0.23541515, -0.054...","[-0.5864192303622856, -0.022218359114974773, 0..."
1,Riiiiiiiight.,&gt;To claim that hip-hop only spreads a deli...,so correct me if im wrong but what youre sayin...,give me an example of another depiction,1,"[-0.047989298, 0.14819294, -0.2549167, -0.1186...","[-0.15461029, 0.15779972, -0.18892084, 0.15702...","[-0.08770394395260009, 0.13676199650971663, -0..."
2,Time? Money?,At one point you have to prioritize.,what about the all country?,and how is a critic going to help you do that ...,1,"[-0.0005612299, 0.28547665, -0.08796167, -0.17...","[-0.045013502, 0.1485028, -0.25629047, -0.0049...","[0.09976164830915897, -0.17617712717874473, 0...."
3,"&gt; And there is no connotation, look up sta...",By not including any other group that is stati...,and men commit more crimes than women ages 20 ...,its like that but citizens have an entitlement...,0,"[-0.12268909, 0.15060432, -0.08220947, -0.1092...","[-0.015303677, 0.07679086, -0.13528888, -0.083...","[-0.6706652955257439, 0.13793602850620543, -0...."
4,What the fuck does that even mean?,Can you demonstrate where Quinn slept with so...,how is that quinns fault?,its not quinns fault but for defending quinn i...,0,"[-0.090715826, 0.17173333, -0.03452379, -0.105...","[-0.15782003, 0.11515223, -0.16754769, 0.00083...","[0.4266863824720189, 0.4956174718237338, 0.504..."


In [6]:
## Modeling with only embeddings feature
train_X,train_Y = vectors_df['vector'].iloc[:train_size],vectors_df['type'].iloc[:train_size]
test_X,test_Y = vectors_df['vector'].iloc[train_size:],vectors_df['type'].iloc[train_size:]

model_SVC = LinearSVC(class_weight='balanced')
model_LR=LogisticRegression()
model_NB = GaussianNB()

model_NB.fit(train_X.tolist(), train_Y)
model_SVC.fit(train_X.tolist(), train_Y)
model_LR.fit(train_X.tolist(),train_Y)

predicted_labels_SVC = model_SVC.predict(test_X.tolist())
predicted_labels_NB = model_NB.predict(test_X.tolist())
predicted_labels_LR = model_LR.predict(test_X.tolist())

print ("SVC Report")
print (confusion_matrix(test_Y,predicted_labels_SVC))
print (classification_report(test_Y,predicted_labels_SVC))

print ("Naive Bayes")
print (confusion_matrix(test_Y,predicted_labels_NB))
print (classification_report(test_Y,predicted_labels_NB))

print ("LR Report")
print (confusion_matrix(test_Y,predicted_labels_LR))
print (classification_report(test_Y,predicted_labels_LR))

SVC Report
[[246  26  28  20]
 [ 12  16   8   3]
 [ 14  10  12   2]
 [  7   0   0   6]]
              precision    recall  f1-score   support

           0       0.88      0.77      0.82       320
           1       0.31      0.41      0.35        39
           2       0.25      0.32      0.28        38
           3       0.19      0.46      0.27        13

    accuracy                           0.68       410
   macro avg       0.41      0.49      0.43       410
weighted avg       0.75      0.68      0.71       410

Naive Bayes
[[180  95  41   4]
 [ 17  16   4   2]
 [ 17   9  11   1]
 [  5   3   4   1]]
              precision    recall  f1-score   support

           0       0.82      0.56      0.67       320
           1       0.13      0.41      0.20        39
           2       0.18      0.29      0.22        38
           3       0.12      0.08      0.10        13

    accuracy                           0.51       410
   macro avg       0.32      0.33      0.30       410
weighted

In [7]:
## POS Tagging Features
## Generate pos tags for both questions and responses
'''
question_pos_rep - Pos tag representations of questions
response_pos_rep - Pos tag representations of responses
pos-tags - Concatenation of both vectors
'''

def pos_tags_representation(sentence):
  if len(sentence)!=0:
    
    tagged_tokens=nltk.pos_tag(sentence.split())
    words,tags=zip(*tagged_tokens)
    return list(tags)
  else:
    return []

def get_pos_tags(df,col):
  df[col+'_postag_rep']=df[col].apply(lambda x: pos_tags_representation(x))
  return df


# Concatenate the pos tags of both question and respone sentences
def concatenate_tags():
  tags_df=get_pos_tags(vectors_df,'question')
  tags_df=get_pos_tags(vectors_df,'response')
  tags_df['pos-tags']=tags_df['question_postag_rep']+tags_df['response_postag_rep']
  return tags_df

tags_df=concatenate_tags()
tags_df.head()





Unnamed: 0,precedent,subsequent,question,response,type,question_vector_rep,response_vector_rep,vector,question_postag_rep,response_postag_rep,pos-tags
0,The most dangerous <&quot;>neighborhood<&quot;...,At least 1/2 of these deaths are gang related....,should all the blacks in the state move out?,i cant make decisions for other people but i c...,0,"[0.04330626, 0.045562994, -0.016042003, -0.148...","[-0.083568744, 0.15506375, -0.23541515, -0.054...","[-0.5864192303622856, -0.022218359114974773, 0...","[MD, PDT, DT, NNS, IN, DT, NN, NN, NN]","[NN, VBP, NN, NNS, IN, JJ, NNS, CC, NN, MD, VB...","[MD, PDT, DT, NNS, IN, DT, NN, NN, NN, NN, VBP..."
1,Riiiiiiiight.,&gt;To claim that hip-hop only spreads a deli...,so correct me if im wrong but what youre sayin...,give me an example of another depiction,1,"[-0.047989298, 0.14819294, -0.2549167, -0.1186...","[-0.15461029, 0.15779972, -0.18892084, 0.15702...","[-0.08770394395260009, 0.13676199650971663, -0...","[RB, JJ, PRP, IN, VBN, RB, CC, WP, NN, VBG, VB...","[VB, PRP, DT, NN, IN, DT, NN]","[RB, JJ, PRP, IN, VBN, RB, CC, WP, NN, VBG, VB..."
2,Time? Money?,At one point you have to prioritize.,what about the all country?,and how is a critic going to help you do that ...,1,"[-0.0005612299, 0.28547665, -0.08796167, -0.17...","[-0.045013502, 0.1485028, -0.25629047, -0.0049...","[0.09976164830915897, -0.17617712717874473, 0....","[WP, IN, DT, DT, NNS]","[CC, WRB, VBZ, DT, JJ, VBG, TO, VB, PRP, VB, D...","[WP, IN, DT, DT, NNS, CC, WRB, VBZ, DT, JJ, VB..."
3,"&gt; And there is no connotation, look up sta...",By not including any other group that is stati...,and men commit more crimes than women ages 20 ...,its like that but citizens have an entitlement...,0,"[-0.12268909, 0.15060432, -0.08220947, -0.1092...","[-0.015303677, 0.07679086, -0.13528888, -0.083...","[-0.6706652955257439, 0.13793602850620543, -0....","[CC, NNS, VBP, JJR, NNS, IN, NNS, VBZ, CD, TO,...","[PRP$, IN, DT, CC, NNS, VBP, DT, NN, IN, NN, I...","[CC, NNS, VBP, JJR, NNS, IN, NNS, VBZ, CD, TO,..."
4,What the fuck does that even mean?,Can you demonstrate where Quinn slept with so...,how is that quinns fault?,its not quinns fault but for defending quinn i...,0,"[-0.090715826, 0.17173333, -0.03452379, -0.105...","[-0.15782003, 0.11515223, -0.16754769, 0.00083...","[0.4266863824720189, 0.4956174718237338, 0.504...","[WRB, VBZ, IN, NN, NN]","[PRP$, RB, JJ, NN, CC, IN, VBG, NN, IN, NNS, W...","[WRB, VBZ, IN, NN, NN, PRP$, RB, JJ, NN, CC, I..."


In [8]:
## Create count vector representation for pos tags

## Create a mapping for the pos_tags so that number of diffent type of pos tags are represented by count vectorizer
all_tags=[]
for i in range(len(tags_df)):  
  all_tags+=(tags_df['pos-tags'].iloc[i])
tags_list=list(set(all_tags))
mapping={}
for i in range(len(tags_list)):
  mapping[tags_list[i]]=i

def pos_vector_mapping(pos_list):
  one_hot=np.zeros(len(mapping))
  for i in pos_list:
    one_hot[mapping[i]]+=1

  return one_hot  

tags_df['pos_tag_vector']=df['pos-tags'].apply(lambda x: pos_vector_mapping(x))
tags_df.head()


Unnamed: 0,precedent,subsequent,question,response,type,question_vector_rep,response_vector_rep,vector,question_postag_rep,response_postag_rep,pos-tags,pos_tag_vector
0,The most dangerous <&quot;>neighborhood<&quot;...,At least 1/2 of these deaths are gang related....,should all the blacks in the state move out?,i cant make decisions for other people but i c...,0,"[0.04330626, 0.045562994, -0.016042003, -0.148...","[-0.083568744, 0.15506375, -0.23541515, -0.054...","[-0.5864192303622856, -0.022218359114974773, 0...","[MD, PDT, DT, NNS, IN, DT, NN, NN, NN]","[NN, VBP, NN, NNS, IN, JJ, NNS, CC, NN, MD, VB...","[MD, PDT, DT, NNS, IN, DT, NN, NN, NN, NN, VBP...","[5.0, 1.0, 1.0, 6.0, 1.0, 12.0, 0.0, 0.0, 6.0,..."
1,Riiiiiiiight.,&gt;To claim that hip-hop only spreads a deli...,so correct me if im wrong but what youre sayin...,give me an example of another depiction,1,"[-0.047989298, 0.14819294, -0.2549167, -0.1186...","[-0.15461029, 0.15779972, -0.18892084, 0.15702...","[-0.08770394395260009, 0.13676199650971663, -0...","[RB, JJ, PRP, IN, VBN, RB, CC, WP, NN, VBG, VB...","[VB, PRP, DT, NN, IN, DT, NN]","[RB, JJ, PRP, IN, VBN, RB, CC, WP, NN, VBG, VB...","[1.0, 0.0, 0.0, 5.0, 2.0, 6.0, 0.0, 0.0, 1.0, ..."
2,Time? Money?,At one point you have to prioritize.,what about the all country?,and how is a critic going to help you do that ...,1,"[-0.0005612299, 0.28547665, -0.08796167, -0.17...","[-0.045013502, 0.1485028, -0.25629047, -0.0049...","[0.09976164830915897, -0.17617712717874473, 0....","[WP, IN, DT, DT, NNS]","[CC, WRB, VBZ, DT, JJ, VBG, TO, VB, PRP, VB, D...","[WP, IN, DT, DT, NNS, CC, WRB, VBZ, DT, JJ, VB...","[1.0, 1.0, 1.0, 6.0, 1.0, 2.0, 0.0, 0.0, 2.0, ..."
3,"&gt; And there is no connotation, look up sta...",By not including any other group that is stati...,and men commit more crimes than women ages 20 ...,its like that but citizens have an entitlement...,0,"[-0.12268909, 0.15060432, -0.08220947, -0.1092...","[-0.015303677, 0.07679086, -0.13528888, -0.083...","[-0.6706652955257439, 0.13793602850620543, -0....","[CC, NNS, VBP, JJR, NNS, IN, NNS, VBZ, CD, TO,...","[PRP$, IN, DT, CC, NNS, VBP, DT, NN, IN, NN, I...","[CC, NNS, VBP, JJR, NNS, IN, NNS, VBZ, CD, TO,...","[5.0, 3.0, 3.0, 14.0, 7.0, 15.0, 0.0, 0.0, 5.0..."
4,What the fuck does that even mean?,Can you demonstrate where Quinn slept with so...,how is that quinns fault?,its not quinns fault but for defending quinn i...,0,"[-0.090715826, 0.17173333, -0.03452379, -0.105...","[-0.15782003, 0.11515223, -0.16754769, 0.00083...","[0.4266863824720189, 0.4956174718237338, 0.504...","[WRB, VBZ, IN, NN, NN]","[PRP$, RB, JJ, NN, CC, IN, VBG, NN, IN, NNS, W...","[WRB, VBZ, IN, NN, NN, PRP$, RB, JJ, NN, CC, I...","[2.0, 2.0, 1.0, 0.0, 1.0, 6.0, 0.0, 0.0, 0.0, ..."


## Baseline Model with embeddings and pos tags

In [9]:
## Combine both embeddings and pos tags for baseline models
word_pos=[]
for i in range(len(tags_df)):
  word_pos.append(list(tags_df['vector'].iloc[i])+list(tags_df['pos_tag_vector'].iloc[i]))

tags_df['word_pos']=word_pos
tags_df.head()

Unnamed: 0,precedent,subsequent,question,response,type,question_vector_rep,response_vector_rep,vector,question_postag_rep,response_postag_rep,pos-tags,pos_tag_vector,word_pos
0,The most dangerous <&quot;>neighborhood<&quot;...,At least 1/2 of these deaths are gang related....,should all the blacks in the state move out?,i cant make decisions for other people but i c...,0,"[0.04330626, 0.045562994, -0.016042003, -0.148...","[-0.083568744, 0.15506375, -0.23541515, -0.054...","[-0.5864192303622856, -0.022218359114974773, 0...","[MD, PDT, DT, NNS, IN, DT, NN, NN, NN]","[NN, VBP, NN, NNS, IN, JJ, NNS, CC, NN, MD, VB...","[MD, PDT, DT, NNS, IN, DT, NN, NN, NN, NN, VBP...","[5.0, 1.0, 1.0, 6.0, 1.0, 12.0, 0.0, 0.0, 6.0,...","[-0.5864192303622856, -0.022218359114974773, 0..."
1,Riiiiiiiight.,&gt;To claim that hip-hop only spreads a deli...,so correct me if im wrong but what youre sayin...,give me an example of another depiction,1,"[-0.047989298, 0.14819294, -0.2549167, -0.1186...","[-0.15461029, 0.15779972, -0.18892084, 0.15702...","[-0.08770394395260009, 0.13676199650971663, -0...","[RB, JJ, PRP, IN, VBN, RB, CC, WP, NN, VBG, VB...","[VB, PRP, DT, NN, IN, DT, NN]","[RB, JJ, PRP, IN, VBN, RB, CC, WP, NN, VBG, VB...","[1.0, 0.0, 0.0, 5.0, 2.0, 6.0, 0.0, 0.0, 1.0, ...","[-0.08770394395260009, 0.13676199650971663, -0..."
2,Time? Money?,At one point you have to prioritize.,what about the all country?,and how is a critic going to help you do that ...,1,"[-0.0005612299, 0.28547665, -0.08796167, -0.17...","[-0.045013502, 0.1485028, -0.25629047, -0.0049...","[0.09976164830915897, -0.17617712717874473, 0....","[WP, IN, DT, DT, NNS]","[CC, WRB, VBZ, DT, JJ, VBG, TO, VB, PRP, VB, D...","[WP, IN, DT, DT, NNS, CC, WRB, VBZ, DT, JJ, VB...","[1.0, 1.0, 1.0, 6.0, 1.0, 2.0, 0.0, 0.0, 2.0, ...","[0.09976164830915897, -0.17617712717874473, 0...."
3,"&gt; And there is no connotation, look up sta...",By not including any other group that is stati...,and men commit more crimes than women ages 20 ...,its like that but citizens have an entitlement...,0,"[-0.12268909, 0.15060432, -0.08220947, -0.1092...","[-0.015303677, 0.07679086, -0.13528888, -0.083...","[-0.6706652955257439, 0.13793602850620543, -0....","[CC, NNS, VBP, JJR, NNS, IN, NNS, VBZ, CD, TO,...","[PRP$, IN, DT, CC, NNS, VBP, DT, NN, IN, NN, I...","[CC, NNS, VBP, JJR, NNS, IN, NNS, VBZ, CD, TO,...","[5.0, 3.0, 3.0, 14.0, 7.0, 15.0, 0.0, 0.0, 5.0...","[-0.6706652955257439, 0.13793602850620543, -0...."
4,What the fuck does that even mean?,Can you demonstrate where Quinn slept with so...,how is that quinns fault?,its not quinns fault but for defending quinn i...,0,"[-0.090715826, 0.17173333, -0.03452379, -0.105...","[-0.15782003, 0.11515223, -0.16754769, 0.00083...","[0.4266863824720189, 0.4956174718237338, 0.504...","[WRB, VBZ, IN, NN, NN]","[PRP$, RB, JJ, NN, CC, IN, VBG, NN, IN, NNS, W...","[WRB, VBZ, IN, NN, NN, PRP$, RB, JJ, NN, CC, I...","[2.0, 2.0, 1.0, 0.0, 1.0, 6.0, 0.0, 0.0, 0.0, ...","[0.4266863824720189, 0.4956174718237338, 0.504..."


In [10]:


## Modeling with only embeddings feature
train_X,train_Y = tags_df['word_pos'].iloc[:train_size],tags_df['type'].iloc[:train_size]
test_X,test_Y = tags_df['word_pos'].iloc[train_size:],tags_df['type'].iloc[train_size:]

model_SVC = LinearSVC(random_state=2,class_weight='balanced')
model_LR = LogisticRegression(max_iter=1000)
model_KNN = KNeighborsClassifier()

model_LR.fit(train_X.tolist(), train_Y)
model_SVC.fit(train_X.tolist(), train_Y)
model_KNN.fit(train_X.tolist(), train_Y)

predicted_labels_SVC = model_SVC.predict(test_X.tolist())
predicted_labels_LR = model_LR.predict(test_X.tolist())
predicted_labels_KNN = model_KNN.predict(test_X.tolist())

print ("SVC Report")
print (confusion_matrix(test_Y,predicted_labels_SVC))
print (classification_report(test_Y,predicted_labels_SVC))

print ("Logistic Regression")
print (confusion_matrix(test_Y,predicted_labels_LR))
print (classification_report(test_Y,predicted_labels_LR))

print ("KNN")
print (confusion_matrix(test_Y,predicted_labels_KNN))
print (classification_report(test_Y,predicted_labels_KNN))

SVC Report
[[274  14  22  10]
 [ 19  13   4   3]
 [ 20   8   9   1]
 [  8   1   0   4]]
              precision    recall  f1-score   support

           0       0.85      0.86      0.85       320
           1       0.36      0.33      0.35        39
           2       0.26      0.24      0.25        38
           3       0.22      0.31      0.26        13

    accuracy                           0.73       410
   macro avg       0.42      0.43      0.43       410
weighted avg       0.73      0.73      0.73       410

Logistic Regression
[[292  12  15   1]
 [ 25  10   4   0]
 [ 23   8   7   0]
 [ 10   0   1   2]]
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       320
           1       0.33      0.26      0.29        39
           2       0.26      0.18      0.22        38
           3       0.67      0.15      0.25        13

    accuracy                           0.76       410
   macro avg       0.52      0.38      0.41       410


## NEW FEATURES

In [11]:
## Generate sentiment values for the questions and response

def sentiment_vector(sentence):
  analyser = SentimentIntensityAnalyzer()
  score_dict=analyser.polarity_scores(sentence)
  return score_dict['compound']


tags_df['question_sentiment']=tags_df['question'].apply(lambda x: sentiment_vector(x))
tags_df['response_sentiment']=tags_df['response'].apply(lambda x: sentiment_vector(x))


In [12]:
## Genreate cosine similarity between the questions and response

def get_cosine_similarity(feature_vec_1, feature_vec_2):    
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]

cosines=[]
for i in range(len(tags_df)):
  cosines.append(get_cosine_similarity(tags_df['question_vector_rep'].iloc[i],tags_df['response_vector_rep'].iloc[i]))
tags_df['cosine_similarity']=cosines

In [13]:
all_features=[]
for i in range(len(tags_df)):
  all_features.append(tags_df['word_pos'].iloc[i]+[tags_df['question_sentiment'].iloc[i],tags_df['response_sentiment'].iloc[i],tags_df['cosine_similarity'].iloc[i]])

tags_df['features_vector']=all_features



In [19]:
# Modeling all the features
train_X,train_Y = tags_df['features_vector'].iloc[:train_size],tags_df['type'].iloc[:train_size]
test_X,test_Y = tags_df['features_vector'].iloc[train_size:],tags_df['type'].iloc[train_size:]
train_X


model_SVC = LinearSVC(random_state=2)
model_LR = LogisticRegression(max_iter=1000)
model_KNN = KNeighborsClassifier()

model_LR.fit(train_X.tolist(), train_Y)
model_SVC.fit(train_X.tolist(), train_Y)
model_KNN.fit(train_X.tolist(), train_Y)

predicted_labels_SVC = model_SVC.predict(test_X.tolist())
predicted_labels_LR = model_LR.predict(test_X.tolist())
predicted_labels_KNN = model_KNN.predict(test_X.tolist())

print ("SVC Report")
print (confusion_matrix(test_Y,predicted_labels_SVC))
print (classification_report(test_Y,predicted_labels_SVC))

print ("Logistic Regression")
print (confusion_matrix(test_Y,predicted_labels_LR))
print (classification_report(test_Y,predicted_labels_LR))

print ("KNN")
print (confusion_matrix(test_Y,predicted_labels_KNN))
print (classification_report(test_Y,predicted_labels_KNN))

SVC Report
[[306  10   3   1]
 [ 29  10   0   0]
 [ 24   9   5   0]
 [ 10   0   1   2]]
              precision    recall  f1-score   support

           0       0.83      0.96      0.89       320
           1       0.34      0.26      0.29        39
           2       0.56      0.13      0.21        38
           3       0.67      0.15      0.25        13

    accuracy                           0.79       410
   macro avg       0.60      0.37      0.41       410
weighted avg       0.75      0.79      0.75       410

Logistic Regression
[[298  12   9   1]
 [ 25  11   3   0]
 [ 23   9   6   0]
 [  8   0   2   3]]
              precision    recall  f1-score   support

           0       0.84      0.93      0.88       320
           1       0.34      0.28      0.31        39
           2       0.30      0.16      0.21        38
           3       0.75      0.23      0.35        13

    accuracy                           0.78       410
   macro avg       0.56      0.40      0.44       410
