In [0]:
# Run, restart and then skip this
# !python3 -m spacy download en_core_web_lg

In [0]:
import nltk 
import pandas as pd
import numpy as np
import spacy
import warnings
import sklearn
import pickle
from sklearn.svm import SVC
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')

In [0]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nlp = spacy.load('en_core_web_lg')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize 
stop_words = set(stopwords.words('english')) 
from nltk import pos_tag

In [0]:
dfTrain = pd.read_csv("data/UTDallasSpecialFeatureCSV.csv")
dfTest = pd.read_csv("data/WinogradSpecialFeatureCSV.csv")
# dfTrainH = pd.read_csv("data/UTDallasCSV.csv")
# dfTestH = pd.read_csv("data/WinogradChallengeCSV.csv")

In [0]:
print(dfTrain.shape)

(1321, 6)


In [0]:
dfTrain.head(100)

Unnamed: 0,Question String,Pronoun,Split1,Split2,Option1,Option2
0,The bee landed on the flower because it had po...,it,The bee landed on the flower because,it had pollen.,The bee,the flower
1,The bee landed on the flower because it wanted...,it,The bee landed on the flower because,it wanted pollen.,The bee,the flower
2,"When Debbie splashed Tina, she got in trouble.",she,"When Debbie splashed Tina,",she got in trouble.,Debbie,Tina
3,"When Debbie splashed Tina, she got wet.",she,"When Debbie splashed Tina,",she got wet.,Debbie,Tina
4,The bus driver yelled at a kid after she drove...,she,The bus driver yelled at a kid after,she drove her vehicle.,The bus driver,a kid
...,...,...,...,...,...,...
95,John flicked Bill because he had a fly on his ...,he,John flicked Bill because,he had a fly on his shirt.,John,Bill
96,The zombies chased the survivors because they ...,they,The zombies chased the survivors because,they were hungry.,The zombies,the survivors
97,The zombies chased the survivors because they ...,they,The zombies chased the survivors because,they were tasty.,The zombies,the survivors
98,Medvedev will cede the presidency to Putin bec...,he,Medvedev will cede the presidency to Putin bec...,he is more popular.,Medvedev,Putin


In [0]:
def tokenizePhrase2(sent):
  sent = nltk.word_tokenize(sent)
  return sent

In [0]:
li = (len(dfTrain))*[0]
dfTrain['TokenP2'] = li
dfTrain['TokenP2'] = dfTrain.apply(lambda x: tokenizePhrase2(str(x['Split2'])), axis = 1)
li = (len(dfTest))*[0]
dfTest['TokenP2'] = li
dfTest['TokenP2'] = dfTest.apply(lambda x: tokenizePhrase2(str(x['Split2'])), axis = 1)

In [0]:
print(pos_tag(dfTrain.iloc[0]['TokenP2']))

[('it', 'PRP'), ('had', 'VBD'), ('pollen', 'VBN'), ('.', '.')]


In [0]:
def findPOSSim(choice, token):
  tagged = pos_tag(token)
  adj = []
  verb = []
  for tag in tagged:
    if(tag[1][0] == 'V'):
        verb.append(tag[0])
    if(tag[1][0] == 'J'):
        adj.append(tag[0])

  aSim = 0
  vSim = 0
  for ad in adj:
    aSim += nlp.vocab[choice].similarity(nlp.vocab[ad])
  for vb in verb:
    vSim += nlp.vocab[choice].similarity(nlp.vocab[vb])
  # print(adj, verb)
  # print(aSim, vSim)
  return (aSim, vSim)

In [0]:
li = (len(dfTrain))*[0]
dfTrain['Ch1AdSim'] = li
dfTrain['Ch1VbSim'] = li
dfTrain['Ch1AdSim'], dfTrain['Ch1VbSim'] = zip(*dfTrain.apply(lambda x: findPOSSim(x['Option1'], x['TokenP2']), axis = 1))

li = (len(dfTrain))*[0]
dfTrain['Ch2AdSim'] = li
dfTrain['Ch2VbSim'] = li
dfTrain['Ch2AdSim'], dfTrain['Ch2VbSim'] = zip(*dfTrain.apply(lambda x: findPOSSim(x['Option2'], x['TokenP2']), axis = 1))

li = (len(dfTest))*[0]
dfTest['Ch1AdSim'] = li
dfTest['Ch1VbSim'] = li
dfTest['Ch1AdSim'], dfTest['Ch1VbSim'] = zip(*dfTest.apply(lambda x: findPOSSim(x['Option1'], x['TokenP2']), axis = 1))

li = (len(dfTest))*[0]
dfTest['Ch2AdSim'] = li
dfTest['Ch2VbSim'] = li
dfTest['Ch2AdSim'], dfTest['Ch2VbSim'] = zip(*dfTest.apply(lambda x: findPOSSim(x['Option2'], x['TokenP2']), axis = 1))

In [0]:
dfTest.shape

(284, 11)

In [0]:
dfTrainGender = pd.read_csv("data/UTDallasCSVGenderFeature.csv")
dfTestGender = pd.read_csv("data/WinogradChallengeCSVGenderFeature.csv")

In [0]:
dfTrainGender.columns

Index(['Knowledge Base', 'Question', 'Option 1', 'Option 2', 'Actual Answer',
       'Bit Representation', 'Opt1GenderFeature', 'Opt2GenderFeature'],
      dtype='object')

In [0]:
trGendOpt1 = []
trGendOpt2 = []
YTrain = []
cnt = 0
for i, r in dfTrainGender.iterrows():
  if(cnt%2 == 0):
    trGendOpt1.append(r.Opt1GenderFeature)
    trGendOpt2.append(r.Opt2GenderFeature)
    YTrain.append(r['Option 2'])
  cnt += 1

YTest = []
cnt = 0
teGendOpt1 = []
teGendOpt2 = []
for i, r in dfTestGender.iterrows():
  if(cnt%2 == 0):
    teGendOpt1.append(r.Opt1Gender)
    teGendOpt2.append(r.Opt2Gender)
    YTest.append(r['Bit Representation'])
  cnt += 1

In [0]:
len(YTrain)

1321

In [0]:
with open("data/bestPred.pkl", "rb") as fp:
  predTest = pickle.load(fp)

with open("data/bestPredTrain.pkl", "rb") as fp:
  predTrain = pickle.load(fp)

In [0]:
len(predTrain)

1321

In [0]:
XTrain = np.matrix([predTrain, trGendOpt1, trGendOpt2, dfTrain['Ch2AdSim'], dfTrain['Ch2VbSim'], dfTrain['Ch1AdSim'], dfTrain['Ch1VbSim']]).T
XTest = np.matrix([predTest, teGendOpt1, teGendOpt2, dfTest['Ch2AdSim'], dfTest['Ch2VbSim'], dfTest['Ch1AdSim'], dfTest['Ch1VbSim']]).T

In [0]:
print(XTrain.shape, XTest.shape)

(1321, 7) (284, 7)


In [0]:
CLF = MLPClassifier(max_iter=10000, alpha=0.001, warm_start=True)
_ = CLF.fit(XTrain, YTrain)
print('Training Complete')

Training Complete


In [0]:
prediction_result = CLF.predict(XTest)

In [0]:
# Accuracy
np.mean(prediction_result == YTest)

0.6232394366197183

In [0]:
cm = confusion_matrix(prediction_result, YTest)
cm

array([[84, 48],
       [59, 93]])