## Installs and Imports

In [0]:
# !pip install sentencepiece
# !pip install tf_sentencepiece



In [0]:
!pip uninstall --quiet --yes tensorflow
!pip install --quiet tensorflow-gpu==1.13.1

[K     |████████████████████████████████| 345.2MB 49kB/s 
[K     |████████████████████████████████| 3.2MB 35.9MB/s 
[K     |████████████████████████████████| 368kB 45.8MB/s 
[?25h

In [0]:
!pip install --quiet tf-sentencepiece

[K     |████████████████████████████████| 2.7MB 2.7MB/s 
[?25h

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import sklearn
import pickle
from sklearn.svm import SVC
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

In [0]:
import tf_sentencepiece

## Data

In [0]:
df = pd.read_csv('data/UTDallasCSV.csv')
df2 = pd.read_csv('data/WinogradChallengeCSV.csv')

In [0]:
def delBogus(df):
  delInd = []
  for i, r in df.iterrows():
    if(i%2 != 0):
      delInd.append(i)

  df = df.drop(index = delInd, axis = 0)
  return df

In [0]:
df = delBogus(df)
df2 = delBogus(df2)

In [0]:
df2.head()

Unnamed: 0,Knowledge Base,Question,Option 1,Option 2,Actual Answer,Bit Representation,Opt1,Opt2
0,city councilmen refused demonstrators a permit...,what does they refer to?,they refers to city councilmen,they refers to demonstrators,city councilmen,0,city councilmen,demonstrators
2,city councilmen refused demonstrators a permit...,what does they refer to?,they refers to city councilmen,they refers to demonstrators,demonstrators,1,city councilmen,demonstrators
4,trophy doesn't fit into brown suitcase because...,what does it refer to?,it refers to trophy,it refers to suitcase,trophy,0,trophy,suitcase
6,trophy doesn't fit into brown suitcase because...,what does it refer to?,it refers to trophy,it refers to suitcase,suitcase,1,trophy,suitcase
8,joan made sure to thank susan for all help she...,what does she refer to?,she refers to joan,she refers to susan,joan,0,joan,susan


In [0]:
def renameCols(df):
  df.columns = ['KB', 'Question', 'Option1', 'Option2', 'AA', 'Label']
  return df

In [0]:
df = renameCols(df)
df2 = renameCols(df2)

ValueError: ignored

In [0]:
# Generating input for embeddings
def genEmbedInput(df):
  questions = []
  responses = []
  response_contexts = []
  kb = []
  for i,r in df.iterrows():
    questions.append(r.Question)
    responses.append(r.Option1)
    responses.append(r.Option2)
    response_contexts.append(r.KB)
    response_contexts.append(r.KB)
    kb.append(r.KB)
  return questions, responses, response_contexts, kb

In [0]:
def prepareInpData(df):
  data = np.zeros((df.shape[0], 3*512))
  # print(data.shape)
  j = 0
  for i, row in df.iterrows():
    # print(i)
    x = np.concatenate((row.KBE, row.QuestionE))
    if(row.Label == 1):
      x = np.concatenate((x, row.Option2E))
    else:
      x = np.concatenate((x, row.Option1E))
    # print(i - 1)
    data[j] = x
    j += 1
    
  return data

## Training

In [0]:
questions, responses, response_contexts, kb = genEmbedInput(df)

In [0]:
# Set up graph.
g = tf.Graph()
with g.as_default():
  module = hub.Module('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/1')
  question_embeddings = module(dict(input=questions), signature="question_encoder", as_dict=True)
  response_embeddings = module(dict(input=responses, context=response_contexts), signature="response_encoder", as_dict=True)
  init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])

g.finalize()


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [0]:
# Initialize session.
session = tf.Session(graph=g)
session.run(init_op)

# Compute embeddings.
question_results = session.run(question_embeddings)
response_results = session.run(response_embeddings)

# print(np.inner(question_results["outputs"], response_results["outputs"]))

In [0]:
# len(response_results['outputs'])
qe = []
opt1e = []
opt2e = []
for i in range(len(question_results['outputs'])):
  qe.append(question_results['outputs'][i])
  opt1e.append(response_results['outputs'][(2*i)])
  opt2e.append(response_results['outputs'][(2*i)+1])

In [0]:
df['QuestionE'] = qe
df['Option1E'] = opt1e
df['Option2E'] = opt2e

In [0]:
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")

In [0]:
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(embed(kb))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [0]:
kbe = []
for i in range(len(message_embeddings)):
  kbe.append(message_embeddings[i])

In [0]:
df['KBE'] = kbe

In [0]:
print("Number of Class-1: " + str((df['Label']==1).sum(axis=0)))
print("Number of Class-0: " + str((df['Label']==0).sum(axis=0)))

Number of Class-1: 1322
Number of Class-0: 1320


In [0]:
df.head()

Unnamed: 0,KB,Question,Option1,Option2,AA,Label,QuestionE,Option1E,Option2E,KBE
0,bee landed on flower because it had pollen.,what does it refer to?,it refers to bee,it refers to flower,flower,1,"[0.03676931, -0.030496405, -0.0513505, -0.0062...","[-0.005842374, -0.002963165, 0.018208452, 0.01...","[-0.041813824, 0.03193633, -0.023663007, -0.01...","[0.04859948, -0.00012089022, 0.05877012, 0.030..."
1,bee landed on flower because it had pollen.,what had pollen?,bee had pollen,flower had pollen,flower,1,"[0.012102967, 0.03984652, -0.013288025, -0.003...","[-0.018822478, 0.048073266, 0.06687331, -0.028...","[-0.026243063, 0.05944981, 0.037827123, -0.049...","[0.04859948, -0.00012089022, 0.05877012, 0.030..."
2,bee landed on flower because it wanted pollen.,what does it refer to?,it refers to bee,it refers to flower,bee,0,"[0.036769304, -0.030496396, -0.051350504, -0.0...","[-0.005053184, -0.0064505856, 0.020013489, 0.0...","[-0.04075606, 0.029278206, -0.02004551, -0.023...","[0.05312615, -0.0060285414, 0.061304126, 0.025..."
3,bee landed on flower because it wanted pollen.,what wanted pollen?,bee wanted pollen,flower wanted pollen,bee,0,"[0.014923883, 0.03446279, -0.031669125, 0.0117...","[0.002110022, 0.04931997, 0.017973281, -0.0026...","[0.0032249072, 0.062103763, -0.021100633, -0.0...","[0.05312615, -0.0060285414, 0.061304126, 0.025..."
4,"when debbie splashed tina, she got in trouble.",what does she refer to?,she refers to debbie,she refers to tina,debbie,0,"[0.023011189, -0.019143254, -0.03604531, -0.01...","[0.007943569, 0.035678472, -0.013393052, -0.00...","[0.010381961, 0.053909026, 0.0076449844, 0.001...","[-0.036982954, -0.013073404, -0.041511405, 0.0..."


In [0]:
XTrain = prepareInpData(df)

In [0]:
# XTrain.shape

In [0]:
YTrain = df['Label']

## Testing

In [0]:
questions, responses, response_contexts, kb = genEmbedInput(df2)

In [0]:
# Set up graph.
g = tf.Graph()
with g.as_default():
  module = hub.Module('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/1')
  question_embeddings = module(dict(input=questions), signature="question_encoder", as_dict=True)
  response_embeddings = module(dict(input=responses, context=response_contexts), signature="response_encoder", as_dict=True)
  init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])

g.finalize()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [0]:
# Initialize session.
session = tf.Session(graph=g)
session.run(init_op)

# Compute embeddings.
question_results = session.run(question_embeddings)
response_results = session.run(response_embeddings)

# print(np.inner(question_results["outputs"], response_results["outputs"]))

In [0]:
# len(response_results['outputs'])
qe = []
opt1e = []
opt2e = []
for i in range(len(question_results['outputs'])):
  qe.append(question_results['outputs'][i])
  opt1e.append(response_results['outputs'][(2*i)])
  opt2e.append(response_results['outputs'][(2*i)+1])

In [0]:
df2['QuestionE'] = qe
df2['Option1E'] = opt1e
df2['Option2E'] = opt2e

In [0]:
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")

In [0]:
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(embed(kb))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [0]:
kbe = []
for i in range(len(message_embeddings)):
  kbe.append(message_embeddings[i])

In [0]:
df2['KBE'] = kbe

In [0]:
print("Number of Class-1: " + str((df2['Label']==1).sum(axis=0)))
print("Number of Class-0: " + str((df2['Label']==0).sum(axis=0)))

Number of Class-1: 282
Number of Class-0: 286


In [0]:
XTest = prepareInpData(df2)

In [0]:
YTest = df2['Label']

## Save and Load Data

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import sklearn
import pickle
from sklearn.svm import SVC
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

In [0]:
# # Save Form-1
# np.savetxt('data/XTrainForm1.csv', XTrain, delimiter = ',')
# with open("data/YTrainForm1.pkl", "wb") as fp:
#   pickle.dump(YTrain, fp)

# np.savetxt('data/XTestForm1.csv', XTest, delimiter = ',')
# with open("data/YTestForm1.pkl", "wb") as fp:
#   pickle.dump(YTest, fp)

In [0]:
# # Load Form-1
XTrain = np.loadtxt('data/XTrainForm1.csv', delimiter = ',')
XTest = np.loadtxt('data/XTestForm1.csv', delimiter = ',')
with open("data/YTrainForm1.pkl", "rb") as fp:
  YTrain = pickle.load(fp)
with open("data/YTestForm1.pkl", "rb") as fp:
  YTest = pickle.load(fp)

In [0]:
# # Save Form-2
# np.savetxt('data/XTrainForm2.csv', XTrain, delimiter = ',')
# with open("data/YTrainForm2.pkl", "wb") as fp:
#   pickle.dump(YTrain, fp)

# np.savetxt('data/XTestForm2.csv', XTest, delimiter = ',')
# with open("data/YTestForm2.pkl", "wb") as fp:
#   pickle.dump(YTest, fp)

In [0]:
# Load Form-2
# XTrain = np.loadtxt('data/XTrainForm2.csv', delimiter = ',')
# XTest = np.loadtxt('data/XTestForm2.csv', delimiter = ',')
# with open("data/YTrainForm2.pkl", "rb") as fp:
#   YTrain = pickle.load(fp)
# with open("data/YTestForm2.pkl", "rb") as fp:
#   YTest = pickle.load(fp)

In [0]:
# # Save Combined
# np.savetxt('data/XTrainCombined.csv', XTrain, delimiter = ',')
# with open("data/YTrainCombined.pkl", "wb") as fp:
#   pickle.dump(YTrain, fp)

# np.savetxt('data/XTestCombined.csv', XTest, delimiter = ',')
# with open("data/YTestCombined.pkl", "wb") as fp:
#   pickle.dump(YTest, fp)

In [0]:
# # Load Combined
# XTrain = np.loadtxt('data/XTrainCombined.csv', delimiter = ',')
# XTest = np.loadtxt('data/XTestCombined.csv', delimiter = ',')
# with open("data/YTrainCombined.pkl", "rb") as fp:
#   YTrain = pickle.load(fp)
# with open("data/YTestCombined.pkl", "rb") as fp:
#   YTest = pickle.load(fp)

## Model

In [0]:
# pCLF = MLPClassifier(max_iter=10000, alpha=0.01, warm_start=True)
pCLF = MLPClassifier(max_iter= 10000, alpha=0.001, warm_start=True)
_ = pCLF.fit(XTrain, YTrain)
print('Training Complete')

Training Complete


In [0]:
prediction_result = pCLF.predict(XTest)

In [0]:
# Accuracy
np.mean(prediction_result == YTest)

0.6232394366197183

In [0]:
cm = confusion_matrix(prediction_result, YTest)
cm

array([[84, 48],
       [59, 93]])

In [0]:
# with open("data/bestPredTrain.pkl", "wb") as fp:
#   pickle.dump(prediction_result_train, fp)

# with open("data/bestPredTrain.pkl", "rb") as fp:
#   pred = pickle.load(fp)

In [0]:
with open("data/bestPred.pkl", "wb") as fp:
  pickle.dump(prediction_result, fp)

with open("data/bestPred.pkl", "rb") as fp:
  pred = pickle.load(fp)

In [0]:
# End