In [27]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import numpy as np
%matplotlib inline

In [28]:
# Load a precomputed question-embedding table and show its (rows, columns) shape.
# NOTE(review): the "0" suffix in the file name suggests this is one shard of several — confirm.
df = pd.read_csv(filepath_or_buffer="data/question_embeddings0.csv")
df.shape

(10000, 512)

In [29]:
# Load the labelled training set from disk and report its (rows, columns) shape.
train_df = pd.read_csv(filepath_or_buffer="data/train.csv")
print("Training Data Frame Dimensions: " + str(train_df.shape))

Training Data Frame Dimensions: (1306122, 3)


In [30]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [31]:
# Preview the last five rows of the training data.
train_df.tail(n=5)

Unnamed: 0,qid,question_text,target
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0
1306121,ffffed09fedb5088744a,Who wins in a battle between a Wolverine and a...,0


In [32]:
# Keep only the rows whose target is 1 (reported below as "insincere") and count them.
insincere_df = train_df[train_df["target"] == 1]
print("The number of insincere is " + str(len(insincere_df)))

The number of insincere is 80810


In [33]:
# Keep only the rows whose target is 0 (reported below as "sincere") and count them.
sincere_df = train_df[train_df["target"] == 0]
print("The number of sincere is " + str(len(sincere_df)))

The number of sincere is 1225312


In [34]:
# Pull the raw question strings out of the frame as a NumPy array.
questions = train_df.loc[:, "question_text"].values

In [35]:
# Take the first two questions as a tiny sample for trying out the encoder.
test = questions[:2]
test

array(['How did Quebec nationalists see their province as a nation in the 1960s?',
       'Do you have an adopted dog, how would you encourage people to adopt and not shop?'],
      dtype=object)

In [38]:
# Instantiate the Universal Sentence Encoder (version 2) as a TF Hub module.
# `embed` is later called on an array of strings to build the embedding op.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
embed = hub.Module(spec=module_url)

In [63]:
# Embed the sample questions with the hub module inside a TF 1.x session.
with tf.Session() as sess:
    # Variables and lookup tables must be initialised before the module is evaluated.
    init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
    sess.run(init_ops)
    question_embeddings = sess.run(embed(test))

# The result is a plain NumPy array, so the remaining work does not need the session.
print(question_embeddings)
print(question_embeddings.shape)

# Persist the embeddings without the row index so the file holds only the vector columns.
embedding_df = pd.DataFrame(question_embeddings)
embedding_df.to_csv("test.csv", index=False)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
[[-0.00416658  0.00399125 -0.06326147 ... -0.04322257 -0.04416081
  -0.06633756]
 [ 0.028851   -0.04676896 -0.00342507 ...  0.05266976 -0.03536891
   0.00574462]]
(2, 512)


In [64]:
# Read back the CSV written by the embedding cell to check the round trip.
pd.read_csv(filepath_or_buffer="test.csv")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.004167,0.003991,-0.063261,0.05159,-0.041,-0.04442,0.084289,-0.007363,-0.021403,-0.022497,...,0.060429,-0.062331,-0.017937,-0.033932,0.009563,-0.063174,0.053774,-0.043223,-0.044161,-0.066338
1,0.028851,-0.046769,-0.003425,0.002011,0.03864,0.033116,0.003396,0.041565,-0.004693,0.022073,...,0.045903,0.027739,-0.002885,0.001373,-0.040582,-0.004657,0.026895,0.05267,-0.035369,0.005745


In [67]:
# Show the installed TensorFlow version; the tf.Session code in this notebook uses the TF 1.x API.
tf.__version__

'1.10.0'

In [68]:
# Load a second embeddings file for side-by-side comparison with test.csv.
# NOTE(review): no cell visible in this notebook writes test_real.csv — presumably it was
# produced by a separate run; confirm its provenance or add the cell that creates it.
pd.read_csv(filepath_or_buffer="test_real.csv")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.004167,0.003991,-0.063261,0.05159,-0.041,-0.04442,0.084289,-0.007363,-0.021403,-0.022497,...,0.060429,-0.062331,-0.017937,-0.033932,0.009563,-0.063174,0.053774,-0.043223,-0.044161,-0.066338
1,0.028851,-0.046769,-0.003425,0.002011,0.03864,0.033116,0.003396,0.041565,-0.004693,0.022073,...,0.045903,0.027739,-0.002885,0.001373,-0.040582,-0.004657,0.026895,0.05267,-0.035369,0.005745


In [23]:
from keras.models import Model
from keras.layers import Input, Dense, Activation
from keras import regularizers

Using TensorFlow backend.


In [26]:
# Fully connected binary classifier over 512-dimensional inputs:
# 512 -> 1024 -> 512 -> 256 -> 1 (sigmoid).
X = Input(shape=(512,), name='input')

# First hidden layer carries no weight penalty; the later layers use L2 with factor 0.01.
hidden1 = Dense(1024, activation='relu', name='hidden1')(X)
hidden2 = Dense(
    512,
    activation='relu',
    kernel_regularizer=regularizers.l2(0.01),
    name='hidden2',
)(hidden1)
hidden3 = Dense(
    256,
    activation='relu',
    kernel_regularizer=regularizers.l2(0.01),
    name='hidden3',
)(hidden2)

# Single sigmoid unit: the output is a value in (0, 1) for the binary target.
output = Dense(
    1,
    activation='sigmoid',
    kernel_regularizer=regularizers.l2(0.01),
    name='output',
)(hidden3)

model = Model(inputs=X, outputs=output)
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 512)               0         
_________________________________________________________________
hidden1 (Dense)              (None, 1024)              525312    
_________________________________________________________________
hidden2 (Dense)              (None, 512)               524800    
_________________________________________________________________
hidden3 (Dense)              (None, 256)               131328    
_________________________________________________________________
output (Dense)               (None, 1)                 257       
Total params: 1,181,697
Trainable params: 1,181,697
Non-trainable params: 0
_________________________________________________________________


In [84]:
# Generate synthetic data for smoke-testing the model (features and labels are pure noise).
# A seeded generator is used so that Restart & Run All reproduces exactly the same arrays.
SEED = 42
N_SAMPLES = 100000
N_FEATURES = 512  # must match the width of the model's Input layer

data_rng = np.random.RandomState(SEED)
# Features: uniform floats in [0, 1), shape (N_SAMPLES, N_FEATURES).
x_train = data_rng.random_sample((N_SAMPLES, N_FEATURES))
# Labels: random 0/1 integers, shape (N_SAMPLES, 1) to line up with the single output unit.
y_train = data_rng.randint(2, size=(N_SAMPLES, 1))

In [None]:
# Train for 20 epochs on the synthetic data. The labels are random, so the loss is
# expected to stay near ln(2) ~= 0.693 and accuracy near 0.5.
model.fit(x=x_train, y=y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
 17728/100000 [====>.........................] - ETA: 32s - loss: 0.6932 - acc: 0.5013

In [78]:
# Count how many training rows are classified correctly when the sigmoid output
# is thresholded at 0.5 (boolean predictions are compared against the 0/1 labels).
((model.predict(x_train) >= 0.5) == y_train).sum()

910

In [80]:
# Show the thresholded (>= 0.5) predictions for the first five training rows.
# Only those five rows are scored: slicing the input first avoids a full
# forward pass over the whole training set just to display a handful of values.
model.predict(x_train[0:5]) >= 0.5

array([[ True],
       [ True],
       [False],
       [False],
       [ True]])

In [81]:
# True labels for the first five training rows, for comparison with the predictions above.
y_train[:5]

array([[1],
       [1],
       [0],
       [0],
       [1]])