# Importing Libraries

In [0]:
import re
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter 
from nltk.corpus import wordnet # To get words in dictionary with their parts of speech
from nltk.stem import WordNetLemmatizer # lemmatizes word based on it's parts of speech

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Importing Datasets**

In [0]:
df_train = pd.read_csv('/content/drive/My Drive/VL_DATA/Train.csv')
df_test = pd.read_csv('/content/drive/My Drive/VL_DATA/Test.csv')
df_train.head()

Unnamed: 0,question,answer_text,distractor
0,Meals can be served,in rooms at 9:00 p. m.,"'outside the room at 3:00 p. m.', 'in the dini..."
1,It can be inferred from the passage that,The local government can deal with the problem...,"'If some tragedies occur again ', ' relevant d..."
2,The author called Tommy 's parents in order to,help them realize their influence on Tommy,"'blame Tommy for his failing grades', 'blame T..."
3,It can be inferred from the passage that,the writer is not very willing to use idioms,'idioms are the most important part in a langu...
4,How can we deal with snake wounds according to...,Stay calm and do n't move .,'Cut the wound and suck the poison out .'


# Preprocessing


Conversion of text to lowercase

In [0]:
df_train['question'] = df_train['question'].str.lower()
df_train['answer_text'] = df_train['answer_text'].str.lower()
df_train['distractor'] = df_train['distractor'].str.lower()

In [0]:
df_train.head()

Unnamed: 0,question,answer_text,distractor
0,meals can be served,in rooms at 9:00 p. m.,"'outside the room at 3:00 p. m.', 'in the dini..."
1,it can be inferred from the passage that,the local government can deal with the problem...,"'if some tragedies occur again ', ' relevant d..."
2,the author called tommy 's parents in order to,help them realize their influence on tommy,"'blame tommy for his failing grades', 'blame t..."
3,it can be inferred from the passage that,the writer is not very willing to use idioms,'idioms are the most important part in a langu...
4,how can we deal with snake wounds according to...,stay calm and do n't move .,'cut the wound and suck the poison out .'


In [0]:
df_test['question'] = df_test['question'].str.lower()
df_test['answer_text'] = df_test['answer_text'].str.lower()

In [0]:
df_test.head()

Unnamed: 0,question,answer_text
0,what 's the main idea of the text ?,the lack of career -- based courses in us high...
1,"in the summer high season , finland does nt se...",the sun is out at night
2,if you want to apply for chinese business inte...,have to get confirmed at least twice
3,"that afternoon , the boy 's clothes were dry b...",nobody made room for him in the water .
4,which of the following statements is not true ?,there are twelve countries in the world wildli...


**Converting the sentences given to list of words**

In [0]:
# word_tokenize(df['question'][0])
for x in range(len(df_train['question'])):
    df_train['question'][x] = word_tokenize(df_train['question'][x])
for y in range(len(df_train['answer_text'])):
    df_train['answer_text'][y] = word_tokenize(df_train['answer_text'][y])
for z in range(len(df_train['distractor'])):
    df_train['distractor'][z] = word_tokenize(df_train['distractor'][z])

In [0]:
df_train.head()

Unnamed: 0,question,answer_text,distractor
0,"[meals, can, be, served]","[in, rooms, at, 9:00, p., m, .]","['outside, the, room, at, 3:00, p., m., ', ,, ..."
1,"[it, can, be, inferred, from, the, passage, that]","[the, local, government, can, deal, with, the,...","['if, some, tragedies, occur, again, ', ,, ', ..."
2,"[the, author, called, tommy, 's, parents, in, ...","[help, them, realize, their, influence, on, to...","['blame, tommy, for, his, failing, grades, ', ..."
3,"[it, can, be, inferred, from, the, passage, that]","[the, writer, is, not, very, willing, to, use,...","['idioms, are, the, most, important, part, in,..."
4,"[how, can, we, deal, with, snake, wounds, acco...","[stay, calm, and, do, n't, move, .]","['cut, the, wound, and, suck, the, poison, out..."


In [0]:
for x in range(len(df_test['question'])):
    df_test['question'][x] = word_tokenize(df_test['question'][x])
for y in range(len(df_test['answer_text'])):
    df_test['answer_text'][y] = word_tokenize(df_test['answer_text'][y])

In [0]:
df_test.head()

Unnamed: 0,question,answer_text
0,"[what, 's, the, main, idea, of, the, text, ?]","[the, lack, of, career, --, based, courses, in..."
1,"[in, the, summer, high, season, ,, finland, do...","[the, sun, is, out, at, night]"
2,"[if, you, want, to, apply, for, chinese, busin...","[have, to, get, confirmed, at, least, twice]"
3,"[that, afternoon, ,, the, boy, 's, clothes, we...","[nobody, made, room, for, him, in, the, water, .]"
4,"[which, of, the, following, statements, is, no...","[there, are, twelve, countries, in, the, world..."


**Removing stopwords from the dataset**

Note:- stopwords are words those occur more frequently like is, the etc.

In [0]:
stop_words = set(stopwords.words('english')) 

In [0]:
for x in range(len(df_train['question'])):
    df_train['question'][x] = [w for w in df_train['question'][x] if not w in stop_words]
for y in range(len(df_train['answer_text'])):
    df_train['answer_text'][y] = [w for w in df_train['answer_text'][y] if not w in stop_words]
for z in range(len(df_train['distractor'])):
    df_train['distractor'][z] = [w for w in df_train['distractor'][z] if not w in stop_words]

In [0]:
df_train.head()

Unnamed: 0,question,answer_text,distractor
0,"[meals, served]","[rooms, 9:00, p., .]","['outside, room, 3:00, p., m., ', ,, 'in, dini..."
1,"[inferred, passage]","[local, government, deal, problem, lacking, mo...","['if, tragedies, occur, ', ,, ', relevant, dep..."
2,"[author, called, tommy, 's, parents, order]","[help, realize, influence, tommy]","['blame, tommy, failing, grades, ', ,, 'blame,..."
3,"[inferred, passage]","[writer, willing, use, idioms]","['idioms, important, part, language, ', ,, 'no..."
4,"[deal, snake, wounds, according, passage, ?]","[stay, calm, n't, move, .]","['cut, wound, suck, poison, ., ']"


# Lemmatization

Converting words to their root form. EXample: root form of **Playing** is **Play**.

In [0]:
WNlemma = nltk.WordNetLemmatizer()
# [WNlemma.lemmatize(t) for t in text7]
for x in range(len(df_train['question'])):
    df_train['question'][x] = [WNlemma.lemmatize(t) for t in df_train['question'][x]]
for y in range(len(df_train['answer_text'])):
    df_train['answer_text'][y] = [WNlemma.lemmatize(t) for t in df_train['answer_text'][y]]
for z in range(len(df_train['distractor'])):
    df_train['distractor'][z] = [WNlemma.lemmatize(t) for t in df_train['distractor'][z]]

In [0]:
df_train.head()

Unnamed: 0,question,answer_text,distractor
0,"[meal, served]","[room, 9:00, p., .]","['outside, room, 3:00, p., m., ', ,, 'in, dini..."
1,"[inferred, passage]","[local, government, deal, problem, lacking, mo...","['if, tragedy, occur, ', ,, ', relevant, depar..."
2,"[author, called, tommy, 's, parent, order]","[help, realize, influence, tommy]","['blame, tommy, failing, grade, ', ,, 'blame, ..."
3,"[inferred, passage]","[writer, willing, use, idiom]","['idioms, important, part, language, ', ,, 'no..."
4,"[deal, snake, wound, according, passage, ?]","[stay, calm, n't, move, .]","['cut, wound, suck, poison, ., ']"


In [0]:
for x in range(len(df_train['question'])):
    df_train['question'][x] = [re.sub(r"[^a-zA-Z0-9]+", '', k) for k in df_train['question'][x]]
for y in range(len(df_train['answer_text'])):
    df_train['answer_text'][y] = [re.sub(r"[^a-zA-Z0-9]+", '', k) for k in df_train['answer_text'][y]]
for z in range(len(df_train['distractor'])):
        df_train['distractor'][z] = [re.sub(r"[^a-zA-Z0-9]+", '', k) for k in df_train['distractor'][z]]

In [0]:
df_train.head()

Unnamed: 0,question,answer_text,distractor
0,"[meal, served]","[room, 900, p, ]","[outside, room, 300, p, m, , , in, dining, , r..."
1,"[inferred, passage]","[local, government, deal, problem, lacking, mo...","[if, tragedy, occur, , , , relevant, departmen..."
2,"[author, called, tommy, s, parent, order]","[help, realize, influence, tommy]","[blame, tommy, failing, grade, , , blame, tomm..."
3,"[inferred, passage]","[writer, willing, use, idiom]","[idioms, important, part, language, , , nonnat..."
4,"[deal, snake, wound, according, passage, ]","[stay, calm, nt, move, ]","[cut, wound, suck, poison, , ]"


In [0]:
lw = []
for i in range(len(df_train['question'])):
    for w in df_train['question'][i]:
        if len(w)>1:
            lw.append(w)
            
for i in range(len(df_train['answer_text'])):
    for w in df_train['answer_text'][i]:
        if len(w)>1:
            lw.append(w)
            
for i in range(len(df_train['distractor'])):
    for w in df_train['distractor'][i]:
        if len(w)>1:
            lw.append(w)

# Processing Test dataset

In [0]:
# word_tokenize(df['question'][0])
for x in range(len(df_test['question'])):
    df_test['question'][x] = [w for w in df_test['question'][x] if not w in stop_words]
for y in range(len(df_test['answer_text'])):
    df_test['answer_text'][y] = [w for w in df_test['answer_text'][y] if not w in stop_words]

In [0]:
# WNlemma = nltk.WordNetLemmatizer()
# [WNlemma.lemmatize(t) for t in text7]
for x in range(len(df_test['question'])):
    df_test['question'][x] = [WNlemma.lemmatize(t) for t in df_test['question'][x]]
for y in range(len(df_test['answer_text'])):
    df_test['answer_text'][y] = [WNlemma.lemmatize(t) for t in df_test['answer_text'][y]]

In [0]:
for x in range(len(df_test['question'])):
    df_test['question'][x] = [re.sub(r"[^a-zA-Z0-9]+", '', k) for k in df_test['question'][x]]
for y in range(len(df_test['answer_text'])):
    df_test['answer_text'][y] = [re.sub(r"[^a-zA-Z0-9]+", '', k) for k in df_test['answer_text'][y]]

In [0]:
test_list = []
for i in range(len(df_test['question'])):
    for w in df_test['question'][i]:
        if len(w)>1:
            test_list.append(w) 
            
for i in range(len(df_test['answer_text'])):
    for w in df_test['answer_text'][i]:
        if len(w)>1:
            test_list.append(w) 

# Training Model
Preparing list of all words to train them using word2vec model

In [None]:
from gensim.models import FastText
from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation,Dropout
from keras.models import Sequential
from keras.objectives import cosine_proximity

In [None]:
model_2vec = FastText(size=4, window=5, min_count=0)  # instantiate
model_2vec.build_vocab(sentences=[corpus])
model_2vec.train(sentences=[corpus], total_examples=len(corpus), epochs=10) 

sequences = list()
for i in range(1, len(corpus)):
    sequence = corpus[i-1:i+1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

vocab_size,emdedding_size,pretrained_weights

In [None]:
max_sentence_len = 10
pretrained_weights = model_2vec.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
def idx2word(idx):
    return model_2vec.wv.index2word[idx]

def word2idx(word):
    return model_2vec.wv.vocab[word].index

In [None]:
print('\nPreparing the data for LSTM...')
train_x = np.zeros([len(sequences), 23440], dtype=np.int32)
train_y = np.zeros([len(sequences), 23440], dtype=np.int32)
for i in range(len(sequences)):
    train_x[i] = word2idx(sequences[i][0])
    train_y[i] = word2idx(sequences[i][1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

train_x=train_x/23332
train_y=train_y/23332

In [None]:
print('Defining a Simple Keras Model...')
model=Sequential()  
model.add(Embedding(input_dim=model_2vec.wv.syn0.shape[0],output_dim=model_2vec.wv.syn0.shape[1],weights=[model_2vec.wv.syn0])) 
model.add(LSTM(units=500))

model.add(Dense(23440, activation='softmax'))

print(model.summary())

In [0]:
model.compile(optimizer='adam', loss='cosine_proximity',metrics=['accuracy'])
model.fit(train_x ,train_y,epochs=1,batch_size=128)

603161

In [None]:
def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_next(text):
    
    word_idxs = [word2idx(word) for word in text.lower().split()]
    
    for i in range(10):
        prediction = model.predict(x=np.array(word_idxs))
        
        idx = sample(prediction[-1], temperature=0.6)
        
        word_idxs.append(idx)
        l.append(' '.join(idx2word(idx) for idx in word_idxs))
        print(l)

# Generating distractor 
for the test dataset

**Distractor 1**

In [0]:
dist = []
for i in range(len(df_test['answer_text'])):
    ls = []
    for w in df_test['answer_text'][i]:
    if(len(w)>1):
        ls.append(generate_next(w)[0][0])
    dist.append(ls)

  
  if np.issubdtype(vec.dtype, np.int):


In [0]:
distractor = []
for j in range(len(dist)):
    Str = ' '.join([str(elem) for elem in dist[j]])
    distractor.append(Str);

#checking distractors generated
distractor[:25]

**Distractor 2**

In [0]:
dist1 = []
for i in range(len(df_test['answer_text'])):
    ls = []
    for w in df_test['answer_text'][i]:
    if(len(w)>1):
        ls.append(model_test.most_similar(w)[1][0])
    dist1.append(ls)
distractor1 = []
for j in range(len(dist1)):
    Str = ' '.join([str(elem) for elem in dist1[j]])
    distractor1.append(Str);

  
  if np.issubdtype(vec.dtype, np.int):


In [0]:
print(len(distractor1))
print(distractor1[:25])

13500
['monday conclude go may idea writer', 'toast could', 'writer introduces wrote mutate', 'mothered purpose tell story', 'childhood man author exam guimi', 'text want mainly choose eyed cartoons', 'ravaged purpose ios dar counsellor', 'chiver unconscious used writer subsidize', 'muybridge infer windsurfer story title', 'want noun future way', 'leech newsagent tell', 'think directly mainly girl seaside', 'glasses hillsborough', 'clawing kensington marwick', 'passage calgary parent mean action parent', 'title statement article driver', 'splendidly person', 'learn porsche tubby probably', 'notable passage article', 'teensgiving asean prevention chuanzhusi', 'want bette terrifying inferred', 'title following ftc borough beginner said', 'sinar 888', 'birdcage according scientist want according according', 'mentioned charge local reporting']


**Distractor 3**

In [0]:
dist2 = []
for i in range(len(df_test['answer_text'])):
    ls = []
    for w in df_test['answer_text'][i]:
    if(len(w)>1):
        ls.append(model_test.most_similar(w)[2][0])
    dist2.append(ls)
distractor2 = []
for j in range(len(dist1)):
    Str = ' '.join([str(elem) for elem in dist2[j]])
    distractor2.append(Str);

  
  if np.issubdtype(vec.dtype, np.int):


In [0]:
print(len(distractor2))
print(distractor2[:25])

13500
['spell mention team forest many mainly', 'formative order', 'according embryo important 10732', 'competence idea father inferred', 'clements new writer information morrison', 'according think purpose wrong creaking jiansong', 'lassen idea aibo 2300 westminster', 'urgent conclude title following dissolving', 'hugeness writer dunant would tell', 'think soup club advised', 'o2 misled infer', 'statement information following take leaders', 'fes moneyball', 'rubber laver concerned', 'following bun father might estate father', 'tell following statement sentence', 'attained dream', 'true ruled inicates author', 'calling story tell', '1830s stop discussing secondly', 'think cheese terminator article', 'tell passage nosed smugglers mulch rule', 'fluid symbolic', 'hands statement research think statement learn', 'article digitalkidsworld usstudent 19thand']


In [0]:
final_distractor = []
for i in range(13500):
    ls = []
    ls.append(distractor[i])
    ls.append(distractor1[i])
    ls.append(distractor2[i])
#   print(ls)
    final_distractor.append(ls)
# print(final_distractor)

**Joining all three distractors by comma**

In [0]:
dist_list = []
for k in range(len(final_distractor)):
    string = '\',\''.join(str(element) for element in final_distractor[k])
    dist_list.append(string)

In [0]:
for elem in dist_list:
    if elem =='':
        elem = 'hello'#dist_list

In [0]:
dist_list[0]

"1368 showed way thing need story','monday conclude go may idea writer','spell mention team forest many mainly"

**converting Distractor to data frame**

In [0]:
data_frame = pd.DataFrame(dist_list)

In [0]:
data_frame.head()

Unnamed: 0,0
0,"1368 showed way thing need story','monday conc..."
1,"attraeted like','toast could','formative order"
2,"author scholastic chinese patiently','writer i..."
3,"1926 infer take first','mothered purpose tell ..."
4,"producers need true driver inattentively','chi..."


In [0]:
data_frame.to_csv('f_dist.csv')