In [95]:
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')

In [96]:
# Load the context/question table; the path is relative to the working directory.
train = pd.read_csv("data/train.csv")

In [97]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open("data/dict_embeddings1.pickle", "rb") as f:
    d1 = pickle.load(f)  # first part of the embedding lookup (merged into dict_emb below)

In [98]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open("data/dict_embeddings2.pickle", "rb") as f:
    d2 = pickle.load(f)  # second part of the embedding lookup (merged into dict_emb below)

In [99]:
# Merge both partial lookups into one new dict; on a duplicate key the entry
# from d2 replaces the one from d1.
dict_emb = {**d1, **d2}

In [100]:
# Number of keys in the merged embedding lookup.
len(dict_emb)

380

In [101]:
# The partial dicts are no longer needed once dict_emb holds the merged entries.
del d1, d2

In [102]:
# Drop rows containing any missing value, then renumber the index 0..n-1 so the
# label-based lookups used later (train['question'][0], range(len(...)) loops)
# cannot hit a gap left by a removed row.
train = train.dropna().reset_index(drop=True)

In [103]:
# Preview the first rows after removing incomplete ones.
train.head()

Unnamed: 0,context,question
0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"In Pillow, RGBA values are represented how ?"
1,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Normally transparent pixels are pasted how ?
2,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Cropping an image means what ?


In [104]:
def process_data(train, embeddings=None, emb_dim=4096):
    """Split each context into sentences and attach embeddings to the frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide a 'context' column (text to split into sentences) and a
        'question' column (used as a lookup key).
    embeddings : mapping, optional
        Lookup from text to a stored embedding. Defaults to the notebook-level
        ``dict_emb`` when omitted.
    emb_dim : int, optional
        Length of the zero vector substituted for any text missing from the
        lookup. Defaults to 4096.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object, modified in place with three new columns:
        'sentences' (list of sentence strings), 'sent_emb' (one vector per
        sentence) and 'quest_emb' (the question's embedding).
    """
    # Default to the notebook-level lookup so existing calls keep working.
    lookup = dict_emb if embeddings is None else embeddings

    def _sentence_vectors(sentences):
        # One entry per sentence: element [0] of the stored value, or a zero
        # placeholder when the sentence is not in the lookup.
        return [lookup[s][0] if s in lookup else np.zeros(emb_dim) for s in sentences]

    def _question_vector(question):
        # NOTE(review): unlike sentences, the stored value is used without
        # taking [0], so a found question may have a different shape than the
        # zero placeholder - confirm that downstream code accepts both.
        return lookup[question] if question in lookup else np.zeros(emb_dim)

    print("step 1")
    train['sentences'] = train['context'].apply(
        lambda text: [sentence.raw for sentence in TextBlob(text).sentences])

    print("step 2")
    train['sent_emb'] = train['sentences'].apply(_sentence_vectors)

    print("step 3")
    train['quest_emb'] = train['question'].apply(_question_vector)

    return train

In [105]:
# Adds the 'sentences', 'sent_emb' and 'quest_emb' columns; prints a marker per step.
train = process_data(train)

step 1
step 2
step 3


In [106]:
# Inspect the first three rows, including the newly added columns.
train.head(3)

Unnamed: 0,context,question,sentences,sent_emb,quest_emb
0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"In Pillow, RGBA values are represented how ?",[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"[[0.04268799, 0.09025154, 0.124629974, 0.02754...","[[0.088152945, 0.08341008, -0.009891508, -0.04..."
1,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Normally transparent pixels are pasted how ?,[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"[[0.04268799, 0.09025154, 0.124629974, 0.02754...","[[0.058497343, 0.080020554, 0.120430276, -0.05..."
2,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Cropping an image means what ?,[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"[[0.04268799, 0.09025154, 0.124629974, 0.02754...","[[0.063782886, 0.109130405, 0.11897445, -0.086..."


In [107]:
def cosine_sim(x):
    """Compute the cosine distance between each row's question and its sentences.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide a "sent_emb" column (an iterable of sentence vectors per
        row) and a "quest_emb" column (one question vector per row).

    Returns
    -------
    list of list
        One inner list per row of ``x``, in row order, holding the value of
        ``scipy.spatial.distance.cosine`` for every sentence vector against
        that row's question vector. These are distances, so smaller means
        more similar despite the function's name.
    """
    distances = []

    # Walk the rows of `x` itself, by position: this does not depend on any
    # global frame, nor on the index labels being exactly 0..n-1.
    for sent_vecs, quest_vec in zip(x["sent_emb"], x["quest_emb"]):
        # scipy's cosine() expects 1-D inputs, so flatten embeddings that are
        # stored with a leading singleton axis.
        question = np.ravel(quest_vec)
        distances.append(
            [spatial.distance.cosine(np.ravel(vec), question) for vec in sent_vecs]
        )

    return distances

In [108]:
def pred_idx(distances):
    """Return the position of the smallest distance, skipping NaN entries.

    Parameters
    ----------
    distances : sequence of float
        Distances for one row; must be non-empty.

    Returns
    -------
    numpy integer
        Index of the minimum non-NaN value. If every entry is NaN the result
        is 0.

    Raises
    ------
    ValueError
        If ``distances`` is empty (raised by ``np.argmin``).
    """
    values = np.asarray(distances, dtype=float)
    # np.argmin would return the position of the first NaN; treating NaN as
    # +inf keeps an undefined distance from being selected as the best match.
    return np.argmin(np.where(np.isnan(values), np.inf, values))

In [109]:
# Store, per row, the list of question-to-sentence cosine distances.
train['cosine_sim'] = cosine_sim(train)

In [110]:
# Predicted sentence position per row: the index chosen by pred_idx from that
# row's distance list.
train["pred_idx_cos"] = train["cosine_sim"].apply(pred_idx)

In [111]:
# Preview the frame with the 'cosine_sim' and 'pred_idx_cos' columns added.
train.head()

Unnamed: 0,context,question,sentences,sent_emb,quest_emb,cosine_sim,pred_idx_cos
0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"In Pillow, RGBA values are represented how ?",[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"[[0.04268799, 0.09025154, 0.124629974, 0.02754...","[[0.088152945, 0.08341008, -0.009891508, -0.04...","[0.39357614517211914, 0.36637061834335327, 0.4...",17
1,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Normally transparent pixels are pasted how ?,[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"[[0.04268799, 0.09025154, 0.124629974, 0.02754...","[[0.058497343, 0.080020554, 0.120430276, -0.05...","[0.4045429229736328, 0.4004858136177063, 0.414...",134
2,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Cropping an image means what ?,[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"[[0.04268799, 0.09025154, 0.124629974, 0.02754...","[[0.063782886, 0.109130405, 0.11897445, -0.086...","[0.47937631607055664, 0.4632381796836853, 0.47...",89


In [112]:
# Question text of the row labelled 0.
train['question'][0]

'In Pillow, RGBA values are represented how ?'

In [113]:
# Sentence of row 0 at the predicted index for that row.
train["sentences"][0][train['pred_idx_cos'][0]]

'In Pillow, RGBA values are represented by a tuple of four integer values.'

In [114]:
# Question text of the row labelled 1.
train['question'][1]

'Normally transparent pixels are pasted how ?'

In [115]:
# Sentence of row 1 at the predicted index for that row.
train["sentences"][1][train['pred_idx_cos'][1]]

'If the image you want \nto paste has transparent pixels, pass the Image object as the third argument \nso that a solid rectangle isn’t pasted .'

In [116]:
# Question text of the row labelled 2.
train['question'][2]

'Cropping an image means what ?'

In [117]:
# Sentence of row 2 at the predicted index for that row.
train["sentences"][2][train['pred_idx_cos'][2]]

'Cropping Images\nCropping an image means selecting a rectangular region inside an image \nand removing everything outside the rectangle.'