In [1]:
from fastai import *
from fastai.text import *

In [2]:
import numpy as np
import pandas as pd
import json


In [3]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],verbose = 1):
    """
    Flatten a SQuAD-style json file into a single dataframe, one row per question.

    input_file_path: path to the squad json file (read as UTF-8).
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress progress messages, default is 1

    Returns a dataframe holding the question id (in a column produced by
    reset_index), 'question', 'context', the answer fields found at the
    deepest level, and 'c_id', an integer code identifying each distinct context.

    NOTE(review): answers are aligned to questions by question id, so this
    presumably requires at most one answer per question (questions without
    an answer get NaN in the answer columns) - confirm before using it on
    files that carry several answers per question.
    """
    # pandas >= 1.0 exposes json_normalize at the top level; older releases
    # only provide it under pd.io.json, so fall back to that when needed.
    normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize
    levels = list(record_path)  # never touch the (shared) default list

    if verbose:
        print("Reading the json file")
    # context manager closes the handle; explicit encoding avoids depending
    # on the platform default when contexts contain non-ASCII text
    with open(input_file_path, encoding="utf-8") as handle:
        file = json.load(handle)
    if verbose:
        print("processing...")

    # parsing different level's in the json file
    js = normalize(file, levels)        # one row per answer
    m = normalize(file, levels[:-1])    # one row per question
    r = normalize(file, levels[:-2])    # one row per paragraph

    # combining it into single dataframe
    # repeat each paragraph context once per question it contains
    idx = np.repeat(r['context'].values, r.qas.str.len())
    # repeat each question id once per answer it owns
    ndx = np.repeat(m['id'].values, m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    main = pd.concat(
        [m[['id', 'question', 'context']].set_index('id'), js.set_index('q_idx')],
        axis=1,  # keyword form: newer pandas rejects a positional axis
        sort=False,
    ).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [4]:
# training data: where the SQuAD v2.0 json lives and which nesting levels to flatten
input_file_path = 'dataset/train-v2.0.json'
record_path = ['data', 'paragraphs', 'qas', 'answers']

# release cached GPU memory before the load
torch.cuda.empty_cache()

# flatten the json into one dataframe row per question
train_data = squad_json_to_dataframe_train(input_file_path, record_path)

Reading the json file
processing...
shape of the dataframe is (130319, 6)
Done


In [5]:
# Keep only the first 20000 rows (as an independent copy) and show the new shape.
train_data = train_data.head(20000).copy()
train_data.shape

(20000, 6)

In [6]:
# Preview the first rows of the truncated training frame.
train_data.head()

Unnamed: 0,index,question,context,answer_start,text,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,269.0,in the late 1990s,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,207.0,singing and dancing,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,526.0,2003,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,166.0,"Houston, Texas",0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,276.0,late 1990s,0


In [7]:
# The 'question' column of the truncated training frame.
# NOTE(review): the name looks like a typo for `train_questions`, and none of the
# visible cells read it - confirm against the rest of the notebook before renaming or removing.
tran_questions = train_data['question']

In [8]:
def parallel_trees(m, fn, n_jobs=16):
    """
    Apply `fn` to every element of `m.estimators_` using a thread pool.

    m: object exposing an iterable `estimators_` attribute.
    fn: callable taking one estimator.
    n_jobs: maximum number of worker threads, default is 16.

    Returns a list of the results, in the same order as `m.estimators_`.
    """
    # The with-block shuts the pool down (joining its worker threads) once
    # all results are collected, instead of leaving the threads behind.
    with ThreadPoolExecutor(n_jobs) as pool:
        return list(pool.map(fn, m.estimators_))

In [9]:
valid_pct = 0.05  # validation percent
split_seed = 42   # fixed seed so the train/validation split is identical on every run

# Shuffle row positions with a dedicated, seeded generator (the global
# np.random state is left untouched), then carve the first `cut` rows off
# as the validation set and keep the rest for training.
shuffled_positions = np.random.RandomState(split_seed).permutation(len(train_data))
df = train_data.iloc[shuffled_positions]
cut = int(valid_pct * len(train_data)) + 1
train_df, valid_df = df[cut:], df[:cut]

In [10]:
# Build the language-model data bunch from the train/validation frames, taking the
# text from the 'question' column. 'data' is passed as the first argument -
# presumably the working path for fastai artifacts, TODO confirm.
data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='question')

In [11]:
# Create a language-model learner on data_lm, requesting the URLs.WT103 pretrained
# model with drop_mult=0.5 (presumably a dropout multiplier - confirm), then run
# fit_one_cycle with arguments (1, 1e-2) - presumably 1 epoch at a max learning
# rate of 1e-2, confirm against the fastai version in use.
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,4.242163,3.341255,0.429657


In [12]:
# Unfreeze the learner and run fit_one_cycle again with (1, 1e-3), i.e. a
# second argument ten times smaller than in the previous fit_one_cycle call.
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy
1,3.135643,2.849936,0.490649


In [13]:
# Values handed to learn.fit(): `wd` and `lr`, with `lrs` as an alias of `lr`.
wd, lr = 1e-7, 1e-3
lrs = lr

In [14]:
# Call learn.fit with 15 as the first argument, followed by lrs and wd positionally
# (presumably epochs, learning rate and weight decay - confirm against the fastai signature).
# NOTE(review): the recorded output lists only 10 epochs, so this run was presumably
# interrupted before all 15 finished - confirm before relying on the numbers.
learn.fit(15,lrs, wd)

epoch,train_loss,valid_loss,accuracy
1,2.852762,2.67506,0.510437
2,2.712082,2.590513,0.520364
3,2.586449,2.535226,0.525519
4,2.483176,2.492414,0.529681
5,2.395777,2.464091,0.532155
6,2.311199,2.43502,0.537886
7,2.2322,2.420893,0.537826
8,2.152505,2.411088,0.541818
9,2.0844,2.396587,0.541928
10,2.023641,2.392487,0.543678


In [15]:
number_of_ideas = 100   # how many generated questions to collect
max_attempts = 100      # upper bound on learn.predict calls, so the loop always ends
ideas_counter = 0
all_ideas = []

for _ in range(max_attempts):
    generated = learn.predict("xxbos xxfld 1", n_words=20, temperature=0.8)
    # Split the generated text on the marker and drop the first and last
    # pieces, keeping only pieces delimited by the marker on both sides.
    ideas = generated.split("xxbos xxfld 1")[1:-1]

    for raw_idea in ideas:
        # The marker itself is already removed by split(); only the
        # "xxmaj " tokens remain to be stripped.
        idea = raw_idea.strip().replace("xxmaj ", "").strip()
        if not idea:
            continue
        all_ideas.append(idea)
        ideas_counter += 1
        # stop as soon as the target is reached, so exactly
        # number_of_ideas entries are kept rather than overshooting
        if ideas_counter >= number_of_ideas:
            break

    if ideas_counter >= number_of_ideas:
        break

In [16]:
# Display the generated questions collected in all_ideas.
all_ideas

["when did youlou decide he would run a date 's preaspirated clock ?",
 'what commercial facility was responsible for the price of automobile ?',
 'in how many countries are there in 2010 ?',
 'xxup cbs and columbia records renamed to whom ?',
 "what percentage of plymouth 's residents are german or chinese ?",
 'what process of mass production is most of the remaining parts of ectosymbiosis ?',
 'how old was bell when he went to canada ?',
 'where is the atomic bomb detonated ?',
 'who has the power to issue gifts ?',
 'what talk show did american idol first air on american idol ?',
 'who described the film as " a musical production that encourages all gets back " ?',
 'fleming wrote which two extra main missions ?',
 'what is the name of the single released by universal pictures ?',
 'what made the spanish used in and around estonians ?',
 "beyonce 's voice span is how ?",
 'what is the name for the italian 2 gender bond ?',
 'what language is the congo mostly composed of ?',
 'how m