In [1]:
from fastai import *
from fastai.text import *

In [2]:
import numpy as np
import pandas as pd
import json


In [3]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],verbose = 1):
    """
    Flatten a SQuAD-style json file into a single question-level dataframe.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1

    Returns a dataframe holding the question id (in the column produced by
    ``reset_index``), the 'question' text, the paragraph 'context' it belongs
    to, the fields of the answer records, and 'c_id', an integer code that
    identifies each distinct context string. Questions that have no answer
    record keep their row; their answer fields are missing values.

    NOTE(review): the final column-wise concat aligns on question id, so it
    presumably requires at most one answer record per question - confirm
    before using this on files where questions carry several answers.
    """
    # Copy so the shared default list (or the caller's list) is never aliased.
    record_path = list(record_path)

    # pandas >= 1.0 exposes json_normalize at the top level; fall back to the
    # historical location for older installations.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    if verbose:
        print("Reading the json file")
    # Context manager closes the handle; SQuAD files are UTF-8 encoded JSON,
    # so do not rely on the platform default encoding.
    with open(input_file_path, encoding="utf-8") as fh:
        file = json.load(fh)
    if verbose:
        print("processing...")

    # parsing different level's in the json file
    js = normalize(file, record_path)        # deepest level (answers by default)
    m = normalize(file, record_path[:-1])    # one level up (questions by default)
    r = normalize(file, record_path[:-2])    # two levels up (paragraphs by default)

    #combining it into single dataframe
    # Repeat every context once per question it contains, and every question
    # id once per answer it contains, so both arrays line up row-for-row with
    # the deeper frame they are attached to.
    idx = np.repeat(r['context'].to_numpy(), r['qas'].str.len())
    ndx = np.repeat(m['id'].to_numpy(), m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx

    # axis must be passed by keyword: recent pandas rejects positional axis.
    main = pd.concat(
        [m[['id', 'question', 'context']].set_index('id'), js.set_index('q_idx')],
        axis=1,
        sort=False,
    ).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [4]:
# training data
# Ask PyTorch to release its cached GPU memory before loading the dataset.
# NOTE(review): `torch` is not imported explicitly in the visible cells;
# presumably it arrives through the fastai star imports - confirm.
torch.cuda.empty_cache()
# Location of the SQuAD json file, relative to the notebook's working directory.
input_file_path = 'dataset/train-v2.0.json'
# Nesting levels to walk, from the top of the json down to the answer records.
record_path = ['data','paragraphs','qas','answers']
# Flatten the json into one dataframe (the recorded output reports 130319 rows, 6 columns).
train_data = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path)

Reading the json file
processing...
shape of the dataframe is (130319, 6)
Done


In [5]:
# Keep only the first 10,000 rows (as an independent copy) for the rest of
# the notebook, then display the resulting shape.
train_data = train_data.head(10000).copy()
train_data.shape

(10000, 6)

In [6]:
# Preview the first rows of the truncated training frame.
train_data.head()

Unnamed: 0,index,question,context,answer_start,text,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,269.0,in the late 1990s,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,207.0,singing and dancing,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,526.0,2003,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,166.0,"Houston, Texas",0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,276.0,late 1990s,0


In [7]:
# Keep a handle on the question column of the training frame.
# NOTE(review): `tran_questions` looks like a typo for `train_questions`, and
# the name is not used in any of the visible cells - confirm before renaming
# or removing it.
tran_questions = train_data['question']

In [8]:
def parallel_trees(m, fn, n_jobs=16):
    """Apply ``fn`` to every item of ``m.estimators_`` using a thread pool.

    m: object exposing an ``estimators_`` iterable.
    fn: callable invoked once per item of ``m.estimators_``.
    n_jobs: maximum number of worker threads (default 16).

    Returns a list with one result per item, in the same order as
    ``m.estimators_``. An exception raised by ``fn`` propagates to the caller.
    """
    # Imported locally so the helper does not depend on the name leaking in
    # through the star imports at the top of the notebook.
    from concurrent.futures import ThreadPoolExecutor

    # The context manager shuts the pool down and joins its worker threads
    # once all results have been collected, instead of leaving that to
    # garbage collection.
    with ThreadPoolExecutor(n_jobs) as pool:
        return list(pool.map(fn, m.estimators_))

In [9]:
valid_pct = 0.05 #validation percent
# Shuffle with a dedicated, seeded generator so the train/validation split is
# the same on every Restart & Run All and the global numpy random state is
# left untouched.
split_rng = np.random.RandomState(42)
df = train_data.iloc[split_rng.permutation(len(train_data))]
# Number of shuffled rows reserved for validation (always at least one).
cut = int(valid_pct * len(train_data)) + 1
# Positional slices of the shuffled frame: the first `cut` rows validate,
# the remainder train.
train_df, valid_df = df.iloc[cut:], df.iloc[:cut]

In [10]:
# Build the language-model data bunch from the train/validation frames, using
# the 'question' column as the text.
# NOTE(review): the first argument 'data' is presumably the working directory
# fastai uses for its files - confirm against the installed fastai version.
data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='question')

In [11]:
# Create the language-model learner on top of `data_lm`.
# NOTE(review): `URLs.WT103` presumably selects WikiText-103 pretrained
# weights, and `drop_mult` presumably scales the dropout rates - confirm
# against the installed fastai version.
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
# One one-cycle epoch with 1e-2 as the learning-rate argument; the recorded
# output shows a single epoch row.
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,4.942879,3.748623,0.346051


In [None]:
# Unfreeze the learner and run one more one-cycle epoch with 1e-3 as the
# learning-rate argument.
# NOTE(review): this cell is labelled `In [None]`, i.e. it carries no
# execution count and no recorded output, so the results shown further down
# may have been produced without this step - confirm before relying on them.
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

In [12]:
# Hyper-parameters for the longer training run in the next cell.
wd=1e-7  # passed as the third positional argument of learn.fit (weight decay, judging by the name)
lr=1e-3  # learning rate
# `lrs` is just an alias of the single learning rate above.
lrs = lr

In [13]:
# Run `learn.fit` for 15 epochs with learning rate `lrs`; `wd` is passed
# positionally as the third argument.
# NOTE(review): 15 epochs are requested but the recorded output lists only
# 10 epoch rows - the run was presumably interrupted or the output is stale;
# confirm.
learn.fit(15,lrs, wd)

epoch,train_loss,valid_loss,accuracy
1,3.550487,3.015759,0.442345
2,3.248359,2.83924,0.463125
3,3.042901,2.722095,0.474685
4,2.89015,2.672304,0.48595
5,2.763742,2.633703,0.487403
6,2.676948,2.616692,0.490885
7,2.59039,2.585011,0.494913
8,2.52254,2.565977,0.495276
9,2.467773,2.553158,0.502422
10,2.425944,2.554808,0.504546


In [14]:
number_of_ideas = 100       # stop as soon as this many questions are collected
max_attempts = 100          # upper bound on the number of learn.predict calls
prompt = "xxbos xxfld 1"    # marker used both as the seed text and as the separator
ideas_counter = 0
all_ideas = []

for _ in range(max_attempts):
    generated = learn.predict(prompt, n_words=20, temperature=0.8)
    # Split on the marker and discard the first and last pieces.
    # NOTE(review): presumably the first piece is the text before the seed
    # marker and the last piece may be cut short by the n_words limit - confirm.
    for candidate in generated.split(prompt)[1:-1]:
        # Splitting already removed every occurrence of the marker, so only
        # the surrounding whitespace is left to clean up.
        candidate = candidate.strip()
        if candidate:
            all_ideas.append(candidate)
            ideas_counter += 1
            # Stop exactly at the target rather than overshooting it.
            if ideas_counter >= number_of_ideas:
                break

    if ideas_counter >= number_of_ideas:
        break

In [15]:
# Display every generated question collected by the previous cell.
all_ideas

['how does the admissions separate exclusively from within academic groups as defined in regards to who ?',
 'genome size can be accomplished by : compositions to transfer existing genomes ?',
 'xxmaj smaller dogs in higher eukaryotes near what are two new games ?',
 'where is led web sites refer to silico issues ?',
 'xxmaj about how many xxmaj buildings collapsed ?',
 'genome sequence features approximately how many chromosomes can be controlled by genome size via genome size ?',
 'in 1995 , business protesters the torch visited xxmaj chopin in xxmaj nohant ?',
 'xxmaj which other way did he develop ?',
 'i - xxup ii made decision when bodhi was delivered to him in season twelve ?',
 "xxmaj which of the names of the game 's roots ?",
 'after leaving the palace 1980s , how much money did xxmaj crazy come into their life ?',
 'a replacement feature can removing without removing profit as per xxup r&b ?',
 'xxmaj when did the actor attend a school during the season of xxmaj american xxm