### Installs

In [None]:
!sudo pip install plotly
!sudo pip install lightgbm
!sudo pip install bokeh
!sudo pip install kaggle

### Download the data

In [2]:
import os
os.environ['KAGGLE_CONFIG_DIR']='/home/jupyter/kaggle-competitions/tfqa/config/.kaggle'
!chmod 600 $KAGGLE_CONFIG_DIR/kaggle.json
! mkdir /home/jupyter/kaggle-competitions/tfqa/data
os.chdir('/home/jupyter/kaggle-competitions/tfqa/data')
!kaggle competitions download tensorflow2-question-answering -f sample_submission.csv
!kaggle competitions download tensorflow2-question-answering -f simplified-nq-train.jsonl
!kaggle competitions download tensorflow2-question-answering -f simplified-nq-test.jsonl
!unzip simplified-nq-train.jsonl
!unzip simplified-nq-test.jsonl

Downloading sample_submission.csv to /home/jupyter/kaggle-competitions/tfqa/data
  0%|                                               | 0.00/18.2k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 18.2k/18.2k [00:00<00:00, 14.3MB/s]


### Imports

In [8]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected = True)
import plotly.graph_objs as go
import plotly.express as px

# Use the fully qualified option name: the abbreviated 'max_columns' pattern is
# ambiguous in recent pandas (it also matches 'styler.render.max_columns') and
# raises OptionError.
pd.set_option('display.max_columns', 1000)

from bokeh.models import Panel, Tabs
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

import lightgbm as lgb

import plotly.figure_factory as ff
import gc

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

import json

from keras.preprocessing import text, sequence
from sklearn.feature_extraction.text import CountVectorizer

Using TensorFlow backend.


In [9]:
# Directory containing the competition files. The trailing slash matters: file
# names are appended to it by plain string concatenation (path + file name).
path = '/home/jupyter/kaggle-competitions/tfqa/data/'
# File names, relative to `path`.
train_path = 'simplified-nq-train.jsonl'
test_path = 'simplified-nq-test.jsonl'
sample_submission_path = 'sample_submission.csv'

def read_data(path, sample = True, chunksize = 30000):
    """Load a JSON-lines file into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Path of the file to read (one JSON record per line).
    sample : bool, default True
        When true, only the first `chunksize` records are parsed, line by line.
        When false, the whole file is loaded with `pd.read_json`.
    chunksize : int, default 30000
        Maximum number of records to load when `sample` is true.

    Returns
    -------
    pandas.DataFrame
        One row per record. A message with the resulting shape is printed.
    """
    if sample:
        records = []
        with open(path, 'rt') as reader:
            # Iterating the file stops cleanly at end-of-file, so a file with fewer
            # than `chunksize` lines no longer ends in json.loads('') raising.
            for line in reader:
                if len(records) >= chunksize:
                    break
                # Skip blank lines (e.g. a trailing empty line) rather than failing to parse them.
                if line.strip():
                    records.append(json.loads(line))
        df = pd.DataFrame(records)
        print('Our sampled dataset have {} rows and {} columns'.format(df.shape[0], df.shape[1]))
    else:
        df = pd.read_json(path, orient = 'records', lines = True)
        print('Our dataset have {} rows and {} columns'.format(df.shape[0], df.shape[1]))
        gc.collect()
    return df

# Training set: only the first records are loaded (sample = True, default chunksize).
train = read_data(path+train_path, sample = True)
# Test set: read in full (sample = False).
test = read_data(path+test_path, sample = False)
# Preview the first rows of the training frame.
train.head()

Our sampled dataset have 30000 rows and 6 columns
Our dataset have 346 rows and 4 columns


Unnamed: 0,annotations,document_text,document_url,example_id,long_answer_candidates,question_text
0,"[{'annotation_id': 593165450220027640, 'short_...",Email marketing - Wikipedia <H1> Email marketi...,https://en.wikipedia.org//w/index.php?title=Em...,5655493461695504401,"[{'start_token': 14, 'top_level': True, 'end_t...",which is the most common use of opt-in e-mail ...
1,"[{'annotation_id': 12034874153783787365, 'shor...",The Mother ( How I Met Your Mother ) - wikiped...,https://en.wikipedia.org//w/index.php?title=Th...,5328212470870865242,"[{'start_token': 28, 'top_level': True, 'end_t...",how i.met your mother who is the mother
2,"[{'annotation_id': 10527123009892725162, 'shor...",Human fertilization - wikipedia <H1> Human fer...,https://en.wikipedia.org//w/index.php?title=Hu...,4435104480114867852,"[{'start_token': 14, 'top_level': True, 'end_t...",what type of fertilisation takes place in humans
3,"[{'annotation_id': 14634796365152556576, 'shor...",List of National Football League career quarte...,https://en.wikipedia.org//w/index.php?title=Li...,5289242154789678439,"[{'start_token': 28, 'top_level': True, 'end_t...",who had the most wins in the nfl
4,"[{'annotation_id': 11038549994888625916, 'shor...",Roanoke Colony - wikipedia <H1> Roanoke Colony...,https://en.wikipedia.org//w/index.php?title=Ro...,5489863933082811018,"[{'start_token': 32, 'top_level': True, 'end_t...",what happened to the lost settlement of roanoke


In [12]:
# Load the sample submission file and report how many rows it contains.
sample_submission = pd.read_csv(
    path + sample_submission_path
)
print('Our sample submission have {} rows'.format(len(sample_submission)))
sample_submission.head()

Our sample submission have 692 rows


Unnamed: 0,example_id,PredictionString
0,-1011141123527297803_long,
1,-1011141123527297803_short,
2,-1028916936938579349_long,
3,-1028916936938579349_short,
4,-1055197305756217938_long,


### Missing Values

In [15]:
def missing_values(df):
    """Count the null entries in every column of `df`.

    Returns a DataFrame with one row per column of `df`: the column name under
    'features' and its number of null values under 'n_missing_values'.
    """
    null_counts = df.isnull().sum()
    # reset_index() turns the column names (the Series index) into a regular column.
    summary = null_counts.reset_index()
    summary.columns = ['features', 'n_missing_values']
    return summary
# Null counts per column of the training frame.
missing_values(train)

Unnamed: 0,features,n_missing_values
0,annotations,0
1,document_text,0
2,document_url,0
3,example_id,0
4,long_answer_candidates,0
5,question_text,0


In [16]:
# Null counts per column of the test frame.
missing_values(test)

Unnamed: 0,features,n_missing_values
0,document_text,0
1,example_id,0
2,long_answer_candidates,0
3,question_text,0


In [17]:
# Question of the training example with row label 0.
question_text_0 = train.loc[0, 'question_text']
question_text_0

'which is the most common use of opt-in e-mail marketing'

In [18]:
# Split the document of the example with row label 0 on whitespace into a list of tokens.
document_text_0 = train.loc[0, 'document_text'].split()
# Preview: the first 800 tokens joined back with single spaces.
" ".join(document_text_0[:800])

"Email marketing - Wikipedia <H1> Email marketing </H1> Jump to : navigation , search <Table> <Tr> <Td> </Td> <Td> ( hide ) This article has multiple issues . Please help improve it or discuss these issues on the talk page . ( Learn how and when to remove these template messages ) <Table> <Tr> <Td> </Td> <Td> This article needs additional citations for verification . Please help improve this article by adding citations to reliable sources . Unsourced material may be challenged and removed . ( September 2014 ) ( Learn how and when to remove this template message ) </Td> </Tr> </Table> <Table> <Tr> <Td> </Td> <Td> This article possibly contains original research . Please improve it by verifying the claims made and adding inline citations . Statements consisting only of original research should be removed . ( January 2015 ) ( Learn how and when to remove this template message ) </Td> </Tr> </Table> ( Learn how and when to remove this template message ) </Td> </Tr> </Table> <Table> <Tr> <T

In [19]:
# Long-answer candidates of the example with row label 0; display the first ten.
long_answer_candidates_0 = train.loc[0, 'long_answer_candidates']
long_answer_candidates_0[0:10]

[{'end_token': 170, 'start_token': 14, 'top_level': True},
 {'end_token': 169, 'start_token': 15, 'top_level': False},
 {'end_token': 103, 'start_token': 52, 'top_level': False},
 {'end_token': 102, 'start_token': 53, 'top_level': False},
 {'end_token': 156, 'start_token': 103, 'top_level': False},
 {'end_token': 155, 'start_token': 104, 'top_level': False},
 {'end_token': 321, 'start_token': 170, 'top_level': True},
 {'end_token': 180, 'start_token': 171, 'top_level': False},
 {'end_token': 186, 'start_token': 180, 'top_level': False},
 {'end_token': 224, 'start_token': 186, 'top_level': False}]

In [20]:
# First annotation (inner index 0) of the training example with row label 0.
annotations_0 = train['annotations'][0][0]
annotations_0

{'annotation_id': 593165450220027640,
 'long_answer': {'candidate_index': 54,
  'end_token': 2019,
  'start_token': 1952},
 'short_answers': [{'end_token': 1969, 'start_token': 1960}],
 'yes_no_answer': 'NONE'}

In [21]:
# Show the question next to the answer texts, recovered by slicing the document
# tokens with the start/end token offsets stored in the annotation.
print('Our question is : ', question_text_0)

_short_span = annotations_0['short_answers'][0]
_short_tokens = document_text_0[_short_span['start_token']:_short_span['end_token']]
print('Our short answer is : ', " ".join(_short_tokens))

_long_span = annotations_0['long_answer']
_long_tokens = document_text_0[_long_span['start_token']:_long_span['end_token']]
print('Our long answer is : ', " ".join(_long_tokens))

Our question is :  which is the most common use of opt-in e-mail marketing
Our short answer is :  a newsletter sent to an advertising firm 's customers
Our long answer is :  <P> A common example of permission marketing is a newsletter sent to an advertising firm 's customers . Such newsletters inform customers of upcoming events or promotions , or new products . In this type of advertising , a company that wants to send a newsletter to their customers may ask them at the point of purchase if they would like to receive the newsletter . </P>


### Target Variable Exploration

In [23]:
# Collect the 'yes_no_answer' field of the first annotation of every training example.
yes_no_answer = pd.DataFrame({
    'yes_no_answer': [
        train['annotations'][i][0]['yes_no_answer']
        for i in range(len(train))
    ]
})

In [27]:
# Number of examples per yes/no label.
yes_no_answer['yes_no_answer'].value_counts()

NONE    29609
YES       235
NO        156
Name: yes_no_answer, dtype: int64

98.7% of the examples have NONE as their yes/no answer.
YES and NO together account for only 1.3% of the observations!

In [31]:
def _format_span(span):
    """Format a token span dict as 'start_token:end_token'."""
    return "{st}:{et}".format(st=span['start_token'], et=span['end_token'])


def _short_answer_label(annotation):
    """Short-answer target of one annotation: first span, 'YES'/'NO', or 'EMPTY'."""
    spans = annotation['short_answers']
    if spans:
        # Only the first short answer is kept when several are present.
        return _format_span(spans[0])
    yes_no = annotation['yes_no_answer']
    return yes_no if yes_no in ('YES', 'NO') else 'EMPTY'


def _long_answer_label(annotation):
    """Long-answer target of one annotation: its span, or 'EMPTY' when start_token is -1."""
    span = annotation['long_answer']
    if span['start_token'] == -1:
        return 'EMPTY'
    return _format_span(span)


def extract_target_variable(df, short = True):
    """Extract the answer targets from the 'annotations' column into a DataFrame.

    Only the first annotation of each example is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'annotations' column; each cell is a list of annotation dicts
        with the keys 'short_answers', 'yes_no_answer' and 'long_answer'.
    short : bool, default True
        True  -> return the short-answer targets in a 'short_answer' column:
                 'start:end' of the first short answer, otherwise 'YES' / 'NO' when
                 the yes/no answer is one of those, otherwise 'EMPTY'.
        False -> return the long-answer targets in a 'long_answer' column:
                 'start:end' of the long answer, or 'EMPTY' when its start_token is -1.

    Returns
    -------
    pandas.DataFrame
        One row per row of `df`, in the same order, with a fresh 0..n-1 index.
    """
    # Iterate over the column values instead of looking rows up by label
    # (df['annotations'][i]), which fails on any frame whose index is not 0..n-1
    # (for instance after filtering or sampling).
    first_annotations = [annotations[0] for annotations in df['annotations']]
    if short:
        labels = [_short_answer_label(annotation) for annotation in first_annotations]
        return pd.DataFrame({'short_answer': labels})
    labels = [_long_answer_label(annotation) for annotation in first_annotations]
    return pd.DataFrame({'long_answer': labels})
        
# Default short = True: build the short-answer targets of the training frame.
short_answer = extract_target_variable(train)
short_answer.head()

Unnamed: 0,short_answer
0,1960:1969
1,213:215
2,EMPTY
3,512:514
4,EMPTY


In [32]:
# Values other than 'EMPTY', 'YES' and 'NO' are token spans: tag those rows as
# 'TEXT' and keep the three labels unchanged.
_non_span_labels = ['EMPTY', 'YES', 'NO']
short_answer['type'] = short_answer['short_answer'].where(
    short_answer['short_answer'].isin(_non_span_labels),
    'TEXT',
)

In [38]:
# Number of examples per short-answer type.
short_answer['type'].value_counts()

EMPTY    19041
TEXT     10568
YES        235
NO         156
Name: type, dtype: int64

In [39]:
# short = False (passed positionally): build the long-answer targets of the training frame.
long_answer = extract_target_variable(train, False)
long_answer.head()

Unnamed: 0,long_answer
0,1952:2019
1,212:310
2,319:438
3,509:576
4,EMPTY


In [40]:
# Keep 'EMPTY' as is and tag every other value (a token span) as 'TEXT',
# then count the examples of each type.
long_answer['type'] = long_answer['long_answer'].where(
    long_answer['long_answer'] == 'EMPTY',
    'TEXT',
)
long_answer['type'].value_counts()

EMPTY    15047
TEXT     14953
Name: type, dtype: int64

Let's explore the question_text column, which holds the question we want to answer with a segment of the document text.

Count the number of words and check distribution
Most common words

In [41]:
def count_word_frequency(series, top = 0, bot = 20):
    """Relative frequency of the most common words in a series of texts.

    Words are extracted and counted with a default `CountVectorizer`, ranked by
    descending count, and rows `top` (inclusive) to `bot` (exclusive) of that
    ranking are returned.

    Parameters
    ----------
    series : iterable of str
        Texts to analyse.
    top : int, default 0
        Position of the first ranked word to return.
    bot : int, default 20
        Position just past the last ranked word to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Word', with a single 'Percentage' column holding each word's
        share of all counted word occurrences as a fraction (0-1) rounded to
        3 decimals.
    """
    cv = CountVectorizer()
    cv_fit = cv.fit_transform(series)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(cv, 'get_feature_names_out'):
        word_list = cv.get_feature_names_out()
    else:
        word_list = cv.get_feature_names()
    # Sum the columns on the sparse matrix directly; toarray() would first build
    # a dense (n_texts x n_words) array.
    count_list = np.asarray(cv_fit.sum(axis=0)).ravel()
    frequency = pd.DataFrame({'Word': word_list, 'Frequency': count_list})
    frequency = frequency.sort_values(['Frequency'], ascending = False)
    frequency['Percentage'] = (frequency['Frequency'] / frequency['Frequency'].sum()).round(3)
    frequency = frequency.drop('Frequency', axis = 1)
    return frequency.iloc[top:bot].set_index('Word')
    
# Word frequencies of the training questions, using the defaults top = 0 and bot = 20.
frequency = count_word_frequency(train['question_text'])

In [42]:
# Display the word-frequency table.
frequency

Unnamed: 0_level_0,Percentage
Word,Unnamed: 1_level_1
the,0.095
of,0.043
in,0.038
who,0.029
is,0.025
what,0.019
when,0.016
to,0.013
and,0.012
where,0.012
