Group: poisson

Student Name: Kazi Abir Adnan
Student ID: 940406

Student Name: Daniel Gil
Student ID: 905923

Python version used: 3.5.4

# Load Necessary files and libraries

<b>Loading libraries</b>

In [None]:
import difflib
import json
import nltk
import numpy as np
import pickle
import pandas as pd
from collections import defaultdict
from collections import Counter
from difflib import SequenceMatcher
from gensim.models import Word2Vec
from itertools import groupby
from math import log
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tag.stanford import CoreNLPNERTagger
from nltk.tokenize.punkt import PunktSentenceTokenizer,PunktTrainer
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
print("Done")

<b>Utility functions</b>

In [None]:
def save_obj(obj, name):
    """Pickle ``obj`` into ``obj/<name>.pkl`` with the highest pickle protocol."""
    target = 'obj/' + name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object previously pickled at ``obj/<name>.pkl``."""
    source = 'obj/' + name + '.pkl'
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

def check_results(predictions, classifications):
    """Print the accuracy and the per-class report of ``predictions``.

    ``classifications`` holds the reference labels; it is passed as the first
    argument to both scikit-learn metrics.
    """
    print("Accuracy:")
    accuracy = accuracy_score(classifications, predictions)
    print(accuracy)
    report = classification_report(classifications, predictions)
    print(report)

def lemmatize(word):
    """Return the WordNet lemma of ``word``.

    The noun reading is tried first; the verb reading is used only when the
    noun lemma leaves the word unchanged.
    """
    as_noun = lemmatizer.lemmatize(word, wn.NOUN)
    if as_noun != word:
        return as_noun
    return lemmatizer.lemmatize(word, wn.VERB)

def get_tfidf(word, tf_idf, paragraph):
    """Return the TF-IDF weight of ``word`` in ``paragraph``.

    Parameters:
        word: term to look up.
        tf_idf: mapping ``term -> sequence of [paragraph_id, weight]`` postings.
        paragraph: id of the paragraph whose weight is wanted.

    Returns:
        The weight of the first posting whose paragraph id equals
        ``paragraph``, or ``0.0`` when the term or the paragraph is absent.
    """
    # .get() keeps a plain dict from raising KeyError and keeps a defaultdict
    # from growing an empty entry for every unknown word that is looked up.
    for posting in tf_idf.get(word, ()):
        if posting[0] == paragraph:
            return posting[1]
    return 0.0

def sentence_vector(word2vec_model, sentence, tf_idf, paragraph, qt):
    """Build one vector for ``sentence`` from TF-IDF-weighted word vectors.

    Each token is lower-cased, dropped if it is a stop word, lemmatized and
    kept only if the lemma is in the model vocabulary. Its vector is scaled by
    its TF-IDF weight in ``paragraph``. The unweighted vector of the question
    type ``qt`` is appended when it is in the vocabulary, and the element-wise
    mean of everything collected is returned.

    NOTE(review): when nothing is collected the result is whatever ``np.mean``
    gives for an empty list (NaN in current NumPy) - confirm callers cope.
    """
    word_vectors = word2vec_model.wv
    weighted = []
    for raw_token in nltk.word_tokenize(sentence):
        lowered = raw_token.lower()
        if lowered in stopwords:
            continue
        lemma = lemmatize(lowered)
        if lemma in word_vectors.vocab:
            weighted.append(get_tfidf(lemma, tf_idf, paragraph) * word_vectors[lemma])

    if qt in word_vectors.vocab:
        weighted.append(word_vectors[qt])

    return np.mean(weighted, axis=0)

def get_paragraph_words(paragraph):
    """Count the lemmatized, lower-cased non-stop-word tokens of ``paragraph``.

    NOTE: the stop-word test is applied to the token as written, before it is
    lower-cased, so a capitalised stop word is still counted.
    """
    terms = Counter()
    for sentence in nltk.sent_tokenize(paragraph):
        kept = (tok for tok in nltk.word_tokenize(sentence) if tok not in stopwords)
        for tok in kept:
            terms[lemmatize(tok.lower())] += 1
    return terms

def get_document_paragraphs_term(ds_documents):
    """Map each ``docid`` to ``{paragraph position: term Counter}``.

    ``ds_documents`` is iterated with ``iterrows()``; every row must expose a
    ``'docid'`` value and a ``'text'`` iterable of paragraphs.
    """
    document_term = {}
    for _, row in ds_documents.iterrows():
        document_term[row['docid']] = {
            paragraph_id: get_paragraph_words(paragraph)
            for paragraph_id, paragraph in enumerate(row['text'])
        }
    return document_term

def compute_paragraph_freqs(doc_term_freqs):
    """For every document, count in how many paragraphs each term occurs.

    ``doc_term_freqs`` maps ``docid -> {paragraph_id: term counts}``; the
    result maps ``docid -> Counter(term -> number of paragraphs with term)``.
    """
    doc_dic = {}
    for docid, paragraphs in doc_term_freqs.items():
        paragraph_counts = Counter()
        for term_counts in paragraphs.values():
            # Each distinct term of a paragraph contributes exactly one.
            paragraph_counts.update(term_counts.keys())
        doc_dic[docid] = paragraph_counts
    return doc_dic

def count_words(freqs):
    """Return the sum of all term counts over every counter in ``freqs``."""
    return sum(sum(counter.values()) for counter in freqs.values())

def get_Name_Entity_NLTK(data):
    """Run NLTK's chunker on every sentence in ``data``.

    Returns one list per sentence containing ``(entity text, entity label)``
    tuples, built from the chunk nodes that carry a ``label`` attribute.
    """
    results = []
    for sentence in data:
        chunked = ne_chunk(pos_tag(word_tokenize(sentence)))
        entities = [
            (' '.join(leaf[0] for leaf in node.leaves()), node.label())
            for node in chunked
            if hasattr(node, 'label')
        ]
        results.append(entities)

    return results

def get_Name_Entity_Sentence(sentence):
    """Tag ``sentence`` through the CoreNLP server at ``http://localhost:9000``.

    Consecutive tokens sharing the same tag are merged into one phrase. Every
    run whose tag is not ``"O"`` is returned as ``(lower-cased phrase, tag)``.
    """
    tagger = CoreNLPNERTagger(url='http://localhost:9000')
    tagged_tokens = tagger.tag(nltk.word_tokenize(sentence))

    entities = []
    for tag, run in groupby(tagged_tokens, lambda pair: pair[1]):
        if tag == "O":
            continue
        phrase = " ".join(token for token, _ in run)
        entities.append((phrase.lower(), tag))

    return entities


def addNameEntity(df, feature, func):
    """Store ``func(df[feature])`` in the column ``NE_<feature>`` and return df.

    An existing ``NE_<feature>`` column is dropped first (which yields a new
    frame); otherwise the column is added to the frame that was passed in.
    """
    column = 'NE' + "_" + feature
    if column in df:
        df = df.drop(column, axis=1)
    df[column] = func(df[feature])

    return df

def get_question_type(question):
    """Return the question keyword of ``question``, or ``'other'``.

    Tokens are lower-cased and scanned left to right. The first token found in
    ``questionwords`` wins; if there is none, the first token found in
    ``passiveQuestions`` wins. This mirrors the cell that fills the
    ``question_type`` column. The previous version compared tokens
    case-sensitively and kept the last match, so a leading "What" or "Who" was
    never recognised.

    NOTE: multi-word entries of ``passiveQuestions`` (for example
    "give an example") cannot match because tokens are compared one by one.
    """
    tokens = [token.lower() for token in nltk.word_tokenize(question)]
    for candidates in (questionwords, passiveQuestions):
        for token in tokens:
            if token in candidates:
                return token
    return 'other'

def get_POS_paragraph(data):
    """Return the ``get_POS`` tagging of every sentence NLTK finds in ``data``."""
    return [get_POS(sentence) for sentence in nltk.sent_tokenize(data)]

def get_POS(data):
    """Return the Stanford POS tagging of ``data``.

    The text is split with the module-level ``tokenizer`` and tagged with
    ``stanford_tagger``. The NLTK tagging that used to be computed here was
    never used, so it has been removed.

    NOTE(review): ``tokenizer`` is not defined in the visible part of this
    notebook - confirm it is bound before this function is called.
    """
    tokenized_text = tokenizer.tokenize(data)
    return stanford_tagger.tag(tokenized_text)

def get_keyword(data):
    """Return the keyword lemmas of the sentence ``data``, in order.

    A token is kept when its lower-cased lemma is not a stop word and its NLTK
    POS tag is in the ``POS`` set.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(data))
    keywords = []
    for token, pos in tagged:
        lemma = lemmatize(token.lower())
        if lemma not in stopwords and pos in POS:
            keywords.append(lemma)

    return keywords

def get_keyword_paragraph(data):
    """Return one keyword list per piece produced by ``tokenizer.tokenize``.

    NOTE(review): ``tokenizer`` is not defined in the visible part of this
    notebook; presumably it is a sentence tokenizer here - confirm.
    """
    return [get_keyword(sentence) for sentence in tokenizer.tokenize(data)]

def get_keyword_all(data):
    """Return ``get_keyword(sentence)`` for every sentence in ``data``."""
    return [get_keyword(sentence) for sentence in data]

def add_keywords(df, feature):
    """Store the keywords of ``df[feature]`` in ``keywords_<feature>``.

    An existing ``keywords_<feature>`` column is dropped before it is rebuilt.
    The frame that now carries the column is returned.
    """
    column = 'keywords' + "_" + feature
    if column in df:
        df = df.drop(column, axis=1)
    df[column] = get_keyword_all(df[feature])
    return df

def get_number_of_common_kewyords(question_keywords, answer_sentence_keywords):
    """Count the question keywords that also occur in the sentence keywords.

    Repeated question keywords are counted once per occurrence.
    """
    return sum(1 for keyword in question_keywords
               if keyword in answer_sentence_keywords)

def get_passages_bow(passages):
    """Return one bag of words (lemma -> count) accumulated over ``passages``.

    NOTE: the stop-word test is applied to the token as written, before it is
    lower-cased and lemmatized.
    """
    passage_bow = {}
    for passage in passages:
        for token in nltk.word_tokenize(passage):
            if token in stopwords:
                continue
            word = lemmatize(token.lower())
            passage_bow[word] = 1 + passage_bow.get(word, 0)

    return passage_bow

def get_sentences_bow(sentences):
    """Return one bag of words (lemma -> count) accumulated over ``sentences``.

    NOTE: the stop-word test is applied to the token as written, before it is
    lower-cased and lemmatized.
    """
    sentence_bow = {}

    for sentence in sentences:
        kept_tokens = [tok for tok in nltk.word_tokenize(sentence) if tok not in stopwords]
        for tok in kept_tokens:
            word = lemmatize(tok.lower())
            if word in sentence_bow:
                sentence_bow[word] += 1
            else:
                sentence_bow[word] = 1

    return sentence_bow

def get_question_bow(question):
    """Return the bag of words of ``question``, seeded with its question type.

    The key returned by ``get_question_type`` starts at 1; every token that is
    not a stop word (tested as written) then adds one to its lower-cased
    lemma.
    """
    question_bow = {get_question_type(question): 1}
    for token in nltk.word_tokenize(question):
        if token in stopwords:
            continue
        word = lemmatize(token.lower())
        question_bow[word] = 1 + question_bow.get(word, 0)

    return question_bow

def get_training_question_bow(question, keywords, qt):
    """Return the bag of words of ``question`` restricted to ``keywords``.

    The question type ``qt`` is always present with a starting count of 1.
    Tokens that are not stop words (tested as written) are lower-cased and
    lemmatized, and counted only when the lemma is in ``keywords``.
    """
    question_bow = {qt: 1}
    for token in nltk.word_tokenize(question):
        if token in stopwords:
            continue
        word = lemmatize(token.lower())
        if word not in keywords:
            continue
        question_bow[word] = 1 + question_bow.get(word, 0)

    return question_bow

def get_feature_questions(questions, keywords, qt):
    """Return one feature dict per question.

    ``qt[i]`` is the question type that belongs to ``questions[i]``.
    """
    return [get_training_question_bow(question, keywords, qt[position])
            for position, question in enumerate(questions)]

def get_feature_question(question, keywords, qt):
    """Return the feature dict of one question with question type ``qt``."""
    return get_training_question_bow(question, keywords, qt)

def query_vsm(query, index):
    """Accumulate posting weights for every term of ``query``.

    Parameters:
        query: iterable of query terms.
        index: mapping ``term -> iterable of (paragraph_id, weight)``.

    Returns:
        A ``Counter`` mapping paragraph id to the sum of the weights of the
        query terms it contains. Use ``most_common`` to rank it.
    """
    accumulator = Counter()
    for term in query:
        # .get() skips terms the index has never seen: no KeyError for a
        # plain dict, no empty posting list inserted into a defaultdict.
        for docid, weight in index.get(term, ()):
            accumulator[docid] += weight
    return accumulator

def get_number_of_common_entities(ans, question):
    """Count the distinct entity tags shared by ``ans`` and ``question``.

    Both arguments are iterables of items whose element ``[1]`` is the tag.
    """
    question_tags = {entity[1] for entity in question}
    answer_tags = {entity[1] for entity in ans}
    return len(question_tags & answer_tags)

print("done")

<b>Loading saved objects</b>

In [None]:
# Question/answer frames and the document collection, read from project_files/.
df_training=pd.read_pickle("project_files/df_training.pkl")
df_devel=pd.read_pickle('project_files/df_devel.pkl')
df_docs=pd.read_json('project_files/documents.json')
df_testing=pd.read_pickle('project_files/df_testing.pkl')

# Only rows that already carry an answer_type are used by the question classifier.
question_learning_dataset = df_training[df_training.answer_type.notnull()]
question_devel_dataset = df_devel[df_devel.answer_type.notnull()]

# Artifacts pickled by earlier runs; load_obj reads obj/<name>.pkl.
NER_corpus=load_obj('ner_corpus')
random_forest_model = load_obj('random_forest')
vectorizer = load_obj('Vectorizer')
answer_keywords = load_obj('answer_keywords')
# The Word2Vec model is stored in gensim's own format, not as a pickle.
gensim_model = Word2Vec.load("obj/gensim_model")
gensim_features = load_obj('features')
gensim_labels = load_obj('labels')
vsm_inverted_index_corpus = load_obj('vsm_inverted_index_corpus')
vsm_inverted_index_all = load_obj('vsm_inverted_index_all')
doc_corpus_term_frquency = load_obj('doc_corpus_term_frquency')
doc_corpus_inverse_term_frquency = load_obj('doc_corpus_inverse_term_frquency')
punk_tokenizer = load_obj('punk_tokenizer')
print("Done")

<b>Initializing some global objects</b>

In [None]:
# NLTK / Stanford helpers shared by the utility functions.
# NOTE: `stopwords` rebinds the name imported from nltk.corpus to a plain set.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))
stanford_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

# POS tags that qualify a token as a keyword.
POS = {
    "NN", "NNS", "NNP", "NNPS",
    "CD", "JJ",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
}

# Extra abbreviations registered on the loaded Punkt tokenizer.
punk_tokenizer._params.abbrev_types.add('ii')
punk_tokenizer._params.abbrev_types.add('dr')

# Primary question keywords, then the fallback set used when none is present.
questionwords = {
    "who", "what", "where", "when", "why", "how",
    "whose", "which", "whom", "whats", "what's", "whos",
}
passiveQuestions = {
    "can", "could", "would",
    "was", "were", "am", "is", "are", "will", "shall",
    "did", "do", "does",
    "had", "have", "has",
    "as", "that", "in",
    "give an example", "name",
}

# Entity tag -> coarse answer label.
label = {
    'NUMBER': 'NUMBER',
    'DATE': 'TIME',
    'PERSON': 'PERSON',
    'ORGANIZATION': 'ORGANIZATION',
    'MISC': 'MISC',
    'MONEY': 'NUMBER',
    'COUNTRY': 'LOCATION',
    'PERCENT': 'NUMBER',
    'TITLE': 'PERSON',
    'STATE_OR_PROVINCE': 'LOCATION',
    'CAUSE_OF_DEATH': 'NAME',
    'DURATION': 'TIME',
    'CRIMINAL_CHARGE': 'NAME',
    'CITY': 'LOCATION',
    'RELIGION': 'NAME',
    'SET': 'TIME',
    'NATIONALITY': 'NAME',
    'IDEOLOGY': 'NAME',
    'ORDINAL': 'NUMBER',
    'TIME': 'TIME',
    'URL': 'ORGANIZATION',
    None: 'OTHER',
    'LOCATION': 'LOCATION',
}

# Empty result frames filled by the retrieval cells.
df_result_devel = pd.DataFrame(columns=[
    'id', 'question', 'paragraph', 'retrieved paras', 'predicted_paragraph',
    'paragraph_found', 'sentence', 'predicted_sentence', 'predicted_sentence_id',
    'sentence_found', 'answer', 'predicted_answer',
    'NE_question', 'label', 'predicted_paragraph_final',
])

df_result_test = pd.DataFrame(columns=[
    'id', 'question', 'paragraph', 'retrieved paras', 'predicted_paragraph',
    'paragraph_found', 'sentence', 'predicted_sentence', 'predicted_sentence_id',
    'sentence_found', 'answer', 'predicted_answer',
    'NE_question', 'label', 'predicted_paragraph_final',
])

df_best_sentences = pd.DataFrame(columns=[
    'doc_id', 'para_id', 'original', 'sentence_id', 'sentence_text', 'score', 'sentence',
])

print("Done")

# Analyzing training dataset and understanding tweaked answers

<b>Don't run any of the cells in this section. They exist only to analyze the training dataset.</b>

#### Special Tokenizer PUNKT

In [None]:
# Train a Punkt sentence tokenizer on every paragraph of the corpus.
paragraphs = []
for index, doc in df_docs.iterrows():
    print (index)
    paragraphs.extend(doc['text'])
# One join instead of repeated string concatenation; every paragraph is still
# followed by a single space, exactly as before.
text = "".join(paragraph + " " for paragraph in paragraphs)

punk_trainer = PunktTrainer()
punk_trainer.INCLUDE_ALL_COLLOCS = True
punk_trainer.train(text)
# The parameters come from the trainer built above (the old code read an
# undefined `trainer`). The tokenizer is bound to `punk_tokenizer`, the name
# the rest of the notebook uses, instead of overwriting the trainer.
punk_tokenizer = PunktSentenceTokenizer(punk_trainer.get_params())
print("Done")

In [None]:
def answer_find(sentence, answer):
    """Return True when every token of ``answer`` occurs in ``sentence``.

    Both texts are split with ``nltk.word_tokenize`` and compared lower-cased.
    """
    sentence_tokens = {tok.lower() for tok in nltk.word_tokenize(sentence)}
    return all(tok.lower() in sentence_tokens
               for tok in nltk.word_tokenize(answer))

def answer_find2(sentence, answer):
    """Looser variant of ``answer_find`` that treats hyphens as spaces.

    Hyphens are replaced by spaces in both texts before tokenizing. Sentence
    tokens additionally lose their apostrophes; answer tokens do not.
    """
    sentence_tokens = set()
    for tok in nltk.word_tokenize(sentence.replace('-', ' ')):
        sentence_tokens.add(tok.lower().replace('-', ' ').replace('\'', ''))
    answer_tokens = nltk.word_tokenize(answer.replace('-', ' '))
    return all(tok.lower() in sentence_tokens for tok in answer_tokens)

# Locate, for every training row, the first sentence of the gold paragraph
# that contains the answer. Three progressively looser matchers are tried;
# None is stored when all of them fail.
# NOTE(review): `tokenizer` is not defined in the visible part of this
# notebook; it is used here as a sentence splitter - confirm what it is bound to.
sentences = []
count = 0
for index, row in df_training.iterrows():
    paragraph = df_docs.iloc[row['docid']]['text'][row['answer_paragraph']]

    # 1) every answer token appears in the sentence
    match = next((s for s in tokenizer.tokenize(paragraph)
                  if answer_find(s, row['text'])), None)
    # 2) same test with hyphens treated as spaces
    if match is None:
        match = next((s for s in tokenizer.tokenize(paragraph)
                      if answer_find2(s, row['text'])), None)
    # 3) plain case-insensitive substring test on NLTK's sentence split
    if match is None:
        match = next((s for s in nltk.sent_tokenize(paragraph)
                      if row['text'].lower() in s.lower()), None)

    if match is None:
        count = count + 1
        print (index, row['text'])
        print(paragraph)
    sentences.append(match)
print(count)

#### Set some predefined question type to entity type (Ex. Who/Whom/Who's -> PERSON)

In [None]:
def print_questions(question_type, entity_type):
    """Print training questions containing ``question_type`` with no answer NE.

    A row is reported when one of its question tokens equals ``question_type``
    (case-insensitively) and its ``NE_text`` is empty. The number of reported
    rows is printed at the end. ``entity_type`` is accepted but not used.
    """
    count = 0
    for index, row in df_training.iterrows():
        ne_answer = row['NE_text']
        question = row['question']
        mentions_type = any(tok.lower() == question_type.lower()
                            for tok in tokenizer.tokenize(question))
        if not mentions_type:
            continue
        answer = row['text']
        if len(ne_answer) < 1:
            print (index, question, answer)
            count = count + 1
    print (count)
    
# Question words that are NOT mapped directly to an entity type below.
questionwords = {"what", "when", "why", "how", "whose", "which", "whats", "what's"}

# Report, per question word, the questions whose answer has no named entity.
for question_word, expected_entity in (("who", "PERSON"),
                                       ("where", "LOCATION"),
                                       ("whom", "PERSON"),
                                       ("whose", "PERSON")):
    print_questions(question_word, expected_entity)


#### Question type set to dataframe

In [None]:
# Label every test question with its question keyword.
questionwords = {"who", "what", "where", "when", "why", "how",
                 "whose", "which", "whom", "whats", "what's", "whos"}
passiveQuestions = ["can", "could", "would",
                    "was", "were", "am", "is", "are", "will", "shall",
                    "did", "do", "does",
                    "had", "have", "has",
                    "as", "that", "in",
                    "give an example", "name"]
question_types = []
count = 0
for index, row in df_testing.iterrows():
    lowered_tokens = [tok.lower() for tok in tokenizer.tokenize(row["question"])]
    # First primary question word wins; otherwise the first fallback word.
    question_type = next((tok for tok in lowered_tokens if tok in questionwords), None)
    if question_type is None:
        question_type = next((tok for tok in lowered_tokens if tok in passiveQuestions), None)
    if question_type is None:
        # Nothing matched: count the row and use the default label.
        count = count + 1
        question_type = 'other'
    print (index)
    question_types.append(question_type)

df_testing['question_type'] = question_types
print (count)

#### Get POS of question

In [None]:
# Collect, per training question, the POS tags of its keyword tokens
# (lemma not a stop word, tag in POS).
results = []
for question in df_training['question']:
    tagged = nltk.pos_tag(tokenizer.tokenize(question))
    results.append([pos for word, pos in tagged
                    if lemmatize(word.lower()) not in stopwords and pos in POS])
print(len(results))

#### Get Answer from paragraph

In [None]:
# Locate, for every development row, the first sentence of the gold paragraph
# that contains the answer. None is stored when no matcher succeeds.
sentences = []
count = 0
for index, row in df_devel.iterrows():
    paragraph = df_docs.iloc[row['docid']]['text'][row['answer_paragraph']]

    # Strict token containment first, then the hyphen-tolerant variant,
    # both on the Punkt sentence split.
    match = None
    for matcher in (answer_find, answer_find2):
        match = next((s for s in punk_tokenizer.tokenize(paragraph)
                      if matcher(s, row['text'])), None)
        if match is not None:
            break
    # Last resort: case-insensitive substring test on NLTK's default split.
    if match is None:
        match = next((s for s in nltk.sent_tokenize(paragraph)
                      if row['text'].lower() in s.lower()), None)

    if match is None:
        count = count + 1
        print (index, row['text'])
        print(paragraph)
    sentences.append(match)
print(count)

#### Special answer tweaking trick that helped my project

In [None]:
import difflib


def _entity_tag_for_answer(nes, answer):
    """Pick the entity tag that best describes ``answer``.

    ``nes`` is a list of ``(entity text, tag)`` items. Returns
    ``(tag or None, fuzzy_hits)``, where ``fuzzy_hits`` is the number of times
    a tag was chosen through string similarity.

    * exactly one entity: its tag is used;
    * several entities that share one tag: that tag is used;
    * otherwise each entity is compared with the answer. A ratio above 60%
      against the whole answer selects the tag and stops. Failing that, a
      ratio above 30% against any single answer word selects the tag and the
      scan continues with the next entity, which may override it.
    """
    if len(nes) == 1:
        # The old code read a `tag` variable here that was either undefined
        # or left over from a previous row.
        return nes[0][1], 0
    if len(nes) < 1:
        return None, 0

    tags = set(ne[1] for ne in nes)
    if len(tags) == 1:
        return list(tags)[0], 0

    chosen = None
    fuzzy_hits = 0
    answer_words = answer.split(" ")
    for ne in nes:
        word = ne[0]
        tag = ne[1]
        if difflib.SequenceMatcher(None, word, answer).ratio() * 100 > 60.00:
            chosen = tag
            fuzzy_hits += 1
            break
        if len(answer_words) > 1:
            for ans in answer_words:
                if difflib.SequenceMatcher(None, word, ans).ratio() * 100 > 30.00:
                    chosen = tag
                    fuzzy_hits += 1
                    break
    return chosen, fuzzy_hits


# Fill in a missing answer_type from the entities found in the answer
# (NE_text), falling back to those of the corrected answer
# (NE_corrected_answer) when the answer itself has none.
answer_type = []
count  = 0
for index, row in df_devel.iterrows():
    answer = row['text']
    if not row['answer_type']:
        nes = row['NE_text']
        if len(nes) >= 1:
            answer_entity, _ = _entity_tag_for_answer(nes, answer)
        else:
            nes = row['NE_corrected_answer']
            answer_entity, fuzzy_hits = _entity_tag_for_answer(nes, answer)
            # As before, only fuzzy matches in the fallback path are counted.
            count = count + fuzzy_hits
        if (not answer_entity) and (len(nes) >= 1):
            print(index, answer, answer_entity, nes)
    else:
        answer_entity = row['answer_type']
    answer_type.append(answer_entity)
print(count)

# Question Classification

<b>Create dataset (No need to run now)</b>

In [None]:
# Vocabulary: every word that occurs in a known answer sentence
# (this could be tweaked to use whole paragraphs instead).
answer_sentences_bow = get_sentences_bow(
    question_learning_dataset.loc[
        question_learning_dataset['answer_found'].notnull(), 'answer_found'])
answer_keywords = set(answer_sentences_bow)

# Feature dicts for the training and development questions.
qs_training = get_feature_questions(
    list(question_learning_dataset.question),
    answer_keywords,
    list(question_learning_dataset.question_type))
qs_dev = get_feature_questions(
    list(question_devel_dataset.question),
    answer_keywords,
    list(question_devel_dataset.question_type))

<b>Learning model Random Forest / Multinomial NB</b>

In [None]:
# Alternative classifier that was tried: MultinomialNB(2, False, None)
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=60,
    n_estimators=300,
)

In [None]:
# Reuse the loaded vectorizer when it is truthy, otherwise start a new one.
vectorizer = vectorizer or DictVectorizer()

# The vectorizer is (re)fitted on the training features only.
X_train_dtm = vectorizer.fit_transform(qs_training)
X_dev_dtm = vectorizer.transform(qs_dev)

model = RandomForestClassifier(
    criterion='entropy',
    max_depth=60,
    n_estimators=300,
)
model.fit(X_train_dtm, list(question_learning_dataset.label))

<b>Evaluation</b>

In [None]:
# Score the classifier on the development questions.
y_predicted_class = model.predict(X_dev_dtm)
check_results(
    y_predicted_class,
    list(question_devel_dataset.label),
)

<b>Gensim Model (Finally not selected)</b>

Getting words of corpus and learning the model (Already trained and no need to do again)

In [None]:
# Training corpus for Word2Vec: every corpus sentence plus every training and
# development question, each as a list of lower-cased lemmas.
sentences = list()
for index, row in df_docs.iterrows():
    for paragraph in row['text']:
        for sentence in nltk.sent_tokenize(paragraph):
            sentences.append([lemmatize(x.lower()) for x in nltk.word_tokenize(sentence)])
print("Done")

for question_frame in (df_training, df_devel):
    for index, row in question_frame.iterrows():
        sentences.append([lemmatize(x.lower()) for x in nltk.word_tokenize(row['question'])])
    print("Done")

print ("It will take a while to learn the Gensim Word2Vec model. Please wait.")
gensim_model = Word2Vec(sentences, size=500, workers=4, iter = 50)
# Show the model that was just trained (this used to print `model`, the
# question classifier).
print (gensim_model)
# Save to the path the loading cell reads with Word2Vec.load.
gensim_model.save("obj/gensim_model")
print("Done")

Create features using gensim model to learn a model

In [None]:
# One sentence vector per training question, labelled with its answer type.
gensim_features = []
gensim_labels = []
for index, row in question_learning_dataset.iterrows():
    question = row['question']
    # sentence_vector reads `.wv`, so it needs the Word2Vec model; `model`
    # (the random forest) was passed here before.
    gensim_features.append(sentence_vector(
        gensim_model, question, vsm_inverted_index_corpus[row['docid']],
        row['answer_paragraph'], row['question_type']))
    gensim_labels.append(row['answer_type'])
    print (index)
print ("done")

save_obj(gensim_features,'features')
save_obj(gensim_labels,'labels')
print("done")

In [None]:
# Fixed split: the first 22000 examples train, examples 22000..24981 test.
x_train, x_test = gensim_features[:22000], gensim_features[22000:24982]
y_train, y_test = gensim_labels[:22000], gensim_labels[22000:24982]

Learning the model

In [None]:
# Cross-validated random forest over a single-point parameter grid.
rfc = RandomForestClassifier(random_state=42)
param_grid = dict(n_estimators=[300], max_depth=[40])
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=2,
    verbose=4,
    n_jobs=-1,
)
CV_rfc.fit(x_train, y_train)
print(CV_rfc.best_params_)

Evaluation

In [None]:
# Score the grid-searched classifier on the held-out slice.
y_predicted_class = CV_rfc.predict(x_test)
check_results(
    y_predicted_class,
    y_test,
)

<b>Saving object if necessary</b>

In [None]:
# Persist the classifier, its vectorizer and the feature sets under obj/.
for artifact, artifact_name in (
        (model, 'random_forest'),
        (vectorizer, 'Vectorizer'),
        (qs_training, 'qs_training'),
        (qs_dev, 'qs_dev'),
        (answer_keywords, 'answer_keywords')):
    save_obj(artifact, artifact_name)

# Paragraph Retrieval

### For development Purpose

In [None]:
# Rank paragraphs for every development question and record whether the gold
# paragraph is among the two best.
df_devel1 = df_devel.iloc[0:10]
count = 0
for index, row in df_devel.iterrows():
    question = row['question']
    docid = row['docid']
    ranked = query_vsm(get_keyword(question), vsm_inverted_index_all[docid]).most_common(2)
    possible_par = [paragraph_id for paragraph_id, _ in ranked]
    par_retrieved = row['answer_paragraph'] in set(possible_par)
    if par_retrieved:
        count = count + 1
    # Values follow the column order of df_result_devel.
    df_result_devel.loc[index] = [
        docid, question, row['answer_paragraph'],
        possible_par, possible_par,
        par_retrieved, row['answer_found'], None, None, False, row['text'], None,
        row['NE_question'], row['label'], None,
    ]

print(count*1.0/len(df_devel))

# Keep only the questions whose gold paragraph was retrieved.
df_result_devel = df_result_devel[df_result_devel['paragraph_found']==True]
print(len(df_result_devel))

### For Testing

In [None]:
# Rank paragraphs for the first ten test questions; no gold data is available,
# so the gold columns are left as None.
df_testing1 = df_testing.iloc[0:10]
count = 0
for index, row in df_testing1.iterrows():
    question = row['question']
    docid = row['docid']
    ranked = query_vsm(get_keyword(question), vsm_inverted_index_all[docid]).most_common(2)
    possible_par = [paragraph_id for paragraph_id, _ in ranked]
    # Values follow the column order of df_result_test.
    df_result_test.loc[index] = [
        docid, question, None,
        possible_par, possible_par,
        None, None, None, None, False, None, None,
        row['NE_question'], None, None,
    ]
    print(index)
df_result_test

## TF-IDF calculation (Not needed. Saved object)

### Calculation (Not needed. Saved object)

In [None]:
# Build, per document, an inverted index term -> [[paragraph_id, weight], ...]
# holding length-normalised TF-IDF weights.
doc_corpus_term_frquency = get_document_paragraphs_term(df_docs)
doc_corpus_inverse_term_frquency = compute_paragraph_freqs(doc_corpus_term_frquency)

vsm_inverted_index_corpus = defaultdict()
for docid, doc_freq in doc_corpus_term_frquency.items():
    vsm_inverted_index_doc = defaultdict(list)
    N = count_words(doc_freq)
    # `M` was never defined in this notebook. It is taken to be the number of
    # paragraphs in the document, the population over which the paragraph
    # frequencies were counted.
    # NOTE(review): confirm this matches how the saved index was built.
    M = len(doc_freq)
    paragraph_frequency = doc_corpus_inverse_term_frquency[docid]
    for paragraph_id, paragraph_freqs in doc_freq.items():
        tfidf_values = []
        for term, term_count in paragraph_freqs.items():
            tfidf = float(term_count) / N * log(M / float(paragraph_frequency[term]))
            tfidf_values.append((term, tfidf))
        length = sum(tfidf ** 2 for _, tfidf in tfidf_values) ** 0.5
        for term, tfidf in tfidf_values:
            # A paragraph whose terms all occur in every paragraph has a zero
            # vector; store 0.0 instead of dividing by zero.
            weight = tfidf / length if length else 0.0
            vsm_inverted_index_doc[term].append([paragraph_id, weight])
    vsm_inverted_index_corpus[docid] = vsm_inverted_index_doc
print("Done")

### Saving Object

In [None]:
# Persist the term statistics and the inverted index under obj/.
for artifact, artifact_name in (
        (doc_corpus_term_frquency, 'doc_corpus_term_frquency'),
        (doc_corpus_inverse_term_frquency, 'doc_corpus_inverse_term_frquency'),
        (vsm_inverted_index_corpus, 'vsm_inverted_index_corpus')):
    save_obj(artifact, artifact_name)

# Sentence Retrieval

### Development Purpose

In [None]:
# Score every sentence of each retrieved paragraph against the question, keep
# the best-scoring sentence per question and check it against the gold
# sentence stored in the 'sentence' column.
count  = 0
for index,row in df_result_devel.iterrows():
    NE_question = row['NE_question']
    question = row['question']
    answer = row['sentence']
    docid = row['id']
    # paragraph_id -> list of its sentences
    candidate_answers = {}
    possible_paragraphs = row['predicted_paragraph']
#     print(possible_paragraphs)
    # (paragraph_id, sentence_index) -> score
    sentences_check = Counter()
    for paragraph_id in possible_paragraphs:
        paragraph = df_docs.iloc[row['id']]['text'][paragraph_id]
        candidate_answers[paragraph_id] = punk_tokenizer.tokenize(paragraph)
        question_keywords=get_keyword(question)
        for sentence_index,sentence in enumerate(candidate_answers[paragraph_id]):
            # Entities are read from the precomputed NER_corpus, indexed by
            # document, paragraph and sentence position.
            NER_sentence=NER_corpus[docid][paragraph_id][sentence_index]
            answer_sentence_keywords = get_keyword(sentence)
            common_keywords=get_number_of_common_kewyords(question_keywords,answer_sentence_keywords)
            common_entities = get_number_of_common_entities(NER_sentence,NE_question)

            # longest exact sequence of keywords
            # Position-aligned: counts the positions i at which the i-th
            # question keyword is contained in the i-th sentence keyword.
            longest_exact_sequence=0
            for i in range(len(question_keywords)):
                if i < len(answer_sentence_keywords):
                    if question_keywords[i] in answer_sentence_keywords[i]:
                        longest_exact_sequence+=1

            # proximity
            # Incremented once per loop iteration. The index advances even
            # after a pop, so the keyword that slides into the popped slot is
            # not examined.
            proximity=0
            question_keywords_span=question_keywords.copy()
            index_qk=0
            while len(question_keywords_span)>0 and index_qk<len(question_keywords_span):
                proximity+=1
                if question_keywords_span[index_qk] in answer_sentence_keywords:
                    question_keywords_span.pop(index_qk)
                index_qk+=1

            # n-gram overlap
            # Bigrams are built from lemmatized tokens that are NOT lower-cased.
            bigrams_question =  nltk.bigrams([lemmatize(token) for token in nltk.word_tokenize(question)])
            ngram_overlap=0

            for bigram_question in bigrams_question:
                # The sentence bigrams are rebuilt for every question bigram.
                bigrams_sentence = nltk.bigrams([lemmatize(token) for token in nltk.word_tokenize(sentence)])
                for bigram_sentence in bigrams_sentence:
                    if bigram_question == bigram_sentence:
                        ngram_overlap+=1
            
            # Proximity is the only term that lowers the score.
            sentence_score = common_keywords + common_entities + longest_exact_sequence - proximity + ngram_overlap
            df_best_sentences.loc[len(df_best_sentences)]=[row['id'],
                                                       paragraph_id,row['paragraph'],sentence_index,
                                                       sentence,sentence_score,row['sentence']]
            sentences_check[(paragraph_id,sentence_index)] = sentence_score
    
    # most_common() sorts by descending score; best[0] is ((paragraph_id, sentence_index), score).
    # NOTE: best[0] raises IndexError when no sentence was scored for this row.
    best = sentences_check.most_common()
    df_result_devel.at[index, 'predicted_sentence_id'] = best[0][0][1]
    df_result_devel.at[index, 'predicted_sentence'] = candidate_answers[best[0][0][0]][best[0][0][1]]
    print(index)
    # A hit requires the predicted sentence to equal the gold sentence exactly;
    # the final paragraph is recorded only in that case.
    if(row['sentence']==candidate_answers[best[0][0][0]][best[0][0][1]]):
        count = count + 1
        df_result_devel.at[index, 'sentence_found'] = True
        df_result_devel.at[index, 'predicted_paragraph_final'] = best[0][0][0]
print(count*1.0/len(df_result_devel))
# Keep only the rows whose sentence was found.
df_result_devel = df_result_devel[df_result_devel['sentence_found']==True]
print("Sentences found :",len(df_result_devel))

### Testing Purpose

In [None]:
# Sentence selection on the test set: for every question, score each sentence
# of its candidate paragraphs and store the best-scoring (paragraph, sentence)
# pair in df_result_test.
for index, row in df_result_test.iterrows():
    NE_question = row['NE_question']
    question = row['question']
    docid = row['id']
    candidate_answers = {}
    possible_paragraphs = row['predicted_paragraph']
    sentences_check = Counter()
    if len(possible_paragraphs) > 0:
        # Question-side features do not depend on the paragraph or sentence,
        # so they are computed once per question rather than once per sentence.
        question_keywords = get_keyword(question)
        bigrams_question = list(nltk.bigrams(
            [lemmatize(token) for token in nltk.word_tokenize(question)]))

        for paragraph_id in possible_paragraphs:
            paragraph = df_docs.iloc[docid]['text'][paragraph_id]
            candidate_answers[paragraph_id] = punk_tokenizer.tokenize(paragraph)
            for sentence_index, sentence in enumerate(candidate_answers[paragraph_id]):
                NER_sentence = NER_corpus[docid][paragraph_id][sentence_index]
                answer_sentence_keywords = get_keyword(sentence)
                common_keywords = get_number_of_common_kewyords(question_keywords, answer_sentence_keywords)
                common_entities = get_number_of_common_entities(NER_sentence, NE_question)

                # Number of positions i where the i-th question keyword is
                # contained in the i-th sentence keyword (position-aligned).
                longest_exact_sequence = sum(
                    1
                    for question_keyword, sentence_keyword
                    in zip(question_keywords, answer_sentence_keywords)
                    if question_keyword in sentence_keyword)

                # Proximity penalty: number of iterations needed to walk a
                # shrinking copy of the question keywords.
                # NOTE(review): popping at index_qk and then incrementing it
                # skips the keyword that slides into that slot; kept as-is so
                # the scores stay identical to the development run.
                proximity = 0
                question_keywords_span = question_keywords.copy()
                index_qk = 0
                while index_qk < len(question_keywords_span):
                    proximity += 1
                    if question_keywords_span[index_qk] in answer_sentence_keywords:
                        question_keywords_span.pop(index_qk)
                    index_qk += 1

                # Bigram overlap: every (question bigram, sentence bigram)
                # pair that is equal counts once. The sentence bigrams are
                # counted a single time instead of being rebuilt for every
                # question bigram.
                bigrams_sentence = Counter(nltk.bigrams(
                    [lemmatize(token) for token in nltk.word_tokenize(sentence)]))
                ngram_overlap = sum(bigrams_sentence[bigram] for bigram in bigrams_question)

                sentence_score = common_keywords + common_entities + longest_exact_sequence - proximity + ngram_overlap
                df_best_sentences.loc[len(df_best_sentences)] = [row['id'],
                                                                 paragraph_id, row['paragraph'], sentence_index,
                                                                 sentence, sentence_score, row['sentence']]
                sentences_check[(paragraph_id, sentence_index)] = sentence_score

        # The candidate paragraphs may yield no sentence at all; in that case
        # the prediction columns are left untouched instead of failing on
        # an empty ranking.
        if sentences_check:
            best_paragraph_id, best_sentence_id = sentences_check.most_common()[0][0]
            df_result_test.at[index, 'predicted_paragraph_final'] = best_paragraph_id
            df_result_test.at[index, 'predicted_sentence_id'] = best_sentence_id
            df_result_test.at[index, 'predicted_sentence'] = candidate_answers[best_paragraph_id][best_sentence_id]
    print(index)
df_result_test

# Answer Extraction

### Development Purpose

In [None]:
count = 0

df_result_devel['predicted_label'] = None
df_result_devel['type_predicted'] = None

# Answer extraction on the development set. For each row the answer-type label
# is predicted from the question, then candidate answer strings are collected
# from the predicted sentence with a cascade of fallbacks:
#   1. "best NER": named entities whose mapped label equals the predicted label
#   2. "NER":      every named entity of the sentence (case-insensitive dedup)
#   3. "POS":      noun / cardinal-number tokens that are also in POS
#   4. "keyword":  non-stopword tokens (case-insensitive dedup)
# The first stage that yields anything wins; its strings are space-joined.
for index, row in df_result_devel.iterrows():
    sentence = row['predicted_sentence']
    question = row['question']
    question_NER = row['NE_question']
    sentence_id = row['predicted_sentence_id']
    paragraph_id = row['predicted_paragraph_final']
    docid = row['id']
    answer = row['answer']
    sentence_NER = NER_corpus[docid][paragraph_id][sentence_id]
    sentence_keywords = get_keyword(sentence)
    question_keywords = get_keyword(question)

    # NOTE(review): answer_keywords is not assigned in this cell; it is read
    # from earlier notebook state. Confirm this is intended (sentence_keywords
    # above is computed but never used).
    question_type = get_question_type(question)
    question_features = get_feature_question(question, answer_keywords, question_type)
    predicted_label = random_forest_model.predict(
        vectorizer.transform(question_features))[0]

    df_result_devel.at[index, 'predicted_label'] = predicted_label

    matched_types = []
    matched_answers = []

    # Stage 1: entities whose label matches the prediction. When the very
    # first entities match (no mismatch seen yet) only the first one is kept.
    mismatch_seen = False
    for entity in sentence_NER:
        if label[entity[1]] != predicted_label:
            mismatch_seen = True
            continue
        matched_types.append(entity[1])
        matched_answers.append(entity[0])
        if not mismatch_seen:
            break
    if matched_answers:
        print(index,"NER best",matched_answers)
        df_result_devel.at[index,'type_predicted'] = "best NER"

    # Stage 2: fall back to all named entities of the sentence.
    if not matched_answers:
        seen_lowercase = set()
        for entity in sentence_NER:
            lowered = entity[0].lower()
            if lowered in seen_lowercase:
                continue
            seen_lowercase.add(lowered)
            matched_types.append(entity[1])
            matched_answers.append(entity[0])
        if matched_answers:
            print(index,"NER",matched_answers)
            df_result_devel.at[index,'type_predicted'] = "NER"

    # Stage 3: fall back to noun / cardinal-number tokens listed in POS.
    if not matched_answers:
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if pos in ('NN', 'NNP', 'NNS', 'NNPS', 'CD') and pos in POS:
                matched_answers.append(word)
                matched_types.append(pos)
        if matched_answers:
            print(index,"POS",matched_answers)
            df_result_devel.at[index,'type_predicted'] = "POS"

    # Stage 4: last resort, every distinct non-stopword token.
    if not matched_answers:
        seen_lowercase = set()
        for word in nltk.word_tokenize(sentence):
            if word in stopwords:
                continue
            lowered = word.lower()
            if lowered in seen_lowercase:
                continue
            seen_lowercase.add(lowered)
            matched_answers.append(word)
        if matched_answers:
            print(index,"keyword",matched_answers)
            df_result_devel.at[index,'type_predicted'] = "keyword"

    df_result_devel.at[index, 'predicted_answer'] = " ".join(matched_answers)

In [None]:
# Accuracy and per-class report of the predicted answer-type labels
check_results(
    predictions=list(df_result_devel['predicted_label']),
    classifications=list(df_result_devel['label']),
)

### Analyze Predicted Sentences

In [None]:
# Fraction of rows whose predicted answer-type label equals the gold label
count = sum(
    1
    for _, row in df_result_devel.iterrows()
    if row['label'] == row['predicted_label']
)
print(count/len(df_result_devel))

In [None]:
# Number of rows currently in df_result_devel
print(len(df_result_devel))

In [None]:
# Compare every predicted answer with the gold answer and tally:
#   count            exact matches
#   count1..count5   non-exact predictions bucketed by difflib similarity (%):
#                    >=80, (60,80), (40,60], (20,40], <=20
#   count6           rows with an empty / missing prediction
#   count7..count9   rows per extraction strategy: "best NER", "NER", other
count = 0
count1 = 0
count2 = 0
count3 = 0
count4 = 0
count5 = 0
count6 = 0

count7 = 0
count8 = 0
count9 = 0
for index, row in df_result_devel.iterrows():
    ans = row['answer']
    pred = row['predicted_answer']
    type_p = row['type_predicted']
    if type_p == 'best NER':
        count7 = count7 + 1
    elif type_p == 'NER':
        count8 = count8 + 1
    else:
        count9 = count9 + 1

    if ans == pred:
        count = count + 1
    elif pred:
        seq = difflib.SequenceMatcher(None, ans, pred)
        d = seq.ratio()*100
        # Descending thresholds so every similarity value lands in exactly one
        # bucket; previously d == 40 matched neither "(40,60]" nor "(20,40)"
        # and was counted in the lowest bucket.
        if d >= 80.00:
            count1 = count1 + 1
        elif d > 60:
            count2 = count2 + 1
        elif d > 40:
            count3 = count3 + 1
        elif d > 20:
            count4 = count4 + 1
        else:
            count5 = count5 + 1
    else:
        print("strange",pred,row['answer'],"~~",row["predicted_answer"])
        count6 = count6 + 1

total_rows = len(df_result_devel)
print(count/total_rows,count1/total_rows,count2/total_rows,count3/total_rows,count4/total_rows,count5/total_rows,count6/total_rows)
# Sanity check: the seven outcome counters should add up to the row count
cc = count + count1 + count2 + count3 + count4 + count5 + count6
print(cc)
print(count7/total_rows,count8/total_rows,count9/total_rows)
print(count7,count8,count9)

In [None]:
# Cross-tabulate answer-type label correctness against the extraction
# strategy that produced the answer:
#   count10 / count11   "best NER" rows with correct / wrong predicted label
#   count12 / count13   "NER" rows with correct / wrong predicted label
#   count14 / count15   all other rows with correct / wrong predicted label
count10 = 0
count11 = 0
count12 = 0
count13 = 0
count14 = 0
count15 = 0
for index, row in df_result_devel.iterrows():
    type_p = row['type_predicted']
    label_is_correct = row['label'] == row['predicted_label']
    if type_p == 'best NER':
        if label_is_correct:
            count10 = count10 + 1
        else:
            count11 = count11 + 1
    elif type_p == 'NER':
        if label_is_correct:
            count12 = count12 + 1
        else:
            count13 = count13 + 1
    else:
        if label_is_correct:
            count14 = count14 + 1
        else:
            count15 = count15 + 1
print(count10,count11,count12,count13,count14,count15)

# Per-strategy accuracy. The denominators are the group sizes computed from
# the data; they used to be hard-coded as 1031 / 407 / 108, which only holds
# for one particular run. An empty group reports 0.0 instead of dividing by 0.
best_ner_total = count10 + count11
ner_total = count12 + count13
other_total = count14 + count15
print(count10/best_ner_total if best_ner_total else 0.0, count11,
      count12/ner_total if ner_total else 0.0, count13,
      count14/other_total if other_total else 0.0, count15)
total = best_ner_total + ner_total + other_total
print(total)

### Testing Purpose

In [None]:
# Answer extraction on the test set. Produces df_result with one row per test
# question (its index and the predicted answer string, "" when no sentence was
# selected). Candidate answers are taken from the predicted sentence with a
# cascade of fallbacks; the first stage that yields anything wins:
#   1. named entities whose mapped label equals the predicted answer type
#   2. every named entity of the sentence (case-insensitive dedup)
#   3. noun / cardinal-number tokens that are also in POS
#   4. non-stopword tokens (case-insensitive dedup)
df_result = pd.DataFrame(columns=['id','answer'])
df_result_test['predicted_label'] = None
for index, row in df_result_test.iterrows():
    sentence = row['predicted_sentence']
    question = row['question']
    sentence_id = row['predicted_sentence_id']
    paragraph_id = row['predicted_paragraph_final']
    docid = row['id']

    # A plain truthiness test would treat paragraph index 0 as "no prediction"
    # and emit an empty answer for it; only None/NaN mean "nothing selected".
    if pd.notnull(paragraph_id):
        sentence_NER = NER_corpus[docid][paragraph_id][sentence_id]

        # NOTE(review): answer_keywords is not assigned in this cell; it is
        # read from earlier notebook state. Confirm this is intended rather
        # than the keywords of `sentence`.
        predicted_label = random_forest_model.predict(vectorizer.transform(
            get_feature_question(question, answer_keywords, get_question_type(question))))[0]

        df_result_test.at[index, 'predicted_label'] = predicted_label

        list_match_pred_answer = []

        # Stage 1: entities whose label matches the prediction. When the very
        # first entities match (no mismatch seen yet) only the first is kept.
        first_match = True
        for NE in sentence_NER:
            if label[NE[1]] == predicted_label:
                list_match_pred_answer.append(NE[0])
                if first_match:
                    break
            else:
                first_match = False
        if list_match_pred_answer:
            print(index,"NER best",list_match_pred_answer)

        # Stage 2: fall back to all named entities of the sentence.
        if not list_match_pred_answer:
            seen_lowercase = set()
            for NE in sentence_NER:
                lowered = NE[0].lower()
                if lowered not in seen_lowercase:
                    seen_lowercase.add(lowered)
                    list_match_pred_answer.append(NE[0])
            if list_match_pred_answer:
                print(index,"NER",list_match_pred_answer)

        # Stage 3: fall back to noun / cardinal-number tokens listed in POS.
        if not list_match_pred_answer:
            for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
                if pos in ('NN', 'NNP', 'NNS', 'NNPS', 'CD') and pos in POS:
                    list_match_pred_answer.append(word)
            if list_match_pred_answer:
                print(index,"POS",list_match_pred_answer)

        # Stage 4: last resort, every distinct non-stopword token.
        if not list_match_pred_answer:
            seen_lowercase = set()
            for word in nltk.word_tokenize(sentence):
                if word in stopwords:
                    continue
                lowered = word.lower()
                if lowered not in seen_lowercase:
                    seen_lowercase.add(lowered)
                    list_match_pred_answer.append(word)
            if list_match_pred_answer:
                print(index,"keyword",list_match_pred_answer)

        predicted_answer = " ".join(list_match_pred_answer)
        df_result_test.at[index, 'predicted_answer'] = predicted_answer
        df_result.loc[len(df_result)] = [index, predicted_answer]
    else:
        # No sentence was selected for this question: submit an empty answer
        df_result.loc[len(df_result)] = [index, ""]

In [None]:
# Write the (id, answer) predictions without the DataFrame index column
df_result.to_csv(
    path_or_buf='prediction/output.csv',
    index=False,
)

In [None]:
# Number of entries in question_devel_dataset (defined outside this section)
print(len(question_devel_dataset))