# Sentence Selection

Holy Lovenia / 13515113

In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Preparation

In [9]:
from nltk import RegexpTokenizer
from pandas.io.json import json_normalize


import json
import pandas as pd
import re

### Read original data

In [10]:
# Raw (unprocessed) context text of the first paragraph of every article;
# data[i] is the original text belonging to article i.
# The file is decoded explicitly as UTF-8 (the standard JSON encoding) instead
# of relying on the platform default, which is not UTF-8 everywhere.
with open('dataset/SQuAD/train-v2.0.json', encoding='utf-8') as f:
    json_data = json.load(f)['data']

# Only paragraph 0 of each article is kept (the original loop was `range(1)`).
data = [article['paragraphs'][0]['context'] for article in json_data]

### Preprocess data

In [11]:
# Sentence tokenizer: keeps runs of word characters plus '/', '&', '-' and ':'.
# Built once here instead of once per sentence; the pattern is a raw string so
# the backslashes reach the regex engine without invalid-escape warnings.
tokenizer = RegexpTokenizer(r'[\w\/\&\-\:]+', flags=re.UNICODE)


def _strip_inner_mark(word, mark):
    """Remove every `mark` from `word` unless its first occurrence is the last character.

    A word that merely ends with the mark (e.g. a sentence-final '.') is
    returned unchanged; a word without the mark is returned unchanged too.
    """
    first = word.find(mark)
    if first != -1 and first != len(word) - 1:
        return word.replace(mark, '')
    return word


def preprocess_paragraph(paragraph):
    """Turn one raw paragraph string into a list of sentences, each a list of tokens.

    Steps, in order:
      1. drop '/.../' spans that contain the stress mark 'ˈ' (dictionary phonetics),
      2. drop Japanese/CJK characters, then every remaining non-ASCII character,
      3. remove '.' and ',' that sit inside a word so they are not mistaken
         for sentence boundaries,
      4. split on '.', tokenize each piece, and keep tokens longer than one
         character (or the word 'a' in any case); empty sentences are dropped.
    """
    # NOTE(review): '.*' is greedy, so this removes everything from the first
    # '/' to the last '/' on a line whenever a 'ˈ' lies in between.
    paragraph = re.sub(r'/.*ˈ.*/', '', paragraph)

    paragraph = re.sub('[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uff9f\u4e00-\u9faf\u3400-\u4dbf]+', '', paragraph)
    paragraph = re.sub(r'[^\x00-\x7f]', r'', paragraph)

    # Dots first, then commas, applied word by word.
    words = [
        _strip_inner_mark(_strip_inner_mark(word, '.'), ',')
        for word in paragraph.split(' ')
    ]
    paragraph = ' '.join(words)

    sentences = []
    for raw_sentence in paragraph.split('.'):
        token_list = [
            token.strip()
            for token in tokenizer.tokenize(raw_sentence)
            if len(token.strip()) > 1 or token.lower() == 'a'
        ]
        if token_list:
            sentences.append(token_list)
    return sentences


# Decode explicitly as UTF-8 rather than relying on the platform default.
with open('dataset/SQuAD/train-v2.0.json', encoding='utf-8') as f:
    json_data = json.load(f)['data']

# Only paragraph 0 of each article is processed, the same selection the raw
# `data` list uses, so preprocessed_data[i] and data[i] describe the same text.
preprocessed_data = [
    preprocess_paragraph(article['paragraphs'][0]['context'])
    for article in json_data
]

## Text Summarization

In [48]:
from sentence_selection.lsa import SteinbergerJezekLSA
from sentence_selection.text_rank import TextRank
from sentence_selection.multi_word_phrase_extraction import MultiWordPhraseExtractor

### TextRank

In [49]:
# Two TextRank summarizers that differ only in the `similarity` option.
tr_jaccard = TextRank(similarity='jaccard')
tr_cosine = TextRank(similarity='cosine')

### MultiWordPhraseExtraction

In [50]:
# Multi-word phrase extractor built with its constructor defaults.
mwpe = MultiWordPhraseExtractor()

### LSA

In [51]:
# LSA summarizer configured with matrix_technique='tfidf'.
lsa_tfidf = SteinbergerJezekLSA(matrix_technique='tfidf')

In [52]:
# LSA summarizer configured with matrix_technique='binary'.
lsa_binary = SteinbergerJezekLSA(matrix_technique='binary')

### Usage example

In [31]:
count_same_results = 0

for i, passage in enumerate(preprocessed_data):
    # Every summarizer runs on every passage, always in this order, even when
    # the passage later turns out to be too short to report on.
    lsa_tfidf_result = lsa_tfidf.summarize(passage)      # LSA, tf-idf matrix
    lsa_binary_result = lsa_binary.summarize(passage)    # LSA, binary matrix
    multiword_result = mwpe.summarize(passage)           # MWPE
    tr_jaccard_result = tr_jaccard.summarize(passage)    # TextRank, Jaccard
    tr_cosine_result = tr_cosine.summarize(passage)      # TextRank, cosine

    # Passages with at most one sentence get a placeholder line instead.
    if len(passage) <= 1:
        print(i, 'passage only has 1 sentence')
        continue

    print(i, lsa_tfidf_result, lsa_binary_result, multiword_result, tr_jaccard_result, tr_cosine_result)

0 [0, 2] [3, 1] [0, 1] [0, 3] [1, 2]
1 [2, 0] [1, 0] [0, 1] [1, 0] [2, 0]


  """


2 [5, 4] [2, 6] [2, 4] [4, 1] [6, 0]
3 [2, 1] [0, 3] [0, 1] [1, 2] [3, 1]
4 [4, 1] [2, 0] [0, 2] [4, 2] [3, 0]
5 [3, 0] [1, 2] [2, 0] [0, 2] [1, 3]
6 passage only has 1 sentence
7 [1, 2] [0, 1] [1, 0] [0, 1] [2, 1]
8 [0, 1] [2, 1] [1, 2] [0, 1] [1, 0]
9 passage only has 1 sentence
10 [0, 1] [1, 0] [0, 1] [1, 0] [0, 1]
11 [2, 0] [1, 0] [0, 1] [0, 1] [2, 0]
12 [2, 1] [3, 1] [3, 0] [1, 3] [2, 3]
13 passage only has 1 sentence
14 [1, 0] [2, 0] [0, 2] [2, 1] [0, 1]
15 [1, 2] [0, 2] [0, 2] [1, 2] [0, 1]
16 [3, 0] [2, 0] [0, 1] [4, 3] [3, 1]
17 [1, 0] [1, 0] [0, 1] [1, 0] [0, 1]
18 [1, 0] [3, 1] [0, 1] [3, 1] [3, 2]
19 [1, 2] [0, 2] [1, 2] [1, 0] [2, 0]
20 [1, 2] [0, 2] [0, 2] [2, 0] [1, 2]
21 [1, 2] [0, 2] [0, 1] [1, 2] [1, 2]
22 [0, 2] [1, 2] [2, 0] [2, 1] [2, 0]
23 passage only has 1 sentence
24 [0, 1] [1, 0] [0, 1] [0, 1] [1, 0]
25 [1, 2] [2, 3] [0, 1] [1, 3] [3, 1]
26 [4, 3] [1, 4] [4, 0] [1, 2] [0, 3]
27 [2, 1] [3, 0] [3, 0] [1, 0] [4, 0]
28 [2, 1] [0, 1] [0, 1] [1, 0] [1, 0]
29 [1, 2] 

219 [2, 0] [1, 3] [0, 1] [2, 3] [0, 2]
220 [3, 1] [2, 5] [5, 0] [3, 1] [4, 2]
221 [0, 1] [2, 0] [0, 1] [2, 0] [1, 0]
222 [8, 7] [1, 2] [2, 9] [8, 1] [8, 6]
223 [1, 3] [4, 2] [4, 0] [0, 1] [2, 3]
224 [4, 2] [0, 3] [0, 1] [1, 3] [1, 3]
225 [1, 0] [2, 3] [1, 0] [0, 2] [2, 0]
226 [2, 4] [0, 1] [0, 3] [4, 3] [1, 4]
227 [4, 2] [0, 1] [0, 2] [4, 0] [4, 1]
228 [1, 2] [0, 3] [0, 1] [4, 3] [4, 2]
229 [0, 2] [1, 3] [0, 1] [0, 1] [3, 0]
230 [3, 5] [4, 1] [6, 0] [8, 0] [2, 1]
231 [3, 4] [2, 6] [2, 0] [2, 5] [0, 1]
232 [4, 1] [2, 0] [1, 0] [1, 7] [5, 7]
233 [2, 4] [4, 0] [2, 3] [3, 1] [3, 4]
234 [3, 1] [4, 2] [4, 0] [1, 3] [1, 3]
235 [2, 0] [3, 1] [3, 0] [0, 3] [2, 1]
236 [3, 1] [1, 0] [1, 0] [3, 1] [1, 3]
237 [1, 3] [2, 6] [6, 0] [4, 1] [3, 6]
238 [1, 3] [5, 4] [0, 5] [1, 5] [2, 3]
239 [2, 0] [4, 1] [0, 1] [0, 3] [0, 3]
240 [6, 1] [4, 2] [4, 0] [0, 1] [6, 0]
241 [2, 6] [1, 0] [1, 5] [3, 4] [2, 5]
242 [6, 3] [1, 0] [4, 0] [3, 1] [4, 0]
243 [2, 0] [1, 2] [1, 0] [1, 2] [1, 2]
244 [1, 0] [0, 2] [0, 1] 

433 [0, 2] [2, 1] [2, 0] [1, 2] [1, 2]
434 [5, 1] [2, 3] [2, 0] [0, 5] [0, 6]
435 [2, 1] [1, 0] [0, 3] [2, 1] [4, 0]
436 [1, 0] [0, 1] [1, 0] [0, 1] [1, 0]
437 [0, 4] [2, 4] [0, 1] [1, 3] [2, 0]
438 [3, 1] [1, 2] [1, 0] [1, 0] [2, 1]
439 [4, 1] [3, 5] [0, 3] [1, 0] [0, 2]
440 [3, 7] [0, 2] [4, 0] [1, 7] [5, 1]
441 [2, 3] [1, 0] [0, 1] [2, 1] [3, 1]


In [151]:
# Print each preprocessed sentence of one passage next to its sentence number.
idx = 211

for i, sentence_tokens in enumerate(preprocessed_data[idx]):
    print(i, ' '.join(sentence_tokens))

0 jews originated as a national and religious group in the middle east during the second millennium bce in the part of the levant known as the land of israel
1 the merneptah stele appears to confirm the existence of a people of israel associated with the god el somewhere in canaan as far back as the 13th century bce
2 the israelites as an outgrowth of the canaanite population consolidated their hold with the emergence of the kingdom of israel and the kingdom of judah
3 some consider that these canaanite sedentary israelites melded with incoming nomadic groups known as hebrews
4 though few sources in the bible mention the exilic periods in detail the experience of diaspora life from the ancient egyptian rule over the levant to assyrian captivity and exile to babylonian captivity and exile to seleucid imperial rule to the roman occupation and the historical relations between israelites and the homeland became a major feature of jewish history identity and memory


## Create test data for evaluation

In [29]:
# Select the passages with at least 10 preprocessed sentences as test material.
# The three lists stay aligned: preprocessed sentences, raw text, position.
test_data, original_test_data, indices = [], [], []
for i, sentences in enumerate(preprocessed_data):
    if len(sentences) < 10:
        continue
    test_data.append(sentences)
    original_test_data.append(data[i])
    indices.append(i)
    print(i, len(sentences))

107 11
120 10
130 10
135 10
143 11
173 11
201 12
203 11
222 11
297 10


In [28]:
import pandas as pd

In [30]:
# One row per selected passage: its original text and its position in
# `data` / `preprocessed_data`.
test_df = pd.DataFrame({'passage': original_test_data, 'index': indices})

In [31]:
test_df.head()

Unnamed: 0,passage,index
0,"In August 1836, two real estate entrepreneurs—...",107
1,"In 1636 George, Duke of Brunswick-Lüneburg, ru...",120
2,"Known during development as Xbox Next, Xenon, ...",130
3,"Biodiversity, a contraction of ""biological div...",135
4,John was born to Henry II of England and Elean...,143


In [35]:
# Export the selected passages. The DataFrame index is written as well (pandas'
# default), which is why re-reading such a file yields an "Unnamed: 0" column.
# NOTE(review): presumably the basis for the hand-labelled
# 'dataset/test_sentence_selection.csv' loaded later; confirm.
test_df.to_csv('dataset/test_sentence_selection_sample.csv')

## Performance Measure

#### Prepare labeled data from CSV

In [36]:
# Labelled passages. Besides 'passage' and 'index', the file carries the
# columns 'first', 'second' and 'third', which later cells use as sentence
# positions inside preprocessed_data[index].
labeled_data = pd.read_csv('dataset/test_sentence_selection.csv')

In [37]:
labeled_data.head()

Unnamed: 0.1,Unnamed: 0,passage,index,first,second,third
0,0,"In August 1836, two real estate entrepreneurs—...",107,0,7,1
1,1,"In 1636 George, Duke of Brunswick-Lüneburg, ru...",120,0,2,3
2,2,"Known during development as Xbox Next, Xenon, ...",130,0,8,7
3,3,"Biodiversity, a contraction of ""biological div...",135,0,9,3
4,4,John was born to Henry II of England and Elean...,143,6,8,9


In [44]:
# Pull the three label columns out as plain Python lists, one entry per labelled passage.
first, second, third = (
    labeled_data[column].tolist() for column in ('first', 'second', 'third')
)

In [61]:
# Gold summary per labelled passage: the sentences at the 'first', 'second'
# and 'third' positions, each joined back into text and terminated with '.'.
# The index column is converted to a list once, instead of once per iteration.
gold_summary = []
for idx, picks in zip(labeled_data['index'].tolist(), zip(first, second, third)):
    sentences = preprocessed_data[idx]
    gold = '. '.join(' '.join(sentences[position]) for position in picks) + '.'
    gold_summary.append(gold)

#### Prepare results from methods

In [93]:
from nltk.corpus import stopwords

In [94]:
def get_summary(array, preprocessed_data):
    """Build a three-sentence summary string from sentence positions.

    `array` supplies the positions (only its first three entries are read) and
    `preprocessed_data` is the token-list-per-sentence form of one passage.
    Each chosen sentence is joined with spaces and followed by '.', and the
    sentences are separated by a single space.
    """
    chosen = (array[0], array[1], array[2])
    sentences = [' '.join(preprocessed_data[position]) for position in chosen]
    return '. '.join(sentences) + '.'

### TextRank

In [161]:
# Fresh TextRank instances for the evaluation run, one per `similarity` option.
tr_jaccard = TextRank(similarity='jaccard')
tr_cosine = TextRank(similarity='cosine')

### MultiWordPhraseExtraction

In [162]:
# MWPE instance for the evaluation run, with explicit window_size and top_keywords.
mwpe = MultiWordPhraseExtractor(window_size=5, top_keywords=7)

### LSA

In [163]:
# Fresh LSA instance (matrix_technique='tfidf') for the evaluation run.
lsa_tfidf = SteinbergerJezekLSA(matrix_technique='tfidf')

In [164]:
# Fresh LSA instance (matrix_technique='binary') for the evaluation run.
lsa_binary = SteinbergerJezekLSA(matrix_technique='binary')

In [165]:
lsa_tfidf_results = []
lsa_binary_results = []
multiword_results = []
tr_jaccard_results = []
tr_cosine_results = []

# (summarizer, destination list) pairs, visited in this order for every passage:
# LSA tf-idf, LSA binary, MWPE, TextRank Jaccard, TextRank cosine.
summarizer_outputs = [
    (lsa_tfidf, lsa_tfidf_results),
    (lsa_binary, lsa_binary_results),
    (mwpe, multiword_results),
    (tr_jaccard, tr_jaccard_results),
    (tr_cosine, tr_cosine_results),
]

for i in labeled_data['index'].tolist():
    passage = preprocessed_data[i]
    for summarizer, collected in summarizer_outputs:
        # Ask for the top 3 sentences and render them as one summary string.
        collected.append(get_summary(summarizer.summarize(passage, top=3), passage))

In [166]:
# Per-method summary lists. The order matters: the evaluation table further
# down addresses the scores by position (0 = lsa_tfidf, 1 = lsa_binary,
# 2 = mwpe, 3 = tr_jaccard, 4 = tr_cosine).
results = [
    lsa_tfidf_results,
    lsa_binary_results,
    multiword_results,
    tr_jaccard_results,
    tr_cosine_results
]

In [131]:
# Hyper-parameter grid for MultiWordPhraseExtractor: the values tried for its
# `window_size` and `top_keywords` arguments in the sweep below.
window_size = [3, 4, 5]
keywords = [5, 6, 7]

In [137]:
# Sweep over the MWPE hyper-parameter grid (window_size x keywords).
# The sweep keeps its own names so it no longer overwrites `results`, `mwpe`
# or `multiword_results`. When the notebook runs top to bottom, the cosine
# evaluation below reads `results` by position and would otherwise score the
# first five grid settings under the five method names.
# To score the sweep itself, pass `mwpe_grid_results` to the cosine loop explicitly.
mwpe_grid_results = []
labeled_indices = labeled_data['index'].tolist()  # loop-invariant, computed once
for w in window_size:
    for k in keywords:
        print(w, k)
        mwpe_candidate = MultiWordPhraseExtractor(window_size=w, top_keywords=k)
        candidate_summaries = []
        for i in labeled_indices:
            passage = preprocessed_data[i]
            candidate_summaries.append(get_summary(mwpe_candidate.summarize(passage, top=3), passage))
            # Show the phrases the extractor exposes after summarizing this passage.
            print(mwpe_candidate.phrases)
        mwpe_grid_results.append(candidate_summaries)

3 5
['sizable numbers', 'sizable older', 'allenfrom john']
['iv semi-salic', 'iv law', 'iv william', 'succession law']
['mac g5', 'mac power', 'mac apple', 'mac hardware']
['pacific sea', 'pacific surface', 'pacific western']
['histories governance', 'histories own', 'histories traditions']
['bank peninsula', 'bank west', 'bank sinai', 'peninsula part']
['mercury freddie', 'mercury smile', 'mercury band']
['gdr period', 'gdr church', 'gdr membership', 'gdr lutheranism']
['agrarianism mohism', 'agrarianism confucianism', 'agrarianism legalism', 'taoism agrarianism']
['direct state', 'direct free', 'irish state', 'mention direct']
3 6
['sizable numbers', 'sizable older', 'allenfrom john', 'allenfrom new']
['iv semi-salic', 'iv law', 'iv william', 'succession law', 'male succession']
['mac hardware', 'mac g5', 'mac power', 'mac apple', 'power spotted']
['pacific western', 'pacific sea', 'pacific surface', 'temperature sea']
['histories governance', 'histories own', 'histories traditions']

### Content-based measures (Cosine)

In [138]:
import re, math
from collections import Counter

# A "word" is any maximal run of \w characters.
WORD = re.compile(r'\w+')


def text_to_vector(text):
    """Return a Counter mapping each word found in `text` to its number of occurrences."""
    return Counter(WORD.findall(text))


def get_cosine(text1, text2):
    """Cosine similarity between the word-count vectors of two texts.

    Returns 0.0 when either text contains no words (zero-length vector),
    otherwise the dot product of the count vectors divided by the product
    of their Euclidean norms.
    """
    vec1, vec2 = text_to_vector(text1), text_to_vector(text2)

    # Only words present in both texts contribute to the dot product.
    shared_words = vec1.keys() & vec2.keys()
    numerator = sum(vec1[word] * vec2[word] for word in shared_words)

    squares1 = sum(count ** 2 for count in vec1.values())
    squares2 = sum(count ** 2 for count in vec2.values())
    denominator = math.sqrt(squares1) * math.sqrt(squares2)

    if denominator:
        return float(numerator) / denominator
    return 0.0

In [167]:
# cosine_similarity[m][t] = similarity between gold summary t and the summary
# that method m produced for the same test passage.
cosine_similarity = [
    [
        get_cosine(gold, method_summaries[test_id])
        for test_id, gold in enumerate(gold_summary)
    ]
    for method_summaries in results
]

In [168]:
# Evaluation table: one row per labelled passage, one column per method.
# The 'index' column is taken from labeled_data, the same source the gold and
# method summaries were built from, instead of from `indices`, which belongs to
# the test-set selection cell and may not match the labelled CSV row for row.
# The positions 0..4 follow the order of the `results` list.
eval_result_df = pd.DataFrame({
    'index': labeled_data['index'].tolist(),
    'lsa_tfidf': cosine_similarity[0],
    'lsa_binary': cosine_similarity[1],
    'mwpe': cosine_similarity[2],
    'tr_jaccard': cosine_similarity[3],
    'tr_cosine': cosine_similarity[4],
})

In [169]:
# Column-wise mean over the test passages, rounded to 3 decimals.
# NOTE(review): the 'index' column holds passage positions, so its mean
# (173.100 in the saved output) is not a score.
eval_result_df.mean().round(3)

index         173.100
lsa_tfidf       0.481
lsa_binary      0.719
mwpe            0.783
tr_jaccard      0.694
tr_cosine       0.669
dtype: float64

In [170]:
# 3-decimal copy used for display and export.
rounded = eval_result_df.round(3)
rounded.head()

Unnamed: 0,index,lsa_tfidf,lsa_binary,mwpe,tr_jaccard,tr_cosine
0,107,0.487,0.896,0.931,0.677,0.737
1,120,0.527,0.749,0.872,0.932,0.707
2,130,0.338,0.715,0.671,0.612,0.62
3,135,0.269,0.6,0.613,0.609,0.685
4,143,0.542,0.677,0.608,0.579,0.639


In [171]:
# Persist the rounded per-passage scores (the DataFrame index is written too).
# NOTE(review): the file name says "without_stopwords", but no stop-word
# filtering appears in the cells shown here (`stopwords` is imported and never
# used); confirm that the name matches what was actually evaluated.
rounded.to_csv('dataset/cosine_without_stopwords_top_3.csv')