In [1]:
import pandas as pd
import os
import gensim
import spacy

In [2]:
data = pd.read_csv('HouseOnlyDataset.csv')

#### Dataset of dialogues by Greg House, lead of Med-Drama House M.D.

In [3]:
data.head()

Unnamed: 0,name,line
0,House,See that? They all assume I'm a Patient becau...
1,House,I don't want them to think I'm a doctor.
2,House,People don't want a sick doctor.
3,House,"The one who can't talk, I liked that part."
4,House,And your cousin doesn't like the diagnosis. I...


In [4]:
df = data.drop('name', axis='columns')
df['id'] =df.index


In [5]:
df.head()

Unnamed: 0,line,id
0,See that? They all assume I'm a Patient becau...,0
1,I don't want them to think I'm a doctor.,1
2,People don't want a sick doctor.,2
3,"The one who can't talk, I liked that part.",3
4,And your cousin doesn't like the diagnosis. I...,4


#### Pivot Sentence for Similarity Detection

In [6]:
reference = df.loc[df['id']==1169, 'line'].iloc[0]

In [7]:
reference

' A medical tube, saving his life.'

#### Modelling with spaCy>Encore Web 

In [8]:
nlp = spacy.load('en_core_web_lg')

In [9]:
reference_vec = nlp(reference)

In [10]:
all_docs = [nlp(row) for row in df['line']]

In [13]:
sims = []
doc_id = []
for i in range(len(all_docs)):
    similar = all_docs[i].similarity(reference_vec)
    sims.append(similar)
    doc_id.append(i)
    sims_docs = pd.DataFrame(list(zip(doc_id, sims)), columns = ['doc_id', 'sims'])

#### Sorting by Similarity Score

In [22]:
sims_sorted = sims_docs.sort_values(by = 'sims', ascending = False)

In [23]:
top_sims_sorted = df.iloc[sims_sorted['doc_id'][1:6]]

In [24]:
top_sims_sorted

Unnamed: 0,line,id
6686,"No, you idiot! It's titanium. Like from a sur...",6686
21705,I recently hacked into your old files. Found ...,21705
8310,I saved a life. Two minutes out of the chair ...,8310
12842,Dad's right. This is no time for gossip. A Pa...,12842
19647,Your Patient? No. Plenty of time to save her ...,19647


In [32]:
sims_output = pd.concat([top_sims_sorted, sims_sorted['sims'][1:6]], axis = 1)

#### Output DataFrame

In [33]:
sims_output

Unnamed: 0,line,id,sims
6686,"No, you idiot! It's titanium. Like from a sur...",6686,0.89218
21705,I recently hacked into your old files. Found ...,21705,0.888499
8310,I saved a life. Two minutes out of the chair ...,8310,0.88806
12842,Dad's right. This is no time for gossip. A Pa...,12842,0.886894
19647,Your Patient? No. Plenty of time to save her ...,19647,0.885991


In [34]:
for (line, similar) in zip(sims_output['line'], sims_output['sims']):
    print("The top similar sentences are: {}\n with a Similarity Score of {:.2f}\n".format)

<built-in method format of str object at 0x00000294E778DD30>
<built-in method format of str object at 0x00000294E778DD30>
<built-in method format of str object at 0x00000294E778DD30>
<built-in method format of str object at 0x00000294E778DD30>
<built-in method format of str object at 0x00000294E778DD30>


#### Similar Sentences/Scores

In [43]:
print(sims_output.loc[sims_output['id']==6686, 'line'].iloc[0])
print(sims_output.loc[sims_output['id']==6686, 'sims'].iloc[0])

 No, you idiot! It's titanium. Like from a surgical pin, like the kind the kid had inserted into his broken arm four years ago, nice medical history.
0.8921801113936079


In [46]:
print(sims_output.loc[sims_output['id']==21705, 'line'].iloc[0])
print(sims_output.loc[sims_output['id']==21705, 'sims'].iloc[0])

 I recently hacked into your old files. Found a six-year-old boy with bilateral retinoblastoma. Doctors wanted to take his eyes out before the cancer spread to his brain, and you fought them. AdvoCated for a new photon beam radiation. Saved the kid's vision. PRobably saved his life.
0.8884990462447957


In [47]:
print(sims_output.loc[sims_output['id']==8310, 'line'].iloc[0])
print(sims_output.loc[sims_output['id']==8310, 'sims'].iloc[0])

 I saved a life. Two minutes out of the chair to save a kid's life.
0.8880598940439731
