In [42]:
from bs4 import BeautifulSoup, SoupStrainer
import requests
import os
import re
import pandas as pd
from xml.dom import minidom
from functools import reduce

In [2]:
# Scrape the IMDb episode listing of House (title tt0412142), seasons 1-8,
# and collect the numeric id of every episode link found on those pages.
# The ids are stored as strings with the "tt" prefix and one leading zero
# removed; the list may contain duplicates, so consumers de-duplicate with set().
imdb_ep_ids = []
for season in range(8):
    url = "https://www.imdb.com/title/tt0412142/episodes?season=" + str(season+1)

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops us from silently parsing an HTTP error page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    data = page.text
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(data, 'html.parser')

    # href=True skips anchors without an href attribute, which would otherwise
    # make link.get('href') return None and crash on .startswith().
    for link in soup.find_all('a', href=True):
        href = link['href']
        if not href.startswith('/title/'):
            continue
        # Links back to the series itself, and tracking links, are not episodes.
        if href.startswith('/title/tt0412142/') or 'ref' in href:
            continue

        # "/title/tt0606015/" -> "tt0606015" -> "0606015"
        ep_id = href.split("/")[2][2:]
        if not ep_id.isdigit():
            continue
        if ep_id.startswith('0'):
            ep_id = ep_id[1:]
        imdb_ep_ids.append(ep_id)

print('All episodes of House: \n {}'.format(list(set(imdb_ep_ids))))

All episodes of House: 
 ['1216108', '1503399', '774235', '1117787', '606015', '1726374', '2121957', '2121958', '1726382', '1273724', '1144658', '606038', '606032', '2016511', '1878391', '1708695', '994242', '1273726', '1273729', '1273725', '1503412', '2121956', '882414', '1081396', '1503396', '1726381', '1104387', '2121964', '1273716', '1880571', '1273720', '2015677', '994241', '606016', '1503395', '1117789', '1726375', '1274484', '777001', '976557', '1023376', '1726376', '606021', '2121955', '1136646', '1726379', '1216106', '1883788', '1503406', '979558', '606044', '606027', '1503405', '606028', '854514', '1273713', '606030', '606042', '859366', '774238', '1273718', '2121962', '606014', '897691', '888858', '1273717', '1134258', '917153', '1726377', '1123416', '871152', '1503401', '1503403', '1685104', '2121954', '1273719', '1216107', '1273732', '1564666', '1503394', '1503398', '1842081', '1168087', '2063276', '1273715', '765624', '834715', '2121961', '1697219', '606040', '1273721', '

In [3]:
def getText(nodelist):
    """Concatenate the sentence text found in a list of DOM nodes.

    Only text nodes are considered. Text nodes whose data starts with a
    newline followed by a space are skipped (treated as indentation between
    elements). For the remaining text nodes, the first run of characters
    enclosed between two newlines is taken; note that only that first line is
    captured, because ``.`` does not match a newline. When a text node has no
    such newline-delimited line (for example inline text, or a bare newline),
    its stripped content is used instead of raising ``AttributeError``.

    Parameters
    ----------
    nodelist : iterable of xml.dom.Node
        Typically ``element.childNodes``.

    Returns
    -------
    str
        The extracted fragments joined without a separator; ``''`` when
        nothing was extracted.
    """
    rc = []
    for node in nodelist:
        if node.nodeType != node.TEXT_NODE:
            continue
        if node.data.startswith('\n '):
            continue
        sentence = re.search('\n(.*)\n', node.data)
        if sentence is not None:
            rc.append(sentence.group(1))
        else:
            # No "\n...\n" pattern: fall back to the trimmed raw text.
            rc.append(node.data.strip())
    return ''.join(rc)

# Files available at: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/raw/en.zip
directory = './raw_substitles/intra-lingual_alignments/en/OpenSubtitles/raw/en/'

df_cols = ["year", "imdb_id", "sub_id", "sen_id", "sen_text"]
unique_ep_ids = list(set(imdb_ep_ids))

# Walk the corpus tree once and remember every folder whose name is one of the
# wanted episode ids, instead of re-walking the whole tree for each episode.
wanted_ids = set(unique_ep_ids)
folders_by_episode = {}
for root, dirs, _ in os.walk(directory):
    for folder in dirs:
        if folder in wanted_ids:
            folders_by_episode.setdefault(folder, []).append(os.path.join(root, folder))

# One DataFrame per episode, in the same order as unique_ep_ids. Episodes with
# no subtitles get an empty frame, so a frame from a previous episode is never
# appended a second time.
all_sub_of_all_ep = []
for x in unique_ep_ids:
    print('Episode ID: ', x)
    episode_folders = folders_by_episode.get(x, [])

    # Collect every .xml file below the episode folder(s).
    files = []
    for episode_folder in episode_folders:
        print("Folder '{}' exists.".format(x))
        for sub_root, _, names in os.walk(episode_folder):
            for name in names:
                # endswith (not "in") so that e.g. "x.xml.gz" is not picked up.
                if name.endswith('.xml'):
                    files.append(os.path.join(sub_root, name))

    if episode_folders:
        print("Has sub:")

    frames = []
    for path in files:
        # Path layout is .../<year>/<imdb_id>/<sub_id>.xml
        parts = os.path.normpath(path).split(os.sep)
        year = parts[-3]
        imdb_id = parts[-2]
        sub_id = os.path.splitext(parts[-1])[0]
        xmldoc = minidom.parse(path)
        sentences = xmldoc.getElementsByTagName('s')
        print('year: {}, imdb_id: {}, sub_id: {}, sentences: {}'.format(year, imdb_id, sub_id, len(sentences)))

        rows = [
            {
                "year": year,
                "imdb_id": imdb_id,
                "sub_id": sub_id,
                "sen_id": s.attributes['id'].value,
                "sen_text": getText(s.childNodes),
            }
            for s in sentences
        ]
        if rows:
            frames.append(pd.DataFrame(rows, columns=df_cols))

    if frames:
        # Stack all subtitle versions of the episode and keep the first
        # occurrence of each distinct sentence text.
        all_sub_of_ep = (
            pd.concat(frames, ignore_index=True)
            .drop_duplicates(subset=['sen_text'], keep='first')
            .reset_index(drop=True)
        )
    else:
        all_sub_of_ep = pd.DataFrame(columns=df_cols)

    all_sub_of_all_ep.append(all_sub_of_ep)


Episode ID:  1216108
Folder '1216108' exists.
Has sub:
year: 2008, imdb_id: 1216108, sub_id: 4337902, sentences: 794
year: 2008, imdb_id: 1216108, sub_id: 5066983, sentences: 696
year: 2008, imdb_id: 1216108, sub_id: 6966260, sentences: 729
year: 2008, imdb_id: 1216108, sub_id: 4337966, sentences: 794
year: 2008, imdb_id: 1216108, sub_id: 3284052, sentences: 887
year: 2008, imdb_id: 1216108, sub_id: 4276937, sentences: 729
year: 2008, imdb_id: 1216108, sub_id: 4338280, sentences: 729
year: 2008, imdb_id: 1216108, sub_id: 4337950, sentences: 794
year: 2008, imdb_id: 1216108, sub_id: 4338025, sentences: 729
Episode ID:  1503399
Folder '1503399' exists.
Has sub:
year: 2010, imdb_id: 1503399, sub_id: 4348902, sentences: 710
year: 2010, imdb_id: 1503399, sub_id: 4961851, sentences: 733
year: 2010, imdb_id: 1503399, sub_id: 3654065, sentences: 710
year: 2010, imdb_id: 1503399, sub_id: 4956971, sentences: 653
year: 2010, imdb_id: 1503399, sub_id: 3654116, sentences: 701
year: 2010, imdb_id: 1

year: 2010, imdb_id: 1708695, sub_id: 4830105, sentences: 872
year: 2010, imdb_id: 1708695, sub_id: 3945200, sentences: 860
year: 2010, imdb_id: 1708695, sub_id: 3944269, sentences: 875
year: 2010, imdb_id: 1708695, sub_id: 4375809, sentences: 823
Episode ID:  994242
Folder '994242' exists.
Has sub:
year: 2007, imdb_id: 994242, sub_id: 3121688, sentences: 737
year: 2007, imdb_id: 994242, sub_id: 5969710, sentences: 726
year: 2007, imdb_id: 994242, sub_id: 3117128, sentences: 716
year: 2007, imdb_id: 994242, sub_id: 3259612, sentences: 740
year: 2007, imdb_id: 994242, sub_id: 4089224, sentences: 726
year: 2007, imdb_id: 994242, sub_id: 3117544, sentences: 736
Episode ID:  1273726
Folder '1273726' exists.
Has sub:
year: 2009, imdb_id: 1273726, sub_id: 5066774, sentences: 680
year: 2009, imdb_id: 1273726, sub_id: 4382881, sentences: 702
year: 2009, imdb_id: 1273726, sub_id: 4961845, sentences: 700
year: 2009, imdb_id: 1273726, sub_id: 6405476, sentences: 648
year: 2009, imdb_id: 1273726, 

Folder '2015677' exists.
Has sub:
year: 2011, imdb_id: 2015677, sub_id: 4253265, sentences: 805
year: 2011, imdb_id: 2015677, sub_id: 4253001, sentences: 780
year: 2011, imdb_id: 2015677, sub_id: 4253174, sentences: 756
year: 2011, imdb_id: 2015677, sub_id: 6511565, sentences: 702
year: 2011, imdb_id: 2015677, sub_id: 6371190, sentences: 763
year: 2011, imdb_id: 2015677, sub_id: 4253268, sentences: 777
Episode ID:  994241
Folder '994241' exists.
Has sub:
year: 2007, imdb_id: 994241, sub_id: 3121403, sentences: 744
year: 2007, imdb_id: 994241, sub_id: 3122911, sentences: 766
year: 2007, imdb_id: 994241, sub_id: 3121404, sentences: 744
year: 2007, imdb_id: 994241, sub_id: 6496347, sentences: 773
year: 2007, imdb_id: 994241, sub_id: 3259615, sentences: 770
year: 2007, imdb_id: 994241, sub_id: 4903168, sentences: 750
year: 2007, imdb_id: 994241, sub_id: 5970673, sentences: 750
year: 2007, imdb_id: 994241, sub_id: 3121055, sentences: 769
year: 2007, imdb_id: 994241, sub_id: 4864763, sentenc

year: 2008, imdb_id: 1216106, sub_id: 5113873, sentences: 794
year: 2008, imdb_id: 1216106, sub_id: 6965639, sentences: 862
year: 2008, imdb_id: 1216106, sub_id: 4338023, sentences: 862
Episode ID:  1883788
Folder '1883788' exists.
Has sub:
year: 2011, imdb_id: 1883788, sub_id: 6976129, sentences: 746
year: 2011, imdb_id: 1883788, sub_id: 4171794, sentences: 748
year: 2011, imdb_id: 1883788, sub_id: 4375823, sentences: 704
year: 2011, imdb_id: 1883788, sub_id: 4171793, sentences: 762
year: 2011, imdb_id: 1883788, sub_id: 4172081, sentences: 749
year: 2011, imdb_id: 1883788, sub_id: 4172078, sentences: 763
Episode ID:  1503406
Folder '1503406' exists.
Has sub:
year: 2010, imdb_id: 1503406, sub_id: 5434411, sentences: 689
year: 2010, imdb_id: 1503406, sub_id: 3678852, sentences: 724
year: 2010, imdb_id: 1503406, sub_id: 4350509, sentences: 897
year: 2010, imdb_id: 1503406, sub_id: 6083999, sentences: 724
year: 2010, imdb_id: 1503406, sub_id: 3678830, sentences: 725
year: 2010, imdb_id: 1

year: 2009, imdb_id: 1273718, sub_id: 4382898, sentences: 775
year: 2009, imdb_id: 1273718, sub_id: 6276783, sentences: 827
year: 2009, imdb_id: 1273718, sub_id: 6019399, sentences: 777
year: 2009, imdb_id: 1273718, sub_id: 3442349, sentences: 860
year: 2009, imdb_id: 1273718, sub_id: 5880733, sentences: 775
Episode ID:  2121962
Folder '2121962' exists.
Has sub:
year: 2012, imdb_id: 2121962, sub_id: 5131628, sentences: 620
year: 2012, imdb_id: 2121962, sub_id: 4883697, sentences: 662
year: 2012, imdb_id: 2121962, sub_id: 4535685, sentences: 669
year: 2012, imdb_id: 2121962, sub_id: 4535684, sentences: 705
year: 2012, imdb_id: 2121962, sub_id: 6371182, sentences: 702
year: 2012, imdb_id: 2121962, sub_id: 4535683, sentences: 669
year: 2012, imdb_id: 2121962, sub_id: 6514341, sentences: 592
year: 2012, imdb_id: 2121962, sub_id: 4535681, sentences: 705
year: 2012, imdb_id: 2121962, sub_id: 4558662, sentences: 705
year: 2012, imdb_id: 2121962, sub_id: 4558664, sentences: 669
year: 2012, imd

Episode ID:  1273719
Folder '1273719' exists.
Has sub:
year: 2009, imdb_id: 1273719, sub_id: 5435727, sentences: 760
year: 2009, imdb_id: 1273719, sub_id: 3449079, sentences: 846
year: 2009, imdb_id: 1273719, sub_id: 4382875, sentences: 778
year: 2009, imdb_id: 1273719, sub_id: 3449080, sentences: 820
year: 2009, imdb_id: 1273719, sub_id: 5884047, sentences: 733
year: 2009, imdb_id: 1273719, sub_id: 4382899, sentences: 733
Episode ID:  1216107
Folder '1216107' exists.
Has sub:
year: 2008, imdb_id: 1216107, sub_id: 4337914, sentences: 877
year: 2008, imdb_id: 1216107, sub_id: 5714769, sentences: 776
year: 2008, imdb_id: 1216107, sub_id: 4337962, sentences: 877
year: 2008, imdb_id: 1216107, sub_id: 3285431, sentences: 1033
year: 2008, imdb_id: 1216107, sub_id: 6965655, sentences: 838
year: 2008, imdb_id: 1216107, sub_id: 4338292, sentences: 837
year: 2008, imdb_id: 1216107, sub_id: 4276936, sentences: 837
year: 2008, imdb_id: 1216107, sub_id: 3281675, sentences: 1033
year: 2008, imdb_id:

Episode ID:  1273721
Folder '1273721' exists.
Has sub:
year: 2009, imdb_id: 1273721, sub_id: 5190408, sentences: 769
year: 2009, imdb_id: 1273721, sub_id: 3467670, sentences: 777
year: 2009, imdb_id: 1273721, sub_id: 5055901, sentences: 705
year: 2009, imdb_id: 1273721, sub_id: 6403364, sentences: 740
year: 2009, imdb_id: 1273721, sub_id: 4382877, sentences: 779
year: 2009, imdb_id: 1273721, sub_id: 4382901, sentences: 740
year: 2009, imdb_id: 1273721, sub_id: 3467669, sentences: 805
Episode ID:  1503397
Folder '1503397' exists.
Has sub:
year: 2010, imdb_id: 1503397, sub_id: 4348900, sentences: 1060
year: 2010, imdb_id: 1503397, sub_id: 4350502, sentences: 752
year: 2010, imdb_id: 1503397, sub_id: 5264983, sentences: 761
year: 2010, imdb_id: 1503397, sub_id: 4961849, sentences: 749
year: 2010, imdb_id: 1503397, sub_id: 3637353, sentences: 1046
year: 2010, imdb_id: 1503397, sub_id: 3637354, sentences: 1017
Episode ID:  2121965
Folder '2121965' exists.
Has sub:
year: 2012, imdb_id: 21219

Episode ID:  606036
Folder '606036' exists.
Has sub:
year: 2005, imdb_id: 606036, sub_id: 190353, sentences: 850
year: 2005, imdb_id: 606036, sub_id: 4172788, sentences: 814
year: 2005, imdb_id: 606036, sub_id: 4016121, sentences: 814
year: 2005, imdb_id: 606036, sub_id: 6223160, sentences: 814
year: 2005, imdb_id: 606036, sub_id: 4357870, sentences: 847
year: 2005, imdb_id: 606036, sub_id: 180475, sentences: 813
year: 2005, imdb_id: 606036, sub_id: 5659274, sentences: 814
year: 2005, imdb_id: 606036, sub_id: 6692318, sentences: 843
year: 2005, imdb_id: 606036, sub_id: 6955674, sentences: 814
Episode ID:  606031
Folder '606031' exists.
Has sub:
year: 2005, imdb_id: 606031, sub_id: 4310304, sentences: 757
year: 2005, imdb_id: 606031, sub_id: 180482, sentences: 773
year: 2005, imdb_id: 606031, sub_id: 6093973, sentences: 845
year: 2005, imdb_id: 606031, sub_id: 4357877, sentences: 869
year: 2005, imdb_id: 606031, sub_id: 4176433, sentences: 845
year: 2005, imdb_id: 606031, sub_id: 190360

year: 2006, imdb_id: 606023, sub_id: 4332356, sentences: 841
Episode ID:  606026
Folder '606026' exists.
Has sub:
year: 2005, imdb_id: 606026, sub_id: 5126026, sentences: 836
year: 2005, imdb_id: 606026, sub_id: 4578948, sentences: 790
year: 2005, imdb_id: 606026, sub_id: 3459116, sentences: 904
year: 2005, imdb_id: 606026, sub_id: 180492, sentences: 904
year: 2005, imdb_id: 606026, sub_id: 4305056, sentences: 839
year: 2005, imdb_id: 606026, sub_id: 3165422, sentences: 839
year: 2005, imdb_id: 606026, sub_id: 4939817, sentences: 787
year: 2005, imdb_id: 606026, sub_id: 3825719, sentences: 883
year: 2005, imdb_id: 606026, sub_id: 6396707, sentences: 788
year: 2005, imdb_id: 606026, sub_id: 4292595, sentences: 903
year: 2005, imdb_id: 606026, sub_id: 3465911, sentences: 839
year: 2005, imdb_id: 606026, sub_id: 4119042, sentences: 934
Episode ID:  606046
Folder '606046' exists.
Has sub:
year: 2005, imdb_id: 606046, sub_id: 6092161, sentences: 828
year: 2005, imdb_id: 606046, sub_id: 3266

year: 2004, imdb_id: 606035, sub_id: 180468, sentences: 749
year: 2004, imdb_id: 606035, sub_id: 4016114, sentences: 877
year: 2004, imdb_id: 606035, sub_id: 3863959, sentences: 877
year: 2004, imdb_id: 606035, sub_id: 4357863, sentences: 907
year: 2004, imdb_id: 606035, sub_id: 4811955, sentences: 877
year: 2004, imdb_id: 606035, sub_id: 4142444, sentences: 877
year: 2004, imdb_id: 606035, sub_id: 6100034, sentences: 937
Episode ID:  1399759
Folder '1399759' exists.
Has sub:
year: 2009, imdb_id: 1399759, sub_id: 6407167, sentences: 747
year: 2009, imdb_id: 1399759, sub_id: 4961846, sentences: 785
year: 2009, imdb_id: 1399759, sub_id: 4382882, sentences: 787
year: 2009, imdb_id: 1399759, sub_id: 3504746, sentences: 820
year: 2009, imdb_id: 1399759, sub_id: 5066865, sentences: 679
year: 2009, imdb_id: 1399759, sub_id: 6790224, sentences: 787
year: 2009, imdb_id: 1399759, sub_id: 3504747, sentences: 753
year: 2009, imdb_id: 1399759, sub_id: 4382906, sentences: 747
Episode ID:  2121953
Fo

year: 2005, imdb_id: 606043, sub_id: 180489, sentences: 749
year: 2005, imdb_id: 606043, sub_id: 190366, sentences: 816
year: 2005, imdb_id: 606043, sub_id: 6127499, sentences: 796
Episode ID:  834714
Folder '834714' exists.
Has sub:
year: 2006, imdb_id: 834714, sub_id: 3165453, sentences: 751
year: 2006, imdb_id: 834714, sub_id: 5861088, sentences: 747
year: 2006, imdb_id: 834714, sub_id: 3435533, sentences: 764
year: 2006, imdb_id: 834714, sub_id: 4899126, sentences: 727
year: 2006, imdb_id: 834714, sub_id: 4910535, sentences: 769
year: 2006, imdb_id: 834714, sub_id: 6692051, sentences: 742
year: 2006, imdb_id: 834714, sub_id: 3112983, sentences: 751
year: 2006, imdb_id: 834714, sub_id: 6492993, sentences: 763
Episode ID:  1178841
Folder '1178841' exists.
Has sub:
year: 2008, imdb_id: 1178841, sub_id: 4337901, sentences: 837
year: 2008, imdb_id: 1178841, sub_id: 4337949, sentences: 837
year: 2008, imdb_id: 1178841, sub_id: 4337965, sentences: 837
year: 2008, imdb_id: 1178841, sub_id:

Episode ID:  2121963
Folder '2121963' exists.
Has sub:
year: 2012, imdb_id: 2121963, sub_id: 6371194, sentences: 703
year: 2012, imdb_id: 2121963, sub_id: 4543689, sentences: 758
year: 2012, imdb_id: 2121963, sub_id: 4543690, sentences: 718
year: 2012, imdb_id: 2121963, sub_id: 4543245, sentences: 758
year: 2012, imdb_id: 2121963, sub_id: 4543246, sentences: 718
year: 2012, imdb_id: 2121963, sub_id: 4543468, sentences: 718
year: 2012, imdb_id: 2121963, sub_id: 5057774, sentences: 641
Episode ID:  606037
Folder '606037' exists.
Has sub:
year: 2005, imdb_id: 606037, sub_id: 180484, sentences: 709
year: 2005, imdb_id: 606037, sub_id: 4177654, sentences: 767
year: 2005, imdb_id: 606037, sub_id: 4357879, sentences: 793
year: 2005, imdb_id: 606037, sub_id: 6093975, sentences: 767
year: 2005, imdb_id: 606037, sub_id: 3266230, sentences: 709
year: 2005, imdb_id: 606037, sub_id: 4310296, sentences: 692
year: 2005, imdb_id: 606037, sub_id: 190362, sentences: 796
Episode ID:  1726380
Folder '1726

In [411]:
# Display the per-episode DataFrame currently bound to `all_sub_of_ep`.
all_sub_of_ep

Unnamed: 0,year,imdb_id,sub_id,sen_id,sen_text
0,2006,873021,3091909,1,Here you go.
1,2006,873021,3091909,2,That fried crap will kill you.
2,2006,873021,3091909,3,See?
3,2006,873021,3091909,4,Time to make the donuts.
4,2006,873021,3091909,5,Hands in the air!
5,2006,873021,3091909,6,Right now!
6,2006,873021,3091909,7,- Get up!
7,2006,873021,3091909,8,"Move out here, let's go!"
8,2006,873021,3091909,9,- Move it!
9,2006,873021,3091909,10,Purses and wallets on the counter!


In [413]:
# Number of distinct sentence texts in the current episode frame
# (dropna=False so missing values are counted exactly like .unique() does).
all_sub_of_ep['sen_text'].nunique(dropna=False)

1747

In [7]:
# Compare the number of distinct episode ids with the number of per-episode frames.
distinct_episode_ids = set(imdb_ep_ids)
print(len(distinct_episode_ids))
print(len(all_sub_of_all_ep))

176
176


In [36]:
# Print every per-episode frame that contains rows from subtitle file 3091909.
for episode_frame in all_sub_of_all_ep:
    has_subtitle = (episode_frame['sub_id'].values == '3091909').any()
    if has_subtitle:
        print(episode_frame)

      year imdb_id   sub_id sen_id  \
0     2006  873021  3091909      1   
1     2006  873021  3091909      2   
2     2006  873021  3091909      3   
3     2006  873021  3091909      4   
4     2006  873021  3091909      5   
5     2006  873021  3091909      6   
6     2006  873021  3091909      7   
7     2006  873021  3091909      8   
8     2006  873021  3091909      9   
9     2006  873021  3091909     10   
10    2006  873021  3091909     12   
11    2006  873021  3091909     13   
12    2006  873021  3091909     14   
13    2006  873021  3091909     15   
14    2006  873021  3091909     16   
15    2006  873021  3091909     17   
16    2006  873021  3091909     19   
17    2006  873021  3091909     20   
18    2006  873021  3091909     21   
19    2006  873021  3091909     22   
20    2006  873021  3091909     23   
21    2006  873021  3091909     24   
22    2006  873021  3091909     26   
23    2006  873021  3091909     27   
24    2006  873021  3091909     28   
25    2006  

In [40]:
# Total number of rows (sentences) across all per-episode frames.
total_amount_sentences = sum(len(episode_frame) for episode_frame in all_sub_of_all_ep)

print(total_amount_sentences)
    

284492


In [43]:
# Combine all per-episode dataframes in all_sub_of_all_ep into one table.
# The frames share the same columns, so stacking them with a single concat is
# equivalent to chaining outer merges on every column, but it runs in one pass
# and keeps the rows in list order. drop_duplicates() removes rows that are
# identical in every column, which is what the outer merge did for rows present
# on both sides. Empty frames are skipped, and an empty list yields an empty table.
non_empty_frames = [frame for frame in all_sub_of_all_ep if len(frame)]
if non_empty_frames:
    all_house_subs = (
        pd.concat(non_empty_frames, ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )
else:
    all_house_subs = pd.DataFrame(columns=["year", "imdb_id", "sub_id", "sen_id", "sen_text"])
all_house_subs

Unnamed: 0,year,imdb_id,sub_id,sen_id,sen_text
0,2008,1216108,4337902,1,You like that?
1,2008,1216108,4337902,2,I don't know.
2,2008,1216108,4337902,3,You want me to use my butt again?
3,2008,1216108,4337902,4,I don't know how I got here.
4,2008,1216108,4337902,5,- How many drinks did I have?
5,2008,1216108,4337902,6,- Your scotch hasn't even arrived yet.
6,2008,1216108,4337902,7,Means I was drunk when I got here.
7,2008,1216108,4337902,8,8:50.
8,2008,1216108,4337902,9,I remember being at work.
9,2008,1216108,4337902,10,I've lost at least four hours.


In [48]:
# Order the combined table in place by year, then episode id, then subtitle id
# (ascending, row-wise — the sort_values defaults).
sort_keys = ['year', 'imdb_id', 'sub_id']
all_house_subs.sort_values(by=sort_keys, inplace=True)

In [50]:
# all_house_subs.to_csv('house_subs.csv', index=None, sep=';')