In [121]:
import re
import srt 
import glob
import pandas as pd
import os
import numpy as np

In [18]:
files = glob.glob("./English_level/data_all/Subtitles_all/*.srt")

In [72]:
HTML = r'<.*?>'
TAG = r'{.*?}'
COMMENTS = r'[\(\[][A-Z ]+[\)\]]'
LETTERS = r'[^a-zA-Z\'.,!? ]'
SPACES = r'([ ])\1+'
DOTS = r'[\.]+'

def clean_subs(subs):
    txt = re.sub(HTML, ' ', subs) #html тэги меняем на пробел
    txt = re.sub(TAG, ' ', txt) #тэги меняем на пробел
    txt = re.sub(COMMENTS, ' ', txt) #комменты меняем на пробел
    txt = re.sub(LETTERS, ' ', txt) #все что не буквы меняем на пробел
    txt = re.sub(SPACES, r'\1', txt) #повторяющиеся пробелы меняем на один пробел
    txt = re.sub(DOTS, r'.', txt)  #многоточие меняем на точку
    # txt = txt.encode('ascii', 'ignore').decode() #удаляем все что не ascii символы   
    # txt = ".".join(txt.lower().split('.')[1:-1]) #удаляем первый и последний субтитр (обычно это реклама)
    return txt

In [147]:
dframes = {}

for filename in files:
    try:
        with open(filename) as f:
            subtitle_generator = srt.parse(f.read().encode('ascii', 'ignore').decode())
            df = pd.DataFrame(list([[item.start, item.end, item.content] for item in subtitle_generator]), columns=['start','end','content'])
            df['content'] = df['content'].apply(clean_subs)
            df['content'] = df['content'].str.strip().replace('', np.nan)
            df.dropna(subset=['content'], inplace=True)
            df.reset_index(drop=True, inplace=True)

            dframes[os.path.basename(f.name)] = df

    except srt.SRTParseError as e:
        print(filename, e)

In [148]:
dframes['10_Cloverfield_lane(2016).srt']

Unnamed: 0,start,end,content
0,0 days 00:00:55.279000,0 days 00:01:07.279000,Fixed Synced by bozxphd. Enjoy The Flick
1,0 days 00:04:20.200000,0 days 00:04:22.919000,"BEN ON PHONE Michelle, please don't hang up."
2,0 days 00:04:23.680000,0 days 00:04:27.673000,"Just talk to me, okay? I can't believe you jus..."
3,0 days 00:04:28.840000,0 days 00:04:30.068000,Michelle.
4,0 days 00:04:30.920000,0 days 00:04:32.273000,Come back.
...,...,...,...
839,0 days 01:36:49.560000,0 days 01:36:50.913000,Come join us.
840,0 days 01:36:52.240000,0 days 01:36:54.390000,We've taken back the southern seaboard.
841,0 days 01:36:55.520000,0 days 01:36:56.839000,And we 're winning.
842,0 days 01:36:57.360000,0 days 01:37:00.636000,But if you have any medical training or combat...


In [171]:
YEAR_SRT = r'(\(\d{4}\))'

In [165]:
names_df = pd.DataFrame([n for n in dframes], columns=['name'])

In [166]:
names_df

Unnamed: 0,name
0,10_Cloverfield_lane(2016).srt
1,10_things_I_hate_about_you(1999).srt
2,Aladdin(1992).srt
3,All_dogs_go_to_heaven(1989).srt
4,An_American_tail(1986).srt
...,...
109,Warm_bodies(2013).srt
110,Westworld_scenes_of_Dr_Robert_Ford.srt
111,We_are_the_Millers(2013).srt
112,While_You_Were_Sleeping(1995).srt


In [174]:
names_df['year'] = names_df['name'].str.extract(YEAR_SRT, flags=0)


In [188]:
names_df

Unnamed: 0,name,year
0,10_Cloverfield_lane(2016).srt,(2016)
1,10_things_I_hate_about_you(1999).srt,(1999)
2,Aladdin(1992).srt,(1992)
3,All_dogs_go_to_heaven(1989).srt,(1989)
4,An_American_tail(1986).srt,(1986)
...,...,...
109,Warm_bodies(2013).srt,(2013)
110,Westworld_scenes_of_Dr_Robert_Ford.srt,
111,We_are_the_Millers(2013).srt,(2013)
112,While_You_Were_Sleeping(1995).srt,(1995)
