In [47]:
import re
import srt 
import glob
import pandas as pd
import os
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import spacy

In [48]:
files = glob.glob("./English_level/data_all/Subtitles_all/*.srt")

In [49]:
HTML = r'<.*?>'
TAG = r'{.*?}'
COMMENTS = r'[\(\[][A-Z ]+[\)\]]'
LETTERS = r'[^a-zA-Z\'.,!? ]'
SPACES = r'([ ])\1+'
DOTS = r'[\.]+'

def clean_subs(subs):
    txt = re.sub(HTML, ' ', subs) #html тэги меняем на пробел
    txt = re.sub(TAG, ' ', txt) #тэги меняем на пробел
    txt = re.sub(COMMENTS, ' ', txt) #комменты меняем на пробел
    txt = re.sub(LETTERS, ' ', txt) #все что не буквы меняем на пробел
    txt = re.sub(SPACES, r'\1', txt) #повторяющиеся пробелы меняем на один пробел
    txt = re.sub(DOTS, r'.', txt)  #многоточие меняем на точку
    # txt = txt.encode('ascii', 'ignore').decode() #удаляем все что не ascii символы   
    # txt = ".".join(txt.lower().split('.')[1:-1]) #удаляем первый и последний субтитр (обычно это реклама)
    return txt

In [50]:
dframes = {}

for filename in files:
    try:
        with open(filename) as f:
            subtitle_generator = srt.parse(f.read().encode('ascii', 'ignore').decode())
            df = pd.DataFrame(list([[item.start, item.end, item.content] for item in subtitle_generator]), columns=['start','end','content'])
            df['content'] = df['content'].apply(clean_subs)
            df['content'] = df['content'].str.strip().replace('', np.nan).str.lower()
            df.dropna(subset=['content'], inplace=True)
            df.reset_index(drop=True, inplace=True)

            dframes[os.path.basename(f.name)] = df

    except srt.SRTParseError as e:
        print(filename, e)

In [51]:
YEAR_SRT = r'(\(\d{4}\))'
CLEAN_TEXT = r'[^a-z\s\|]'

In [52]:
names_df = pd.DataFrame([n for n in dframes], columns=['name'])
names_df['year'] = names_df['name'].str.extract(YEAR_SRT, flags=0)
names_df['parsed_name'] = names_df.apply(lambda x: x['name'].replace(str(x['year']), '').replace('_', ' ').replace('.srt', ''), axis=1)

In [53]:
names_df

Unnamed: 0,name,year,parsed_name
0,10_Cloverfield_lane(2016).srt,(2016),10 Cloverfield lane
1,10_things_I_hate_about_you(1999).srt,(1999),10 things I hate about you
2,Aladdin(1992).srt,(1992),Aladdin
3,All_dogs_go_to_heaven(1989).srt,(1989),All dogs go to heaven
4,An_American_tail(1986).srt,(1986),An American tail
...,...,...,...
109,Warm_bodies(2013).srt,(2013),Warm bodies
110,Westworld_scenes_of_Dr_Robert_Ford.srt,,Westworld scenes of Dr Robert Ford
111,We_are_the_Millers(2013).srt,(2013),We are the Millers
112,While_You_Were_Sleeping(1995).srt,(1995),While You Were Sleeping


In [54]:
labels = pd.read_csv('English_level/data_all/labels_all.csv')
display(labels)

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles
0,Forrest Gump,Rus sub,"A2/A2+, B1",Yes
1,Finding Nemo\r\n,Everything,A2/A2+,Yes
2,Cast away\r\n,"Paid, Rus sub",A2/A2+,Yes
3,The invisible man (2020)\r\n,"Paid, Rus lan",A2/A2+,Yes
4,Back to the future\r\n,Rus sub,A2/A2+,Yes
...,...,...,...,...
111,Klaus,,C1,
112,Ocean’s Eleven,,C1,
113,Ocean’s Twelve,,C1,
114,Bridget Jones’s Baby,,C1,


In [55]:
for index in labels.index:
    item = labels.loc[index, 'Movie']
    found = process.extractOne(item, names_df['parsed_name'])
    labels.loc[index, ['found', 'probability', 'index']] = found

labels['index'] = labels['index'].astype(int)
    

In [56]:
names_df.join(labels.set_index('index'))

Unnamed: 0,name,year,parsed_name,Movie,Kinopoisk,Level,Subtitles,found,probability
0,10_Cloverfield_lane(2016).srt,(2016),10 Cloverfield lane,10 Cloverfield Lane,,B1,Yes,10 Cloverfield lane,100.0
1,10_things_I_hate_about_you(1999).srt,(1999),10 things I hate about you,10 things I hate about you,No subs,B1,Yes,10 things I hate about you,100.0
2,Aladdin(1992).srt,(1992),Aladdin,Aladdin,Everything,A2/A2+,Yes,Aladdin,100.0
3,All_dogs_go_to_heaven(1989).srt,(1989),All dogs go to heaven,All dogs go to heaven,Nope,A2/A2+,Yes,All dogs go to heaven,100.0
4,An_American_tail(1986).srt,(1986),An American tail,An American tail,Nope,A2/A2+,Yes,An American tail,97.0
...,...,...,...,...,...,...,...,...,...
109,Warm_bodies(2013).srt,(2013),Warm bodies,Warm bodies,"Everything, Paid",B1,Yes,Warm bodies,100.0
110,Westworld_scenes_of_Dr_Robert_Ford.srt,,Westworld scenes of Dr Robert Ford,,,,,,
111,We_are_the_Millers(2013).srt,(2013),We are the Millers,We’re the Millers,Nope,B1,Yes,We are the Millers,97.0
112,While_You_Were_Sleeping(1995).srt,(1995),While You Were Sleeping,While You Were Sleeping,,B1,,While You Were Sleeping,100.0


In [57]:
text = '|'.join(dframes['10_Cloverfield_lane(2016).srt']['content'])

In [58]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [67]:
def lemmatize(text):    
    doc = nlp(text)
    
    lemmatized_text = ' '.join(['' if token.is_stop else token.lemma_ for token in doc])
    lemmatized_text = re.sub(CLEAN_TEXT, ' ', lemmatized_text)
    lemmatized_text = re.sub(SPACES, r'\1', lemmatized_text)

    return lemmatized_text.split('|')

In [68]:
lemmatize(text)

['fix sync bozxphd enjoy flick',
 'ben phone michelle hang up ',
 'just talk okay believe left ',
 'michelle ',
 'come back ',
 'please something ',
 'michelle talk me ',
 'look argument couple fight ',
 'that reason leave behind ',
 'runne away go to help michelle please ',
 'newscaster detail that ',
 'elsewhere today power restored',
 'to city southern seaboard',
 'in wake afternoon widespread blackout ',
 'while inclement weather region ',
 'the problem link authority calling',
 'a catastrophic power surge cripple traffic area ',
 'no ',
 'no ',
 'damn ',
 'okay okay please ',
 'please ',
 'please hurt me ',
 'please ',
 'just let okay will tell anybody ',
 'i promise okay let please ',
 'man need fluid shock ',
 'what go me ',
 'i m go alive ',
 'work get handy these ',
 'my boyfriend expect me ',
 'he ll send cop looking ',
 'i m sorry ',
 'but look you ',
 'you ve get fight you ',
 'i respect that ',
 'but think try again ',
 'you re lucky all ',
 'and generosity extend far ',
 