In [235]:
import re
import srt 
import glob
import pandas as pd
import os
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import spacy

In [236]:
# glob returns paths in arbitrary (filesystem-dependent) order; sort them so that
# everything derived from this list is reproducible between runs and machines.
files = sorted(glob.glob("./English_level/data_all/Subtitles_all/*.srt"))

In [237]:
# Regular expressions used to scrub raw subtitle text.
HTML = r'<.*?>'                     # anything enclosed in angle brackets
TAG = r'{.*?}'                      # anything enclosed in curly braces
COMMENTS = r'[\(\[][A-Z ]+[\)\]]'   # upper-case remarks inside () or []
LETTERS = r'[^a-zA-Z\'.,!? ]'       # every character except letters, ' . , ! ? and space
SPACES = r'([ ])\1+'                # a run of two or more spaces
DOTS = r'[\.]+'                     # one or more consecutive dots


def clean_subs(subs):
    """Return the subtitle text ``subs`` with markup and noise removed.

    The substitutions are applied in this order:

    1. angle-bracket markup  -> single space
    2. curly-brace tags      -> single space
    3. upper-case remarks in round/square brackets -> single space
    4. any character other than letters, ``' . , ! ?`` and space -> single space
    5. runs of spaces        -> one space
    6. runs of dots          -> one dot

    The result is not stripped and its letter case is left unchanged.
    """
    substitutions = (
        (HTML, ' '),
        (TAG, ' '),
        (COMMENTS, ' '),
        (LETTERS, ' '),
        (SPACES, r'\1'),
        (DOTS, r'.'),
    )
    cleaned = subs
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Two further steps are intentionally left disabled:
    #   - dropping all non-ASCII characters:
    #       cleaned = cleaned.encode('ascii', 'ignore').decode()
    #   - dropping the first and the last sentence (usually advertising):
    #       cleaned = ".".join(cleaned.lower().split('.')[1:-1])
    return cleaned

In [238]:
# Parsed subtitles, keyed by the subtitle file's base name.
# Each value is a DataFrame with the columns 'start', 'end' and 'content'.
dframes = {}

for filename in files:
    try:
        # Decode as ASCII and silently drop every other byte. This yields the same
        # ASCII-only text as decoding with the platform default encoding and then
        # discarding non-ASCII characters, but it does not depend on the locale and
        # cannot raise UnicodeDecodeError on files in an unexpected encoding.
        with open(filename, encoding='ascii', errors='ignore') as f:
            raw_text = f.read()
        # srt.parse is lazy, so it has to be consumed inside the try block for
        # parse errors to be caught here.
        subtitles = list(srt.parse(raw_text))
    except srt.SRTParseError as e:
        # Report the broken file and carry on with the remaining ones.
        print(filename, e)
        continue

    df = pd.DataFrame(
        [[item.start, item.end, item.content] for item in subtitles],
        columns=['start', 'end', 'content'],
    )
    df['content'] = (
        df['content']
        .apply(clean_subs)
        .str.strip()
        .replace('', np.nan)  # mark subtitles that became empty after cleaning
        .str.lower()
    )
    # Drop the emptied subtitles and renumber the remaining rows from 0.
    df = df.dropna(subset=['content']).reset_index(drop=True)

    dframes[os.path.basename(filename)] = df

In [239]:
# Matches every character that is not a lowercase letter, whitespace or '|'.
CLEAN_TEXT = r'[^a-z\s\|]'
# Captures a four-digit number wrapped in parentheses, e.g. "(1994)".
YEAR_SRT = r'(\(\d{4}\))'

In [240]:
# One row per successfully parsed subtitle file.
names_df = pd.DataFrame(list(dframes), columns=['name'])
# "(YYYY)" token from the file name; NaN when the name contains none.
names_df['year'] = names_df['name'].str.extract(YEAR_SRT, expand=False)
# Human-readable title: file name without the year token, underscores and extension.
# A missing year is replaced by '' first; otherwise str(NaN) == 'nan' would be
# removed from the title itself (e.g. "Banana" -> "Baa").
names_df['parsed_name'] = [
    name.replace(year, '').replace('_', ' ').replace('.srt', '')
    for name, year in zip(names_df['name'], names_df['year'].fillna(''))
]

In [241]:
# Read the table of movies with their English level labels and show it.
labels_path = 'English_level/data_all/labels_all.csv'
labels = pd.read_csv(labels_path)
display(labels)

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles
0,Forrest Gump,Rus sub,"A2/A2+, B1",Yes
1,Finding Nemo\n,Everything,A2/A2+,Yes
2,Cast away\n,"Paid, Rus sub",A2/A2+,Yes
3,The invisible man (2020)\n,"Paid, Rus lan",A2/A2+,Yes
4,Back to the future\n,Rus sub,A2/A2+,Yes
...,...,...,...,...
106,Klaus,,C1,
107,Ocean’s Eleven,,C1,
108,Ocean’s Twelve,,C1,
109,Bridget Jones’s Baby,,C1,


In [242]:
# For every labelled movie title, pick the closest parsed subtitle name.
# With a Series as the set of choices, extractOne returns a 3-tuple:
# (matched value, score, label of the matched row in the Series).
if names_df.empty:
    raise ValueError('names_df is empty: there are no subtitle names to match against')

matches = pd.DataFrame(
    [process.extractOne(title, names_df['parsed_name']) for title in labels['Movie']],
    columns=['found', 'probability', 'index'],
    index=labels.index,
)

# Assign whole columns at once instead of enlarging the frame row by row.
labels['found'] = matches['found']
labels['probability'] = matches['probability'].astype(float)
# Row label in names_df of the matched subtitle file; used later as a join key.
labels['index'] = matches['index'].astype(int)
    

In [243]:
# Re-key the labels by the matched names_df row and attach that row's
# file name, year and parsed title to each labelled movie.
labels_by_match = labels.set_index('index')
films = labels_by_match.join(names_df)
display(films)

Unnamed: 0_level_0,Movie,Kinopoisk,Level,Subtitles,found,probability,name,year,parsed_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
37,Forrest Gump,Rus sub,"A2/A2+, B1",Yes,Forrest Gump,100.0,Forrest_Gump(1994).srt,(1994),Forrest Gump
36,Finding Nemo\n,Everything,A2/A2+,Yes,Finding Nemo,100.0,Finding_Nemo(2003).srt,(2003),Finding Nemo
21,Cast away\n,"Paid, Rus sub",A2/A2+,Yes,Cast away,100.0,Cast_away(2000).srt,(2000),Cast away
91,The invisible man (2020)\n,"Paid, Rus lan",A2/A2+,Yes,The invisible man,95.0,The_invisible_man(2020).srt,(2020),The invisible man
8,Back to the future\n,Rus sub,A2/A2+,Yes,Back to the future,100.0,Back_to_the_future(1985).srt,(1985),Back to the future
...,...,...,...,...,...,...,...,...,...
48,Klaus,,C1,,Klaus,100.0,Klaus(2019).srt,(2019),Klaus
67,Ocean’s Eleven,,C1,,Oceans Eleven,96.0,Oceans_Eleven(2001).srt,(2001),Oceans Eleven
68,Ocean’s Twelve,,C1,,Oceans Twelve,96.0,Oceans_Twelve(2004).srt,(2004),Oceans Twelve
17,Bridget Jones’s Baby,,C1,,Bridget Joness Baby,97.0,Bridget_Joness_Baby.srt,,Bridget Joness Baby


In [244]:
# Load the small English spaCy model with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def lemmatize(text):
    """Lemmatize ``text`` and return it split into pieces on the '|' character.

    Every token is replaced by its lemma, and stop words are replaced by an
    empty string. The pieces are joined with single spaces, every character
    matched by CLEAN_TEXT is turned into a space, and runs of spaces are
    collapsed into one. The resulting string is split on '|', so the return
    value is a list of strings (not stripped).
    """
    pieces = []
    for token in nlp(text):
        pieces.append('' if token.is_stop else token.lemma_)

    joined = ' '.join(pieces)
    joined = re.sub(CLEAN_TEXT, ' ', joined)
    joined = re.sub(SPACES, r'\1', joined)

    return joined.split('|')

In [245]:
# Distinct values of the 'Level' column, in order of first appearance.
pd.unique(films['Level'])

array(['A2/A2+, B1', 'A2/A2+', 'A2/A2+/B1', 'B1', 'B1, B2', 'B2', 'B1/B2',
       'C1'], dtype=object)

In [246]:
# for index in films.index:
#     name = films.loc[index, 'name']
#     text = '|'.join(dframes[name]['content'])
#     result = lemmatize(text)
#     dframes[name]['content'] = pd.Series(result)
#     dframes[name].to_csv('Parsed/srt/' + name + '.csv')