# Импорт модулей

In [1]:
import pandas as pd

import pysrt
import os

import re
import nltk
# nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Предварительная Обработка

Введем константы.

In [2]:
SUBS_DIR = './subtitles/'
SEED = 5115

Откроем файл с разметкой данных

In [3]:
df = pd.read_csv("labels.csv")
df.head(5)

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles
0,Forrest Gump,Rus sub,"A2/A2+, B1",Yes
1,Finding Nemo\n,Everything,A2/A2+,Yes
2,Cast away\n,"Paid, Rus sub",A2/A2+,Yes
3,The invisible man (2020)\n,"Paid, Rus lan",A2/A2+,Yes
4,Back to the future\n,Rus sub,A2/A2+,Yes


In [4]:
df['Level'].value_counts()

B1            28
A2/A2+        27
B2            19
B1, B2         9
A2/A2+, B1     5
Name: Level, dtype: int64

Изменим название фильмов в колонке Movie, чтобы они соответствовали названиям файлов субтитров.

In [5]:
df['Movie'] = df['Movie'].str.strip()\
                         .str.replace('\n', '')\
                         .str.replace(' ', '_')

Изменим метки в колонке Level, размножим строки, содержащие больше одной метки.

In [6]:
mapper = {'B1': ['B1'],
          'A2/A2+': ['A2'],
          'B2': ['B2'],
          'B1, B2': ['B1', 'B2'],
          'A2/A2+, B1': ['A2', 'B1']}

df['Level'] = df['Level'].map(mapper)
df = df.explode('Level')
df.head()

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles
0,Forrest_Gump,Rus sub,A2,Yes
0,Forrest_Gump,Rus sub,B1,Yes
1,Finding_Nemo,Everything,A2,Yes
2,Cast_away,"Paid, Rus sub",A2,Yes
3,The_invisible_man_(2020),"Paid, Rus lan",A2,Yes


Добавим в датафрейм названия файлов субтитров.

In [7]:
for index, row in df.iterrows():
    for file in os.scandir(SUBS_DIR):
        if file.name.lower().startswith(row['Movie'].lower()):
            df.loc[index, 'Subs_file'] = file.name


df[df['Subs_file'].isna()]['Movie'].sort_values()

74                                    A_knight’s_tale
68                                  An__American_tail
51                        Beauty_and_the_beast_(film)
81    Eurovision_Song_Contest:_The_Story_of_Fire_Saga
26                                   Harry_Potter_(1)
80                              It’s_a_wonderful_life
71                                         Liar,_liar
58                                 Lie_to_me_(series)
58                                 Lie_to_me_(series)
78                                    Moulin_Rouge_🎙️
78                                    Moulin_Rouge_🎙️
46                                     Mrs._Doubtfire
30                       Pirates_of_the_Caribbean_(1)
79                            The_Greatest_Showman_🎙️
83                        The_Walking_Dead__(series)🧟
84                           The_fault_in_our_stars_😭
84                           The_fault_in_our_stars_😭
3                            The_invisible_man_(2020)
49                          

Добавим часть названий вручную.

In [8]:
df.loc[74, 'Subs_file'] = 'A_knights_tale(2001).srt'
df.loc[68, 'Subs_file'] = 'An_American_tail(1986).srt'
df.loc[51, 'Subs_file'] = 'Beauty_and_the_beast(2017).srt'
df.loc[81, 'Subs_file'] = 'Eurovision_song_contest_(2020).srt'
df.loc[26, 'Subs_file'] = 'Harry_Potter_and_the_philosophers_stone(2001).srt'
df.loc[80, 'Subs_file'] = 'It_s_a_wonderful_life(1946).srt'
df.loc[71, 'Subs_file'] = 'Liar_liar(1997).srt'
df.loc[78, 'Subs_file'] = 'Moulin_Rouge(2001).srt'
df.loc[46, 'Subs_file'] = 'Mrs_Doubtfire(1993).srt'
df.loc[30, 'Subs_file'] = 'Pirates_of_the_Caribbean(2003).srt'
df.loc[79, 'Subs_file'] = 'The_greatest_showman(2017).srt'
df.loc[84, 'Subs_file'] = 'The_fault_in_our_stars(2014).srt'
df.loc[3, 'Subs_file'] = 'The_invisible_man(2020).srt'
df.loc[49, 'Subs_file'] = 'The_kings_speech(2010).srt'
df.loc[13, 'Subs_file'] = 'The_man_called_Flintstone(1966).srt'
df.loc[35, 'Subs_file'] = 'We_are_the_Millers(2013).srt'

Удалим пропуски.

In [9]:
df = df.dropna(subset=['Subs_file']).reset_index(drop=True)

Добавим текст субтитров в датафрейм.

In [10]:
for index, row in df.iterrows():
    df.loc[index, 'Text'] = ' '.join([s.text for s in
                                      pysrt.open(SUBS_DIR + df.loc[index, 'Subs_file'])])
df = df[[ 'Level', 'Subs_file', 'Text']]
df.head()

Unnamed: 0,Level,Subs_file,Text
0,A2,Forrest_Gump(1994).srt,Hello. My name's Forrest. Forrest Gump. Do you...
1,B1,Forrest_Gump(1994).srt,Hello. My name's Forrest. Forrest Gump. Do you...
2,A2,Finding_Nemo(2003).srt,[Music playing] Advertise your product or bran...
3,A2,Cast_away(2000).srt,Created and Encoded by -- Bokutox -- of www....
4,A2,The_invisible_man(2020).srt,"Adrian? Come on. <i>Zeus, I'm sorry.\nI can't ..."


Очистим текст для анализа.

In [11]:
def preprocess_string(s: str) -> str:
    # перенос строки
    s = s.replace("\n", " ")
    # текст в круглых скобках
    s = re.sub("\((.*?)\)", " ", s)
    # текст в косых скобках
    s = re.sub("<(.*?)>", " ", s)
    # текст в квадратных скобках
    s = re.sub("\[(.*?)]", " ", s)
    # текст заглавными буквами
    s = re.sub("[A-Z]{2,}", " ", s)
    # знаки препинания и цифры
    s = re.sub("[^A-Za-z ']", " ", s)
    # несколько пробельных символов
    s = re.sub("\s+", " ", s)
    # приведем к нижнему регистру
    s = s.lower()
    return s

In [12]:
df['Text_preprocess'] = df['Text'].apply(preprocess_string)
display(df.head())

Unnamed: 0,Level,Subs_file,Text,Text_preprocess
0,A2,Forrest_Gump(1994).srt,Hello. My name's Forrest. Forrest Gump. Do you...,hello my name's forrest forrest gump do you wa...
1,B1,Forrest_Gump(1994).srt,Hello. My name's Forrest. Forrest Gump. Do you...,hello my name's forrest forrest gump do you wa...
2,A2,Finding_Nemo(2003).srt,[Music playing] Advertise your product or bran...,advertise your product or brand here contact ...
3,A2,Cast_away(2000).srt,Created and Encoded by -- Bokutox -- of www....,created and encoded by bokutox of www com the ...
4,A2,The_invisible_man(2020).srt,"Adrian? Come on. <i>Zeus, I'm sorry.\nI can't ...",adrian come on zeus i'm sorry i can't take you...


# Токенизация и стемминг

In [13]:
def tokenize_and_stem(text_preprocess: str)-> list:
    # токенизация
    words_tokenized = [word for word in nltk.word_tokenize(text_preprocess)]
    # стемминг
    stemmer = SnowballStemmer("english")
    stemmed_words = [stemmer.stem(word) for word in words_tokenized]

    return stemmed_words

# Векторизация текста

In [14]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.2,
                                   stop_words=None,
                                   tokenizer=tokenize_and_stem,
                                   use_idf=False,
                                   sublinear_tf=False,
                                   ngram_range=(1,3))

X_tfidf = tfidf_vectorizer.fit_transform(df['Text_preprocess'])

train_tfidf, test_tfidf, y_train, y_test = train_test_split(X_tfidf, df['Level'],
                                                            stratify=df['Level'],
                                                            test_size=0.2, random_state=SEED)

# Получение предсказаний

In [15]:
logreg_cl = LogisticRegression(solver='saga', penalty='l2', C=0.5, random_state=SEED)
logreg_cl.fit(train_tfidf, y_train)
logreg_predict = logreg_cl.predict(test_tfidf)

print('Test Accuracy (LogisticRegression):', accuracy_score(y_test, logreg_predict))
pd.DataFrame(classification_report(y_test, logreg_predict, output_dict=True)).transpose()

Test Accuracy (LogisticRegression): 0.5


Unnamed: 0,precision,recall,f1-score,support
A2,1.0,0.166667,0.285714,6.0
B1,0.470588,1.0,0.64,8.0
B2,0.5,0.166667,0.25,6.0
accuracy,0.5,0.5,0.5,0.5
macro avg,0.656863,0.444444,0.391905,20.0
weighted avg,0.638235,0.5,0.416714,20.0


In [16]:
svc_cl = LinearSVC(penalty='l2', C=0.5, random_state=SEED)
svc_cl.fit(train_tfidf, y_train)
svc_predict = svc_cl.predict(test_tfidf)

print('Test Accuracy (LinearSVC):', accuracy_score(y_test, svc_predict))

Test Accuracy (LinearSVC): 0.45


In [17]:
bayes_cl = MultinomialNB(alpha=1.0)
bayes_cl.fit(train_tfidf, y_train)
bayes_predict = bayes_cl.predict(test_tfidf)

print('Test Accuracy (MultinomialNB):', accuracy_score(y_test, bayes_predict))

Test Accuracy (MultinomialNB): 0.45


In [18]:
kmeans_cl = KNeighborsClassifier(n_neighbors=7, p=2)
kmeans_cl.fit(train_tfidf, y_train)
kmeans_predict = kmeans_cl.predict(test_tfidf)

print('Test Accuracy (KNeighborsClassifier):', accuracy_score(y_test, kmeans_predict))

Test Accuracy (KNeighborsClassifier): 0.4


# Вывод

В работе удалось достичь Accuracy 50% с помощью линейной регрессии и подбора максимума и минимума частот векторизированного текста.