In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
from collections import Counter
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
def process_vector_column(df, col_name, delimiter):
    df[col_name] = df[col_name].apply(lambda x: np.fromstring(x, dtype=float, sep=delimiter) if pd.notna(x) else np.array([]))
    return df

In [3]:
def tokenize_text(text):
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.isalnum() and word not in stop_words]

In [4]:
# Загрузка данных
with open("/content/gb_pr_vectorized.pkl", "rb") as f:
    train_data = pickle.load(f)
with open("/content/vec_sw.pkl", "rb") as f:
    test_data = pickle.load(f)

In [5]:
# Предобработка данных
train_data = process_vector_column(train_data, 'gb_vector', ';')
train_data = process_vector_column(train_data, 'pr_vector', ';')
test_data = process_vector_column(test_data, 'gb_vector', ';')
test_data = process_vector_column(test_data, 'pr_vector', ';')

train_data = train_data.dropna(subset=['gb_vector', 'pr_vector'])
test_data = test_data.dropna(subset=['gb_vector', 'pr_vector'])

In [6]:
# Подготовка данных для модели
X_train_gb = np.stack(train_data['gb_vector'].to_numpy())
X_train_pr = np.stack(train_data['pr_vector'].to_numpy())
X_train = np.concatenate((X_train_gb, X_train_pr), axis=1)

X_test_gb = np.stack(test_data['gb_vector'].to_numpy())
X_test_pr = np.stack(test_data['pr_vector'].to_numpy())
X_test = np.concatenate((X_test_gb, X_test_pr), axis=1)

y_train_goodness = train_data['goodness']
y_train_priority = train_data['priority']

In [7]:
# Масштабирование признаков
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# # Обучение моделей
# model_goodness = LogisticRegression(max_iter=1000)
# model_goodness.fit(X_train, y_train_goodness)

# model_priority = LogisticRegression(max_iter=1000)
# model_priority.fit(X_train, y_train_priority)

In [8]:
# Обучение моделей
model_goodness = RandomForestClassifier()
model_goodness.fit(X_train, y_train_goodness)

model_priority = RandomForestClassifier()
model_priority.fit(X_train, y_train_priority)

In [9]:
# Предсказания на тестовом наборе
y_pred_goodness = model_goodness.predict(X_test)
y_pred_priority = model_priority.predict(X_test)

In [10]:
#Добавление предсказаний в тестовый датасет
test_data['predicted_goodness'] = y_pred_goodness
test_data['predicted_priority'] = y_pred_priority

In [12]:
test_data

Unnamed: 0,character,text,vector,gb_vector,pr_vector,predicted_goodness,predicted_priority
0,threepio,Did you hear that? They've shut down the main...,"[0.34349972, 0.13901646, 0.18637738, 0.2523306...","[1.0, 1.0, 1.0, 1.0, 0.733, 0.496, 0.49, 0.467...","[1.0, 1.0, 1.0, 0.778, 0.692, 0.689, 0.588, 0....",Good,2
1,threepio,What's that? I should have known better than t...,"[-0.0334638, 0.19327067, 0.1383156, 0.21797867...","[0.758, 0.5, 0.5, 0.492, 0.423, 0.422, 0.417, ...","[0.857, 0.733, 0.6, 0.579, 0.55, 0.546, 0.529,...",Good,2
2,threepio,At last! Where have you been? They're heading...,"[0.5643968, -0.09857929, 0.3613604, 0.09454506...","[1.0, 1.0, 0.789, 0.692, 0.538, 0.511, 0.511, ...","[1.0, 1.0, -1.0, 0.682, 0.667, 0.629, 0.614, 0...",Good,2
3,threepio,"Hey, you're not permitted in there. It's rest...","[0.51541805, -0.10249431, -0.061139096, 0.0834...","[1.0, 1.0, 0.789, 0.75, 0.688, 0.6, 0.574, 0.5...","[-1.0, 1.0, 1.0, 0.882, 0.672, 0.615, 0.615, 0...",Good,2
4,threepio,"I'm going to regret this. That's funny, the da...","[-0.20484321, 0.09471313, 0.36953843, 0.017646...","[1.0, 0.667, 0.528, 0.511, 0.5, 0.467, 0.45, 0...","[0.667, 0.655, 0.615, 0.613, 0.613, 0.588, 0.5...",Bad,2
...,...,...,...,...,...,...,...
897,han/pilot,"It's over, Commander. The Rebels have been rou...","[0.049598917, 0.29323563, 0.14532098, 0.116524...","[-1.0, -1.0, 0.667, 0.57, 0.538, 0.451, 0.44, ...","[1.0, -1.0, 0.682, 0.672, 0.634, 0.614, 0.571,...",Neutral,1
898,control room commander,Send three squads to help. Open the back door.,"[-0.07578328, -0.4497584, 0.0550272, -0.350976...","[1.0, 0.692, 0.6, 0.592, 0.422, 0.36, 0.319, 0...","[-1.0, 0.846, 0.63, 0.6, 0.528, 0.515, -0.5, 0...",Bad,2
899,second commander,"Yes, sir.","[0.2413588, 0.07787622, 0.38102102, -0.2044724...","[0.667, 0.176, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[0.528, 0.495, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",Neutral,1
900,lure,But you'll die.,"[0.5533548, 0.31830752, 0.28580698, -0.4791090...","[0.574, 0.478, 0.384, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.616, 0.519, 0.263, 0.0, 0.0, 0.0, 0.0, 0.0,...",Neutral,1


In [13]:
def get_top_n(df, col_name, n=10):
    counts = df.groupby(col_name)['predicted_goodness'].count()
    return counts.sort_values(ascending=False).head(n)

In [35]:
#Вывод результатов
print("Top 10 Good Characters (Test set):")
print(get_top_n(test_data[test_data['predicted_goodness'] == 'Good'], 'character'))

Top 10 Good Characters (Test set):
character
luke          123
han           119
threepio       81
leia           62
vader          36
lando          27
ben            25
biggs          11
red leader     10
yoda            9
Name: predicted_goodness, dtype: int64


In [36]:
#Вывод результатов
print("Top 10 Bad Characters (Test set):")
print(get_top_n(test_data[test_data['predicted_goodness'] == 'Bad'], 'character'))

Top 10 Bad Characters (Test set):
character
luke        24
han         17
threepio    11
ben          8
leia         8
yoda         7
vader        6
emperor      4
creature     4
lando        4
Name: predicted_goodness, dtype: int64


In [16]:
def analyze_character_dialogue(df):
    """
    Анализирует данные о репликах персонажей и возвращает кортежи с именами,
    количеством хороших, плохих и нейтральных реплик, а также доли.

    Args:
        df: Pandas DataFrame с колонками 'character_name' и 'predicted_goodness'.
           'predicted_goodness' должен содержать значения: 'Good', 'Bad', 'Neutral'.

    Returns:
        Кортеж из шести элементов:
        - кортеж имен персонажей
        - кортеж количества хороших реплик
        - кортеж количества плохих реплик
        - кортеж количества нейтральных реплик
        - словарь долей хороших реплик
        - словарь долей плохих реплик

    """

    # Подсчет реплик для каждого персонажа и типа оценки
    counts = df.groupby('character_name')['predicted_goodness'].value_counts().unstack(fill_value=0)

    # Извлечение данных в кортежи
    character_names = list(counts.index)
    good_counts = list(counts['Good'])
    bad_counts = list(counts['Bad'])
    neutral_counts = list(counts['Neutral'])

    # Расчет долей
    goodness_shares = {}
    badness_shares = {}
    for name, good, bad, neutral in zip(character_names, good_counts, bad_counts, neutral_counts):
        total = good + bad + neutral
        if total > 0:
            goodness_shares[name] = good / total
            badness_shares[name] = bad / total
        else:
            goodness_shares[name] = 0
            badness_shares[name] = 0

    return character_names, good_counts, bad_counts, neutral_counts, list(goodness_shares.values()), list(badness_shares.values())

In [17]:
pg = test_data['predicted_goodness'].values.tolist()
len(pg)

902

In [18]:
ch = test_data['character'].values.tolist()
len(ch)

902

In [19]:
data1 = {'character_name': ch,
        'predicted_goodness': pg}
df1 = pd.DataFrame(data1)

In [20]:
result = analyze_character_dialogue(df1)
print("Результаты анализа диалогов:")
print("Имена персонажей:", result[0])
print("Количество хороших реплик:", result[1])
print("Количество плохих реплик:", result[2])
print("Количество нейтральных реплик:", result[3])
print("Доля хороших реплик:", result[4])
print("Доля плохих реплик:", result[5])

Результаты анализа диалогов:
Имена персонажей: ['ackbar', 'anakin', 'announcer', 'assistant officer', 'astro-officer', 'aunt beru', 'bartender', 'base voice', 'ben', 'beru', 'bib', 'biggs', 'boba fett', 'boushh', 'bunker commander', 'camie', 'captain', 'chief', 'chief pilot', 'commander', 'communications officer', 'control officer', 'control room commander', 'controller', 'creature', 'dack', 'deak', 'death star controller', 'death star intercom voice', 'deck officer', 'derlin', 'dodonna', 'emperor', 'first controller', 'first officer', 'first trooper', 'fixer', 'gantry officer', 'general madine', 'gold five', 'gold leader', 'gold two', 'gray leader', 'greedo', 'green leader', 'guard', 'han', 'han and luke', 'han/pilot', 'head controller', 'hobbie', 'human', 'imperial officer', 'imperial soldier', 'intercom voice', 'jabba', 'janson', 'jerjerrod', 'lando', 'leia', 'lieutenant', 'luke', 'lure', 'man', 'massassi intercom voice', 'medical droid', 'mon mothma', 'motti', 'navigator', 'needa',

In [21]:
cols = ['character', 'good_reps', 'bad_reps', 'neu_reps', 'good%', 'bad%']

In [22]:
res_df = pd.DataFrame(list(zip(result[0], result[1], result[2], result[3], result[4], result[5])), columns=cols)

In [23]:
res_df

Unnamed: 0,character,good_reps,bad_reps,neu_reps,good%,bad%
0,ackbar,5,0,0,1.000000,0.000000
1,anakin,1,0,0,1.000000,0.000000
2,announcer,1,0,0,1.000000,0.000000
3,assistant officer,1,0,0,1.000000,0.000000
4,astro-officer,0,0,1,0.000000,0.000000
...,...,...,...,...,...,...
124,woman,1,0,0,1.000000,0.000000
125,woman controller,0,0,1,0.000000,0.000000
126,y-wing pilot,0,0,1,0.000000,0.000000
127,yoda,9,7,1,0.529412,0.411765


In [26]:
# Вывод топ-N добрых/злых персонажей
print("\nTop 20 Bad Characters:")
top1 = res_df.sort_values(by='bad%', ascending=False).head(20)
print(top1)


Top 20 Bad Characters:
                  character  good_reps  bad_reps  neu_reps     good%      bad%
37           gantry officer          0         1         0  0.000000  1.000000
9                      beru          0         1         0  0.000000  1.000000
22   control room commander          0         1         0  0.000000  1.000000
60               lieutenant          0         1         0  0.000000  1.000000
45                    guard          0         1         0  0.000000  1.000000
52         imperial officer          1         1         0  0.500000  0.500000
76                    ozzel          1         1         0  0.500000  0.500000
12                boba fett          1         1         0  0.500000  0.500000
24                 creature          3         4         1  0.375000  0.500000
70               ninedenine          1         1         0  0.500000  0.500000
127                    yoda          9         7         1  0.529412  0.411765
39                gold five 

In [37]:
# Вывод топ-N главных/второстепенных персонажей
print("\nTop 10 Main Characters:")
print(get_top_n(test_data[test_data['predicted_priority'] >= 2], 'character'))


Top 10 Main Characters:
character
luke        100
han          96
threepio     77
leia         50
vader        34
ben          28
lando        21
yoda         14
emperor      13
biggs        11
Name: predicted_goodness, dtype: int64


In [38]:
# Вывод топ-N главных/второстепенных персонажей
print("\nTop 10 Secondary Characters:")
print(get_top_n(test_data[test_data['predicted_priority'] < 2], 'character'))


Top 10 Secondary Characters:
character
luke        65
han         57
leia        26
threepio    24
lando       13
vader       13
ben         11
jabba        7
wedge        6
piett        4
Name: predicted_goodness, dtype: int64


In [29]:
def analyze_character_priority(df):
    """
    Анализирует данные о приоритете персонажей и возвращает списки с информацией.

    Args:
        df: Pandas DataFrame с колонками 'character_name' и 'predicted_priority'.
           'predicted_priority' должен содержать значения от 0 до 3.

    Returns:
        Кортеж из 7 списков:
        - character: имена персонажей
        - important_count: количество важных реплик
        - neutral_count: количество нейтральных реплик
        - unimportant_count: количество неважных реплик
        - important_share: доли важных реплик
        - neutral_share: доли нейтральных реплик
        - unimportant_share: доли неважных реплик
    """

    # Группировка и подсчет
    priority_counts = df.groupby('character_name')['predicted_priority'].value_counts().unstack(fill_value=0)

    # Объединение категорий приоритета
    priority_counts['Important'] = priority_counts[2] + priority_counts[3]
    priority_counts['Neutral'] = priority_counts[1]
    priority_counts['Unimportant'] = priority_counts[0]
    priority_counts = priority_counts[['Important', 'Neutral', 'Unimportant']]

    # Расчет общих количеств и долей
    total_counts = priority_counts.sum(axis=1)
    shares = priority_counts.div(total_counts, axis=0)

    # Преобразование в списки
    characters = list(priority_counts.index)
    important_counts = list(priority_counts['Important'])
    neutral_counts = list(priority_counts['Neutral'])
    unimportant_counts = list(priority_counts['Unimportant'])
    important_shares = list(shares['Important'])
    neutral_shares = list(shares['Neutral'])
    unimportant_shares = list(shares['Unimportant'])

    return (characters, important_counts, neutral_counts, unimportant_counts,
            important_shares, neutral_shares, unimportant_shares)

In [30]:
pp = test_data['predicted_priority'].values.tolist()

In [31]:
data2 = {'character_name': ch,
        'predicted_priority': pp}
df2 = pd.DataFrame(data2)

result2 = analyze_character_priority(df2)
print("Результаты анализа диалогов:")
print("Имена персонажей:", result2[0])
print("Количество важных реплик:", result2[1])
print("Количество нейтральных реплик:", result2[2])
print("Количество неважных реплик:", result2[3])
print("Доля важных реплик:", result2[4])
print("Доля нейтральных реплик:", result2[5])
print("Доля неважных реплик:", result2[5])

Результаты анализа диалогов:
Имена персонажей: ['ackbar', 'anakin', 'announcer', 'assistant officer', 'astro-officer', 'aunt beru', 'bartender', 'base voice', 'ben', 'beru', 'bib', 'biggs', 'boba fett', 'boushh', 'bunker commander', 'camie', 'captain', 'chief', 'chief pilot', 'commander', 'communications officer', 'control officer', 'control room commander', 'controller', 'creature', 'dack', 'deak', 'death star controller', 'death star intercom voice', 'deck officer', 'derlin', 'dodonna', 'emperor', 'first controller', 'first officer', 'first trooper', 'fixer', 'gantry officer', 'general madine', 'gold five', 'gold leader', 'gold two', 'gray leader', 'greedo', 'green leader', 'guard', 'han', 'han and luke', 'han/pilot', 'head controller', 'hobbie', 'human', 'imperial officer', 'imperial soldier', 'intercom voice', 'jabba', 'janson', 'jerjerrod', 'lando', 'leia', 'lieutenant', 'luke', 'lure', 'man', 'massassi intercom voice', 'medical droid', 'mon mothma', 'motti', 'navigator', 'needa',

In [32]:
cols2 = ['character', 'imp_reps', 'neu_reps', 'unimp_reps', 'imp%', 'neu%', 'unimp%']

In [33]:
res_df2 = pd.DataFrame(list(zip(result2[0], result2[1], result2[2], result2[3], result2[4], result2[5], result2[6])), columns=cols2)

In [34]:
res_df2

Unnamed: 0,character,imp_reps,neu_reps,unimp_reps,imp%,neu%,unimp%
0,ackbar,4,1,0,0.800000,0.200000,0.0
1,anakin,1,0,0,1.000000,0.000000,0.0
2,announcer,0,1,0,0.000000,1.000000,0.0
3,assistant officer,1,0,0,1.000000,0.000000,0.0
4,astro-officer,0,1,0,0.000000,1.000000,0.0
...,...,...,...,...,...,...,...
124,woman,1,0,0,1.000000,0.000000,0.0
125,woman controller,0,1,0,0.000000,1.000000,0.0
126,y-wing pilot,0,1,0,0.000000,1.000000,0.0
127,yoda,14,3,0,0.823529,0.176471,0.0


In [39]:
# Вывод топ-N добрых/злых персонажей
print("\nTop 20 Unimportant Characters:")
top_20 = res_df2.sort_values(by='unimp%', ascending=False).head(20)
print(top_20[['character', 'imp_reps', 'unimp_reps', 'imp%', 'unimp%']])


Top 20 Unimportant Characters:
         character  imp_reps  unimp_reps      imp%    unimp%
42     gray leader         0           1  0.000000  1.000000
92       red three         0           1  0.000000  1.000000
117          veers         1           1  0.333333  0.333333
46             han        96           1  0.627451  0.006536
0           ackbar         4           0  0.800000  0.000000
84   rebel fighter         0           0  0.000000  0.000000
85   rebel officer         1           0  1.000000  0.000000
86     rebel pilot         0           0  0.000000  0.000000
87      red eleven         1           0  1.000000  0.000000
88      red leader         9           0  0.692308  0.000000
82         porkins         1           0  1.000000  0.000000
89        red nine         1           0  1.000000  0.000000
90       red seven         1           0  1.000000  0.000000
91         red ten         3           0  1.000000  0.000000
93         red two         0           0  0.000000  0

In [40]:
# Вывод топ-N добрых/злых слов (требуется дополнительная предобработка текста)
def get_top_words(df, goodness_label, n=10):
    text_col = df[df['predicted_goodness'] == goodness_label]['text']
    words = []
    for text in text_col:
        words.extend(text.lower().split())  # простейшая токенизация
    word_counts = Counter(words)
    return word_counts.most_common(n)

In [41]:
print("\nTop Good Words:")
print(get_top_words(test_data, 'Good'))


Top Good Words:
[('the', 649), ('you', 522), ('i', 495), ('to', 478), ('a', 328), ('of', 271), ('your', 203), ('is', 176), ('and', 172), ('be', 168)]


In [42]:
print("\nTop Bad Words:")
print(get_top_words(test_data, 'Bad'))


Top Bad Words:
[('you', 121), ('the', 102), ('i', 85), ('to', 67), ('a', 38), ('will', 35), ('your', 34), ('be', 31), ('is', 29), ('and', 27)]
