# Подготовка окружения

## Установка пакетов и импорт зависимостей

In [None]:
!pip install pandas



In [None]:
import sys
import os
import numpy as np
import pandas as pd
from getpass import getpass

from google.colab import drive

from joblib import dump, load

## Настройка окружения

In [None]:
# Mount Google Drive at /content/drive; the workspace path used by clone_pull_github_src lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


## Клонирование исходных кодов проекта

In [None]:
def clone_pull_github_src(pull: bool = True):
    """
    Клонирует или обновляет репозиторий GitHub в локальный каталог для последующей работы.

    Parameters:
    pull (bool): Указывает, следует ли выполнять pull для существующего репозитория. Если True, выполняется git pull.
                 Если False, репозиторий клонируется в указанный каталог.

    Returns:
    Constants: Экземпляр класса Constants, содержащий константы проекта.
    """
    WORKSPACE_PATH = '/content/drive/MyDrive/docs/keepForever/mipt/nlp/hw1_4sem/'
    WORKSPACE_TMP = WORKSPACE_PATH + '/tmp/'
    GIT_HUB_PROJECT_PATH = WORKSPACE_PATH + 'code/'

    token = getpass('Введите GitHub token: ')
    repo_url = 'https://github.com/km-mipt-nlp-gen/hw1.git'
    repo_url_with_token = repo_url.replace('https://', f'https://{token}@')

    os.chdir(GIT_HUB_PROJECT_PATH)

    if pull:
        !git pull origin main
    else:
        !git clone {repo_url_with_token} "$GIT_HUB_PROJECT_PATH"

    del token

    sys.path.append(f"{GIT_HUB_PROJECT_PATH}/web_app/src/")
    from constants_module import Constants

    return Constants()


# Update the existing checkout (pull=True is the default) and keep the returned Constants instance for the cells below.
constants = clone_pull_github_src()

Введите GitHub token: ··········
From https://github.com/km-mipt-nlp-gen/hw1
 * branch            main       -> FETCH_HEAD
Already up to date.


# Подготовка набора данных

## Создание набора с контекстом и текущей репликой

In [None]:
# Load the raw dataset from the CSV path defined in the project constants.
simps_df = pd.read_csv(constants.THE_SIMPS_CSV_PATH)

In [None]:
# Print the column / dtype / non-null summary, then show the first rows as the cell output.
simps_df.info()
simps_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158266 entries, 0 to 158265
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  158266 non-null  int64  
 1   episode_id          158266 non-null  int64  
 2   number              158266 non-null  int64  
 3   raw_text            158266 non-null  object 
 4   timestamp_in_ms     158266 non-null  int64  
 5   speaking_line       158266 non-null  bool   
 6   character_id        140740 non-null  float64
 7   location_id         157859 non-null  float64
 8   raw_character_text  140740 non-null  object 
 9   raw_location_text   157859 non-null  object 
 10  spoken_words        132103 non-null  object 
 11  normalized_text     132078 non-null  object 
 12  word_count          132103 non-null  float64
dtypes: bool(1), float64(3), int64(4), object(5)
memory usage: 14.6+ MB


Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,86263,298,245,"Seymour Skinner: I'm sorry, everyone. I perpetrated this charade. The boy can read, and Edna Krabappel is the greatest teacher I have ever known. If she can teach me to love, then she can teach anything.",1153000,True,3.0,4.0,Seymour Skinner,Auditorium,"I'm sorry, everyone. I perpetrated this charade. The boy can read, and Edna Krabappel is the greatest teacher I have ever known. If she can teach me to love, then she can teach anything.",im sorry everyone i perpetrated this charade the boy can read and edna krabappel is the greatest teacher i have ever known if she can teach me to love then she can teach anything,34.0
1,86264,298,246,Audience: Awwww...,1165000,True,321.0,4.0,Audience,Auditorium,Awwww...,awwww,1.0
2,86265,298,247,Agnes Skinner: Seymour! Your feelings are ugly and wrong!,1167000,True,192.0,4.0,Agnes Skinner,Auditorium,Seymour! Your feelings are ugly and wrong!,seymour your feelings are ugly and wrong,7.0
3,86266,298,248,Little Richard: Are you gonna listen to her?,1171000,True,3745.0,4.0,Little Richard,Auditorium,Are you gonna listen to her?,are you gonna listen to her,6.0
4,86267,298,249,"Seymour Skinner: Not on your life, Reverend. From now on, I'm my own man.",1173000,True,3.0,4.0,Seymour Skinner,Auditorium,"Not on your life, Reverend. From now on, I'm my own man.",not on your life reverend from now on im my own man,12.0


In [None]:
# Ten largest per-character row counts (value_counts returns counts in descending order).
print(list(simps_df[constants.CHAR_ID_COL].value_counts().iloc[:10]))

[30110, 14264, 13968, 11639, 3207, 2863, 2443, 2146, 1957, 1911]


In [None]:
# Every combination of the sort-key columns must identify exactly one row, otherwise the order of
# lines inside a dialogue is ambiguous. duplicated() inspects all rows, whereas indexing the
# value_counts() result with [1] only looks at the entries stored under the key 1.
assert not simps_df.duplicated(subset=constants.SIMPS_DF_SORT_BY_COLS).any(), 'Найдены дубликаты значений для группировки диалогов'

In [None]:
# Sort the rows by the configured key columns; the shift()-based steps later in the notebook rely on this row order.
simps_df = simps_df.sort_values(by=constants.SIMPS_DF_SORT_BY_COLS)

In [None]:
# Show the first 50 rows of the sorted frame.
simps_df.head(50)

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
71222,1,1,0,(Street: ext. street - establishing - night),8000,False,,1.0,,Street,,,
71223,2,1,1,(Car: int. car - night),8000,False,,2.0,,Car,,,
71224,3,1,2,"Marge Simpson: Ooo, careful, Homer.",8000,True,1.0,2.0,Marge Simpson,Car,"Ooo, careful, Homer.",ooo careful homer,3.0
71225,4,1,3,Homer Simpson: There's no time to be careful.,10000,True,2.0,2.0,Homer Simpson,Car,There's no time to be careful.,theres no time to be careful,6.0
71226,5,1,4,Homer Simpson: We're late.,10000,True,2.0,2.0,Homer Simpson,Car,We're late.,were late,2.0
71227,6,1,5,(Springfield Elementary School: Ext. springfield elementary school - establishing - night),24000,False,,3.0,,Springfield Elementary School,,,
71228,7,1,6,(Auditorium: int. auditorium - night),24000,False,,4.0,,Auditorium,,,
71229,8,1,7,"Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...",24000,True,1.0,4.0,Marge Simpson,Auditorium,"Sorry, Excuse us. Pardon me...",sorry excuse us pardon me,5.0
71230,9,1,8,"Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",26000,True,2.0,4.0,Homer Simpson,Auditorium,"Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",hey norman hows it going so you got dragged down here too heh heh how ya doing fred excuse me fred,21.0
93821,10,1,9,Homer Simpson: Pardon my galoshes. (CHUCKLES),34000,True,2.0,4.0,Homer Simpson,Auditorium,Pardon my galoshes.,pardon my galoshes,3.0


In [None]:
# Keep only spoken lines that have a normalized text value.
simps_df = simps_df[
    simps_df[constants.NORM_TEXT_COL].notna()
    & (simps_df[constants.SPEAKING_LINE_COL] == True)
]

In [None]:
# Number of rows currently in the frame.
len(simps_df)

132078

In [None]:
# Count rows that carry constants.LISA_ID as character id but whose raw character text is not exactly constants.LISA_FULL_NAME.
len(simps_df[(simps_df[constants.RAW_CHAR_TEXT_COL] != constants.LISA_FULL_NAME) & (simps_df[constants.CHAR_ID_COL] == constants.LISA_ID)])

126

In [None]:
# Rows with Lisa's character id may use a name other than the full one, but the raw character
# text must still contain constants.LISA_LC_NAME (case-insensitive; missing text counts as no match).
assert not (
    (simps_df[constants.CHAR_ID_COL] == constants.LISA_ID)
    & (simps_df[constants.RAW_CHAR_TEXT_COL] != constants.LISA_FULL_NAME)
    & ~simps_df[constants.RAW_CHAR_TEXT_COL].str.contains(constants.LISA_LC_NAME, case=False, na=False)
).any(), 'Текстовое имя целевого персонажа не соответствует числовому коду'

In [None]:
# The normalized text column must not contain missing values at this point.
assert simps_df[constants.NORM_TEXT_COL].notna().all(), 'Найдены NA значения в тексте персонажей'

In [None]:
# Print the column / dtype / non-null summary of the current frame.
simps_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132078 entries, 71224 to 70248
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  132078 non-null  int64  
 1   episode_id          132078 non-null  int64  
 2   number              132078 non-null  int64  
 3   raw_text            132078 non-null  object 
 4   timestamp_in_ms     132078 non-null  int64  
 5   speaking_line       132078 non-null  bool   
 6   character_id        132076 non-null  float64
 7   location_id         131701 non-null  float64
 8   raw_character_text  132076 non-null  object 
 9   raw_location_text   131701 non-null  object 
 10  spoken_words        132078 non-null  object 
 11  normalized_text     132078 non-null  object 
 12  word_count          132078 non-null  float64
dtypes: bool(1), float64(3), int64(4), object(5)
memory usage: 13.2+ MB


In [None]:
# Show the first 100 rows of the current frame.
simps_df.head(100)

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
71224,3,1,2,"Marge Simpson: Ooo, careful, Homer.",8000,True,1.0,2.0,Marge Simpson,Car,"Ooo, careful, Homer.",ooo careful homer,3.0
71225,4,1,3,Homer Simpson: There's no time to be careful.,10000,True,2.0,2.0,Homer Simpson,Car,There's no time to be careful.,theres no time to be careful,6.0
71226,5,1,4,Homer Simpson: We're late.,10000,True,2.0,2.0,Homer Simpson,Car,We're late.,were late,2.0
71229,8,1,7,"Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...",24000,True,1.0,4.0,Marge Simpson,Auditorium,"Sorry, Excuse us. Pardon me...",sorry excuse us pardon me,5.0
71230,9,1,8,"Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",26000,True,2.0,4.0,Homer Simpson,Auditorium,"Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",hey norman hows it going so you got dragged down here too heh heh how ya doing fred excuse me fred,21.0
93821,10,1,9,Homer Simpson: Pardon my galoshes. (CHUCKLES),34000,True,2.0,4.0,Homer Simpson,Auditorium,Pardon my galoshes.,pardon my galoshes,3.0
71231,11,1,10,"Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",44000,True,3.0,4.0,Seymour Skinner,Auditorium,"Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",wasnt that wonderful and now santas of many lands as presented by the entire second grade class,17.0
71232,12,1,11,Marge Simpson: Oh... Lisa's class.,55000,True,1.0,4.0,Marge Simpson,Auditorium,Oh... Lisa's class.,oh lisas class,3.0
71233,13,1,12,"JANEY: (SHY AND NERVOUS) Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.",57000,True,4.0,4.0,JANEY,Auditorium,"Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.",frohlich weihnachten -- thats german for merry christmas in germany santas servant ruprecht gives presents to good children and whipping rods to the parents of bad ones,27.0
71234,14,1,13,"Todd Flanders: Meri Kurimasu. I am Hotseiosha, a Japanese priest who acts like Santa Claus. I have eyes in the back of my head so children better behave when I'm nearby.",75000,True,5.0,4.0,Todd Flanders,Auditorium,"Meri Kurimasu. I am Hotseiosha, a Japanese priest who acts like Santa Claus. I have eyes in the back of my head so children better behave when I'm nearby.",meri kurimasu i am hotseiosha a japanese priest who acts like santa claus i have eyes in the back of my head so children better behave when im nearby,29.0


In [None]:
# Attach to every row the attributes of the row directly above it (character id, raw text,
# episode id, location id); shift(1) leaves NaN in the first row.
simps_df[constants.PREMISE_CHAR_ID_COL] = simps_df[constants.CHAR_ID_COL].shift(1)
simps_df[constants.PREMISE_COL] = simps_df[constants.RAW_TEXT_COL].shift(1)
simps_df[constants.PREV_EPISODE_ID_COL] = simps_df[constants.EPISODE_ID_COL].shift(1)
simps_df[constants.PREV_LOC_ID_COL] = simps_df[constants.LOC_ID_COL].shift(1)

# Drop rows whose premise character id, premise text or previous location id is missing.
# .copy() makes the result an independent frame, so the column assignments below no longer
# trigger pandas' SettingWithCopyWarning.
simps_df = simps_df[
    simps_df[constants.PREMISE_CHAR_ID_COL].notna()
    & simps_df[constants.PREMISE_COL].notna()
    & simps_df[constants.PREV_LOC_ID_COL].notna()
].copy()

# Flags telling whether a row shares the location / episode with the row it was paired with above.
simps_df.loc[:, constants.SAME_LOC_ID_COL] = (simps_df[constants.LOC_ID_COL] == simps_df[constants.PREV_LOC_ID_COL])
simps_df.loc[:, constants.SAME_EPISODE_ID_COL] = (simps_df[constants.EPISODE_ID_COL] == simps_df[constants.PREV_EPISODE_ID_COL])
simps_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simps_df.loc[:, constants.SAME_LOC_ID_COL] = (simps_df[constants.LOC_ID_COL] == simps_df[constants.PREV_LOC_ID_COL])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simps_df.loc[:, constants.SAME_EPISODE_ID_COL] = (simps_df[constants.EPISODE_ID_COL] == simps_df[constants.PREV_EPISODE_ID_COL])


Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,premise_char_id,premise,prev_episode_id,prev_location_id,same_location_id_dialog,same_episode_id
71225,4,1,3,Homer Simpson: There's no time to be careful.,10000,True,2.0,2.0,Homer Simpson,Car,There's no time to be careful.,theres no time to be careful,6.0,1.0,"Marge Simpson: Ooo, careful, Homer.",1.0,2.0,True,True
71226,5,1,4,Homer Simpson: We're late.,10000,True,2.0,2.0,Homer Simpson,Car,We're late.,were late,2.0,2.0,Homer Simpson: There's no time to be careful.,1.0,2.0,True,True
71229,8,1,7,"Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...",24000,True,1.0,4.0,Marge Simpson,Auditorium,"Sorry, Excuse us. Pardon me...",sorry excuse us pardon me,5.0,2.0,Homer Simpson: We're late.,1.0,2.0,False,True
71230,9,1,8,"Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",26000,True,2.0,4.0,Homer Simpson,Auditorium,"Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",hey norman hows it going so you got dragged down here too heh heh how ya doing fred excuse me fred,21.0,1.0,"Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...",1.0,4.0,True,True
93821,10,1,9,Homer Simpson: Pardon my galoshes. (CHUCKLES),34000,True,2.0,4.0,Homer Simpson,Auditorium,Pardon my galoshes.,pardon my galoshes,3.0,2.0,"Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",1.0,4.0,True,True
71231,11,1,10,"Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",44000,True,3.0,4.0,Seymour Skinner,Auditorium,"Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",wasnt that wonderful and now santas of many lands as presented by the entire second grade class,17.0,2.0,Homer Simpson: Pardon my galoshes. (CHUCKLES),1.0,4.0,True,True
71232,12,1,11,Marge Simpson: Oh... Lisa's class.,55000,True,1.0,4.0,Marge Simpson,Auditorium,Oh... Lisa's class.,oh lisas class,3.0,3.0,"Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",1.0,4.0,True,True
71233,13,1,12,"JANEY: (SHY AND NERVOUS) Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.",57000,True,4.0,4.0,JANEY,Auditorium,"Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.",frohlich weihnachten -- thats german for merry christmas in germany santas servant ruprecht gives presents to good children and whipping rods to the parents of bad ones,27.0,1.0,Marge Simpson: Oh... Lisa's class.,1.0,4.0,True,True
71234,14,1,13,"Todd Flanders: Meri Kurimasu. I am Hotseiosha, a Japanese priest who acts like Santa Claus. I have eyes in the back of my head so children better behave when I'm nearby.",75000,True,5.0,4.0,Todd Flanders,Auditorium,"Meri Kurimasu. I am Hotseiosha, a Japanese priest who acts like Santa Claus. I have eyes in the back of my head so children better behave when I'm nearby.",meri kurimasu i am hotseiosha a japanese priest who acts like santa claus i have eyes in the back of my head so children better behave when im nearby,29.0,4.0,"JANEY: (SHY AND NERVOUS) Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.",1.0,4.0,True,True
71235,15,1,14,"Dewey Largo: And now, presenting Lisa Simpson, as Tawanga, the Santa Claus of the South Seas.",91000,True,6.0,4.0,Dewey Largo,Auditorium,"And now, presenting Lisa Simpson, as Tawanga, the Santa Claus of the South Seas.",and now presenting lisa simpson as tawanga the santa claus of the south seas,14.0,5.0,"Todd Flanders: Meri Kurimasu. I am Hotseiosha, a Japanese priest who acts like Santa Claus. I have eyes in the back of my head so children better behave when I'm nearby.",1.0,4.0,True,True


## Обогащение реплик контекстом (из предыдущих реплик)

In [None]:
# Build the dialogue-context column. A finished value lists the preceding lines, oldest first:
#   R_3: "..."; R_2: "..."; R_1: "..."
# NOTE(review): R_1 is taken from shift(1) on the current frame, while the same-episode /
# same-location flags were computed earlier; if rows were removed in between, R_1 can differ
# from constants.PREMISE_COL for some rows - confirm this is intended.
simps_df[constants.PREMISE_UPDATED_COL] = 'R_1: "' + simps_df[constants.RAW_TEXT_COL].shift(1) + '"'

# Older lines may only be prepended while the whole chain stays inside one episode and one
# location, so the mask starts from the lag-1 flags and is narrowed once per additional lag.
episode_match = simps_df[constants.SAME_EPISODE_ID_COL] == True
location_match = simps_df[constants.SAME_LOC_ID_COL] == True
conditions_met = episode_match & location_match

for current_lag in range(2, constants.LAG_COUNT + 1):
    # historical columns for this lag: text, episode id and location id of the line current_lag rows above
    simps_df[f'PREMISE_LAG_{current_lag}_COL'] = simps_df[constants.RAW_TEXT_COL].shift(current_lag)
    simps_df[f'PREV_EPISODE_ID_LAG_{current_lag}_COL'] = simps_df[constants.EPISODE_ID_COL].shift(current_lag)
    simps_df[f'PREV_LOC_ID_LAG_{current_lag}_COL'] = simps_df[constants.LOC_ID_COL].shift(current_lag)

    # The mask already covers lags 1..current_lag-1; add the check for the current lag only.
    # Comparisons against the NaN values produced by shift() evaluate to False.
    conditions_met = (
        conditions_met
        & (simps_df[constants.EPISODE_ID_COL] == simps_df[f'PREV_EPISODE_ID_LAG_{current_lag}_COL'])
        & (simps_df[constants.LOC_ID_COL] == simps_df[f'PREV_LOC_ID_LAG_{current_lag}_COL'])
    )

    # prepend the older line to the context of every row that passed all checks so far
    simps_df.loc[conditions_met, constants.PREMISE_UPDATED_COL] = (
        'R_' + str(current_lag) + ': "'
        + simps_df[f'PREMISE_LAG_{current_lag}_COL'].fillna('')
        + '"; '
        + simps_df[constants.PREMISE_UPDATED_COL].fillna('')
    )

if constants.LAG_COUNT >= 2:
    # Replace the remaining NaN values with empty strings, once, after all lags are processed.
    # NOTE(review): this applies to every column of the frame, numeric ones included,
    # which turns any numeric column that still holds NaN into an object column.
    simps_df = simps_df.fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simps_df[constants.PREMISE_UPDATED_COL] = ''


In [None]:
# Show the first 30 rows, including the context and lag columns.
simps_df.head(30)

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,premise_char_id,premise,prev_episode_id,prev_location_id,same_location_id_dialog,same_episode_id,premise_updated_col,PREMISE_LAG_2_COL,PREV_EPISODE_ID_LAG_2_COL,PREV_LOC_ID_LAG_2_COL,PREMISE_LAG_3_COL,PREV_EPISODE_ID_LAG_3_COL,PREV_LOC_ID_LAG_3_COL
71225,4,1,3,Homer Simpson: There's no time to be careful.,10000,True,2.0,2.0,Homer Simpson,Car,There's no time to be careful.,theres no time to be careful,6.0,1.0,"Marge Simpson: Ooo, careful, Homer.",1.0,2.0,True,True,,,,,,,
71226,5,1,4,Homer Simpson: We're late.,10000,True,2.0,2.0,Homer Simpson,Car,We're late.,were late,2.0,2.0,Homer Simpson: There's no time to be careful.,1.0,2.0,True,True,"R_1: ""Homer Simpson: There's no time to be careful.""",,,,,,
71229,8,1,7,"Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...",24000,True,1.0,4.0,Marge Simpson,Auditorium,"Sorry, Excuse us. Pardon me...",sorry excuse us pardon me,5.0,2.0,Homer Simpson: We're late.,1.0,2.0,False,True,"R_1: ""Homer Simpson: We're late.""",Homer Simpson: There's no time to be careful.,1.0,2.0,,,
71230,9,1,8,"Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",26000,True,2.0,4.0,Homer Simpson,Auditorium,"Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",hey norman hows it going so you got dragged down here too heh heh how ya doing fred excuse me fred,21.0,1.0,"Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...",1.0,4.0,True,True,"R_1: ""Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...""",Homer Simpson: We're late.,1.0,2.0,Homer Simpson: There's no time to be careful.,1.0,2.0
93821,10,1,9,Homer Simpson: Pardon my galoshes. (CHUCKLES),34000,True,2.0,4.0,Homer Simpson,Auditorium,Pardon my galoshes.,pardon my galoshes,3.0,2.0,"Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",1.0,4.0,True,True,"R_2: ""Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...""; R_1: ""Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.""","Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...",1.0,4.0,Homer Simpson: We're late.,1.0,2.0
71231,11,1,10,"Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",44000,True,3.0,4.0,Seymour Skinner,Auditorium,"Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",wasnt that wonderful and now santas of many lands as presented by the entire second grade class,17.0,2.0,Homer Simpson: Pardon my galoshes. (CHUCKLES),1.0,4.0,True,True,"R_3: ""Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...""; R_2: ""Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.""; R_1: ""Homer Simpson: Pardon my galoshes. (CHUCKLES)""","Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",1.0,4.0,"Marge Simpson: (HUSHED VOICE) Sorry, Excuse us. Pardon me...",1.0,4.0
71232,12,1,11,Marge Simpson: Oh... Lisa's class.,55000,True,1.0,4.0,Marge Simpson,Auditorium,Oh... Lisa's class.,oh lisas class,3.0,3.0,"Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",1.0,4.0,True,True,"R_3: ""Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.""; R_2: ""Homer Simpson: Pardon my galoshes. (CHUCKLES)""; R_1: ""Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.""",Homer Simpson: Pardon my galoshes. (CHUCKLES),1.0,4.0,"Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. How's it going? So you got dragged down here, too... heh, heh. How ya doing, Fred? Excuse me, Fred.",1.0,4.0
71233,13,1,12,"JANEY: (SHY AND NERVOUS) Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.",57000,True,4.0,4.0,JANEY,Auditorium,"Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.",frohlich weihnachten -- thats german for merry christmas in germany santas servant ruprecht gives presents to good children and whipping rods to the parents of bad ones,27.0,1.0,Marge Simpson: Oh... Lisa's class.,1.0,4.0,True,True,"R_3: ""Homer Simpson: Pardon my galoshes. (CHUCKLES)""; R_2: ""Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.""; R_1: ""Marge Simpson: Oh... Lisa's class.""","Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",1.0,4.0,Homer Simpson: Pardon my galoshes. (CHUCKLES),1.0,4.0
71234,14,1,13,"Todd Flanders: Meri Kurimasu. I am Hotseiosha, a Japanese priest who acts like Santa Claus. I have eyes in the back of my head so children better behave when I'm nearby.",75000,True,5.0,4.0,Todd Flanders,Auditorium,"Meri Kurimasu. I am Hotseiosha, a Japanese priest who acts like Santa Claus. I have eyes in the back of my head so children better behave when I'm nearby.",meri kurimasu i am hotseiosha a japanese priest who acts like santa claus i have eyes in the back of my head so children better behave when im nearby,29.0,4.0,"JANEY: (SHY AND NERVOUS) Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.",1.0,4.0,True,True,"R_3: ""Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.""; R_2: ""Marge Simpson: Oh... Lisa's class.""; R_1: ""JANEY: (SHY AND NERVOUS) Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.""",Marge Simpson: Oh... Lisa's class.,1.0,4.0,"Seymour Skinner: (UNREHEARSED) Wasn't that wonderful? And now, ""Santas of Many Lands,"" as presented by the entire second grade class.",1.0,4.0
71235,15,1,14,"Dewey Largo: And now, presenting Lisa Simpson, as Tawanga, the Santa Claus of the South Seas.",91000,True,6.0,4.0,Dewey Largo,Auditorium,"And now, presenting Lisa Simpson, as Tawanga, the Santa Claus of the South Seas.",and now presenting lisa simpson as tawanga the santa claus of the south seas,14.0,5.0,"Todd Flanders: Meri Kurimasu. I am Hotseiosha, a Japanese priest who acts like Santa Claus. I have eyes in the back of my head so children better behave when I'm nearby.",1.0,4.0,True,True,"R_3: ""Marge Simpson: Oh... Lisa's class.""; R_2: ""JANEY: (SHY AND NERVOUS) Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.""; R_1: ""Todd Flanders: Meri Kurimasu. I am Hotseiosha, a Japanese priest who acts like Santa Claus. I have eyes in the back of my head so children better behave when I'm nearby.""","JANEY: (SHY AND NERVOUS) Frohlich weihnachten -- that's German for Merry Christmas. In Germany, Santa's servant Ruprecht gives presents to good children and whipping rods to the parents of bad ones.",1.0,4.0,Marge Simpson: Oh... Lisa's class.,1.0,4.0


In [None]:
# Build the target column: a row gets constants.VALID_QA_MARK when the preceding line comes from
# the same episode and the same location (e.g. a car) and the reply itself belongs to the target
# character (Lisa Simpson, constants.LISA_ID); every other row gets constants.INVALID_QA_MARK.
simps_df.loc[:, constants.LABEL_COL] = np.where(
    (simps_df[constants.SAME_EPISODE_ID_COL] == True)
    & (simps_df[constants.SAME_LOC_ID_COL] == True)
    & (simps_df[constants.CHAR_ID_COL] == constants.LISA_ID),
    constants.VALID_QA_MARK,
    constants.INVALID_QA_MARK,
)

In [None]:
# Number of rows per label value.
simps_df[constants.LABEL_COL].value_counts()

0    122076
1      9622
Name: label, dtype: int64

In [None]:
# Balance the classes: keep every positive row and draw an equally sized random sample of negatives.
# The label values come from the same constants that were used to build the label column.
valid_df = simps_df[simps_df[constants.LABEL_COL] == constants.VALID_QA_MARK]
invalid_df = simps_df[simps_df[constants.LABEL_COL] == constants.INVALID_QA_MARK]

num_ones = len(valid_df)
# sample() without replacement raises when asked for more rows than exist, so cap the request
# at the number of available negatives.
simps_df = pd.concat([
    valid_df,
    invalid_df.sample(n=min(num_ones, len(invalid_df)), random_state=constants.SEED),
]).reset_index(drop=True)
# release the intermediate frames
valid_df = None
invalid_df = None

In [None]:
# Show the first rows of the balanced frame.
simps_df.head()

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,premise_char_id,premise,prev_episode_id,prev_location_id,same_location_id_dialog,same_episode_id,premise_updated_col,PREMISE_LAG_2_COL,PREV_EPISODE_ID_LAG_2_COL,PREV_LOC_ID_LAG_2_COL,PREMISE_LAG_3_COL,PREV_EPISODE_ID_LAG_3_COL,PREV_LOC_ID_LAG_3_COL,label
0,36,1,35,"Lisa Simpson: But I really want a pony and I've been really, really good this year.",243000,True,9.0,5.0,Lisa Simpson,Simpson Home,"But I really want a pony and I've been really, really good this year.",but i really want a pony and ive been really really good this year,14.0,1.0,"Marge Simpson: A pony? Oh, Lisa. You've asked for that for the last three years and I keep telling you Santa can't fit a pony into his sleigh. Can't you take a hint?",1.0,5.0,True,True,"R_3: ""Marge Simpson: All right, children. Let me have those letters. I'll send them to Santa's workshop at the North Pole.""; R_2: ""Bart Simpson: Oh, please. (TO LISA) There's only one fat guy that brings us presents and his name ain't Santa.""; R_1: ""Marge Simpson: A pony? Oh, Lisa. You've asked for that for the last three years and I keep telling you Santa can't fit a pony into his sleigh. Can't you take a hint?""","Bart Simpson: Oh, please. (TO LISA) There's only one fat guy that brings us presents and his name ain't Santa.",1.0,5.0,"Marge Simpson: All right, children. Let me have those letters. I'll send them to Santa's workshop at the North Pole.",1.0,5.0,1
1,62,1,61,"Lisa Simpson: Nice try, Dad.",311000,True,9.0,5.0,Lisa Simpson,Simpson Home,"Nice try, Dad.",nice try dad,3.0,2.0,"Homer Simpson: What do you think, kids?",1.0,5.0,True,True,"R_3: ""Bart Simpson: Good one, Dad.""; R_2: ""Homer Simpson: Okay, kids. Prepare to be dazzled. (CALLING) Marge! Turn on the juice!""; R_1: ""Homer Simpson: What do you think, kids?""","Homer Simpson: Okay, kids. Prepare to be dazzled. (CALLING) Marge! Turn on the juice!",1.0,5.0,"Bart Simpson: Good one, Dad.",1.0,5.0,1
2,68,1,67,Lisa Simpson: (ADMIRING) Ooh.,324000,True,9.0,5.0,Lisa Simpson,Simpson Home,Ooh.,ooh,1.0,12.0,Mechanical Santa: Ho ho ho. Ho ho ho. Ho ho ho. Ho ho ho.,1.0,5.0,True,True,"R_3: ""Homer Simpson: What is it, Flanders?""; R_2: ""Ned Flanders: Do you think this looks okay?""; R_1: ""Mechanical Santa: Ho ho ho. Ho ho ho. Ho ho ho. Ho ho ho.""",Ned Flanders: Do you think this looks okay?,1.0,5.0,"Homer Simpson: What is it, Flanders?",1.0,5.0,1
3,73,1,72,Lisa Simpson: I do!,335000,True,9.0,6.0,Lisa Simpson,KITCHEN,I do!,i do,2.0,1.0,"Marge Simpson: Kids, you want to go Christmas shopping?",1.0,6.0,True,True,"R_1: ""Marge Simpson: Kids, you want to go Christmas shopping?""","Homer Simpson: (TO BART AND LISA) It's too bright. (MUTTERING TO HIMSELF) That Flanders, what a big show-off.",1.0,5.0,Bart Simpson: Oh. Neat-o.,1.0,5.0,1
4,120,1,119,Lisa Simpson: But Mom had to spend all the Christmas money having it surgically removed.,494000,True,9.0,5.0,Lisa Simpson,Simpson Home,But Mom had to spend all the Christmas money having it surgically removed.,but mom had to spend all the christmas money having it surgically removed,13.0,8.0,Bart Simpson: Ow! Quit it. It used to be a real boss tattoo.,1.0,5.0,True,True,"R_3: ""Bart Simpson: Ow! Quit it!""; R_2: ""Homer Simpson: Hey! What's with this?""; R_1: ""Bart Simpson: Ow! Quit it. It used to be a real boss tattoo.""",Homer Simpson: Hey! What's with this?,1.0,5.0,Bart Simpson: Ow! Quit it!,1.0,5.0,1


In [None]:
# Show the last rows of the balanced frame.
simps_df.tail()

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,premise_char_id,premise,prev_episode_id,prev_location_id,same_location_id_dialog,same_episode_id,premise_updated_col,PREMISE_LAG_2_COL,PREV_EPISODE_ID_LAG_2_COL,PREV_LOC_ID_LAG_2_COL,PREMISE_LAG_3_COL,PREV_EPISODE_ID_LAG_3_COL,PREV_LOC_ID_LAG_3_COL,label
19239,80864,280,14,Pilot: Are you okay?,137000,True,526.0,2344.0,Pilot,BANGED-UP HELICOPTER,Are you okay?,are you okay,3.0,526.0,"Pilot: (GASPS, THEN MESMERIZED) It's beautiful.",280.0,672.0,False,True,"R_1: ""Pilot: (GASPS, THEN MESMERIZED) It's beautiful.""",Helicopter Cop: We've recovered the flame.,280.0,672.0,"Marge Simpson: (SHAKES HEAD, WEARY NOISE) Every four years.",280.0,151.0,0
19240,50398,179,49,"Homer Simpson: (WHINY) Oh, I don't want to go to New York City.",296000,True,2.0,5.0,Homer Simpson,Simpson Home,"Oh, I don't want to go to New York City.",oh i dont want to go to new york city,10.0,9.0,"Lisa Simpson: ""...If you do not remedy this malparkage within 72 hours, your car will be thrown into the East River at your expense.""",179.0,5.0,True,True,"R_3: ""Lisa Simpson: (READING) ""Dear motorist, your vehicle is illegally parked in the borough of Manhattan...""""; R_2: ""Homer Simpson: (HOPEFUL) My vehicle!""; R_1: ""Lisa Simpson: ""...If you do not remedy this malparkage within 72 hours, your car will be thrown into the East River at your expense.""""",Homer Simpson: (HOPEFUL) My vehicle!,179.0,5.0,"Lisa Simpson: (READING) ""Dear motorist, your vehicle is illegally parked in the borough of Manhattan...""",179.0,5.0,0
19241,74472,259,22,Milhouse Van Houten: My doctor says I'm not supposed to go on sprees.,175000,True,25.0,270.0,Milhouse Van Houten,Springfield Street,My doctor says I'm not supposed to go on sprees.,my doctor says im not supposed to go on sprees,10.0,8.0,"Bart Simpson: Milhouse, my friend, you and I are going on a spending spree.",259.0,270.0,True,True,"R_2: ""Milhouse Van Houten: Can I see the fifty, Bart? Can I? Huh?""; R_1: ""Bart Simpson: Milhouse, my friend, you and I are going on a spending spree.""","Milhouse Van Houten: Can I see the fifty, Bart? Can I? Huh?",259.0,270.0,Bart Simpson: And I won fifty bucks!,259.0,5.0,0
19242,77199,267,270,Homer Simpson: Orphans lighting candles over a leaking gas line... Lighting candles!,1199000,True,2.0,2254.0,Homer Simpson,FOOD BOOTH,Orphans lighting candles over a leaking gas line... Lighting candles!,orphans lighting candles over a leaking gas line lighting candles,10.0,2.0,Homer Simpson: Bullies breaking bread with nerds...,267.0,2254.0,True,True,"R_3: ""Homer Simpson: Ned, Praiseland has touched an entire town with its inspiring message and toxic super-freakouts. Look at those smiling faces.""; R_2: ""Homer Simpson: Rich laughing with poor...""; R_1: ""Homer Simpson: Bullies breaking bread with nerds...""",Homer Simpson: Rich laughing with poor...,267.0,2254.0,"Homer Simpson: Ned, Praiseland has touched an entire town with its inspiring message and toxic super-freakouts. Look at those smiling faces.",267.0,2254.0,0
19243,100844,351,14,"Homer Simpson: Don't worry, Marge -- Hot Wheels to the rescue!",104000,True,2.0,5.0,Homer Simpson,Simpson Home,"Don't worry, Marge -- Hot Wheels to the rescue!",dont worry marge -- hot wheels to the rescue,9.0,1.0,"Marge Simpson: Homer, do something about this leak!",351.0,5.0,True,True,"R_1: ""Marge Simpson: Homer, do something about this leak!""",Kent Brockman: There you have it.,351.0,270.0,Kent Brockman: (EXASPERATED) Fine.,351.0,270.0,0


In [None]:
# Wrap the current reply as `R_0: "<raw text>"` so it follows the same pattern
# as the R_1..R_3 turns stored in the context column, then keep only the
# context, the tagged reply and the label, exposing the reply under its
# target-answer column name.
simps_df = (
    simps_df
    .assign(**{constants.RAW_TEXT_COL: 'R_0: "' + simps_df[constants.RAW_TEXT_COL] + '"'})
    .loc[:, [constants.PREMISE_UPDATED_COL, constants.RAW_TEXT_COL, constants.LABEL_COL]]
    .rename(columns={constants.RAW_TEXT_COL: constants.TARGET_CHAR_ANSWER_COL})
)

In [None]:
# Preview the first ten rows of the reduced frame (context, tagged reply, label).
simps_df.head(10)

Unnamed: 0,premise_updated_col,target_char_answer_col,label
0,"R_3: ""Marge Simpson: All right, children. Let me have those letters. I'll send them to Santa's workshop at the North Pole.""; R_2: ""Bart Simpson: Oh, please. (TO LISA) There's only one fat guy that brings us presents and his name ain't Santa.""; R_1: ""Marge Simpson: A pony? Oh, Lisa. You've asked for that for the last three years and I keep telling you Santa can't fit a pony into his sleigh. Can't you take a hint?""","R_0: ""Lisa Simpson: But I really want a pony and I've been really, really good this year.""",1
1,"R_3: ""Bart Simpson: Good one, Dad.""; R_2: ""Homer Simpson: Okay, kids. Prepare to be dazzled. (CALLING) Marge! Turn on the juice!""; R_1: ""Homer Simpson: What do you think, kids?""","R_0: ""Lisa Simpson: Nice try, Dad.""",1
2,"R_3: ""Homer Simpson: What is it, Flanders?""; R_2: ""Ned Flanders: Do you think this looks okay?""; R_1: ""Mechanical Santa: Ho ho ho. Ho ho ho. Ho ho ho. Ho ho ho.""","R_0: ""Lisa Simpson: (ADMIRING) Ooh.""",1
3,"R_1: ""Marge Simpson: Kids, you want to go Christmas shopping?""","R_0: ""Lisa Simpson: I do!""",1
4,"R_3: ""Bart Simpson: Ow! Quit it!""; R_2: ""Homer Simpson: Hey! What's with this?""; R_1: ""Bart Simpson: Ow! Quit it. It used to be a real boss tattoo.""","R_0: ""Lisa Simpson: But Mom had to spend all the Christmas money having it surgically removed.""",1
5,"R_3: ""Homer Simpson: (EXHAUSTED) Not a word, Marge. I'm heading straight for the tub.""; R_2: ""Marge Simpson: But Homer, my sisters are here. Don't you want to say hello?""; R_1: ""Bart Simpson: Oh, Dad, you're finally home.""","R_0: ""Lisa Simpson: Daddy! We're so glad to see you!""",1
6,"R_3: ""Homer Simpson: And why is that?""; R_2: ""Patty Bouvier: Well, for one thing, there's no tree.""; R_1: ""Homer Simpson: Well, I was just on my way out to get one.""","R_0: ""Lisa Simpson: Can we go too, Dad?""",1
7,"R_2: ""Homer Simpson: So, what do you think kids? Beauty, isn't it?""; R_1: ""Bart Simpson: Yeah, Homer.""","R_0: ""Lisa Simpson: Way to go, Dad.""",1
8,"R_2: ""Bubbles: (FROM TV) Hey, Moldy, do you think Santa will be able to find Elf County under all this snow?""; R_1: ""Moldy: (FROM TV, SADLY) I doubt it, Bubbles. We'll be sad little elves this Christmas.""","R_0: ""Lisa Simpson: Oh, no!""",1
9,"R_2: ""Elf #1: (THRU TV) Three cheers for Brainy.""; R_1: ""Elves: (THRU TV, all together) Hip hip hooray, hip hip hooray, hip hip hooray!""","R_0: ""Lisa Simpson: Yay!""",1


In [None]:
# Preview the last ten rows of the reduced frame (context, tagged reply, label).
simps_df.tail(10)

Unnamed: 0,premise_updated_col,target_char_answer_col,label
19234,"R_3: ""Homer Simpson: (READS) I left a few helpful notes around the house...""; R_2: ""Marge Simpson: (READS) Put food in me.""; R_1: ""Homer Simpson: I'll take that.""","R_0: ""Marge Simpson: (ROLLING EYES) Well, duh! (SARCASTIC) With what, Ned?""",0
19235,"R_3: ""Marge Simpson: When do we want it?""; R_2: ""Her Group: (ALL AT ONCE) Now!""; R_1: ""Marge Simpson: What do we want?""","R_0: ""Marge Simpson: HER GROUP""",0
19236,"R_1: ""Dole: Well, 'night, everybody.""","R_0: ""Committee Staff: 'Night Sir / Good night, Mr. Dole / Good luck next week.""",0
19237,"R_3: ""Seymour Skinner: Oh.. uh... sorry.""; R_2: ""Australian Boy: Come on, luv... Loosen up.""; R_1: ""Australian Girl: I just can't forget what happened to that poor dingo back there. Who, or what, could've done such a horrible thing?""","R_0: ""Australian Boy: Oh, it was probably just a wallaby... now come on...""",0
19238,"R_1: ""Carl Carlson: I don't have time for peeling. Squirt 'em in.""","R_0: ""Ned Flanders: Well boys, Daddy's back on the beam, thanks to Christian prayer and Doctor Sheldon Lowenstein!""",0
19239,"R_1: ""Pilot: (GASPS, THEN MESMERIZED) It's beautiful.""","R_0: ""Pilot: Are you okay?""",0
19240,"R_3: ""Lisa Simpson: (READING) ""Dear motorist, your vehicle is illegally parked in the borough of Manhattan...""""; R_2: ""Homer Simpson: (HOPEFUL) My vehicle!""; R_1: ""Lisa Simpson: ""...If you do not remedy this malparkage within 72 hours, your car will be thrown into the East River at your expense.""""","R_0: ""Homer Simpson: (WHINY) Oh, I don't want to go to New York City.""",0
19241,"R_2: ""Milhouse Van Houten: Can I see the fifty, Bart? Can I? Huh?""; R_1: ""Bart Simpson: Milhouse, my friend, you and I are going on a spending spree.""","R_0: ""Milhouse Van Houten: My doctor says I'm not supposed to go on sprees.""",0
19242,"R_3: ""Homer Simpson: Ned, Praiseland has touched an entire town with its inspiring message and toxic super-freakouts. Look at those smiling faces.""; R_2: ""Homer Simpson: Rich laughing with poor...""; R_1: ""Homer Simpson: Bullies breaking bread with nerds...""","R_0: ""Homer Simpson: Orphans lighting candles over a leaking gas line... Lighting candles!""",0
19243,"R_1: ""Marge Simpson: Homer, do something about this leak!""","R_0: ""Homer Simpson: Don't worry, Marge -- Hot Wheels to the rescue!""",0


In [None]:
# Print column dtypes and non-null counts, then show how many rows carry each
# label value (a quick class-balance check).
simps_df.info()
simps_df[constants.LABEL_COL].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19244 entries, 0 to 19243
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   premise_updated_col     19244 non-null  object
 1   target_char_answer_col  19244 non-null  object
 2   label                   19244 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 451.2+ KB


1    9622
0    9622
Name: label, dtype: int64

## Создание словаря для тренировки

In [None]:
# Turn the frame into a list with one dict per row (column name -> value).
filtered_data = simps_df.to_dict(orient='records')
# Display a single record as a spot check of the resulting structure.
filtered_data[4:5]

[{'premise_updated_col': 'R_3: "Bart Simpson: Ow! Quit it!"; R_2: "Homer Simpson: Hey! What\'s with this?"; R_1: "Bart Simpson: Ow! Quit it. It used to be a real boss tattoo."',
  'target_char_answer_col': 'R_0: "Lisa Simpson: But Mom had to spend all the Christmas money having it surgically removed."',
  'label': 1}]

In [None]:
# Show only the context strings of the first five records.
[sample[constants.PREMISE_UPDATED_COL] for sample in filtered_data[0:5]]

['R_3: "Marge Simpson: All right, children. Let me have those letters. I\'ll send them to Santa\'s workshop at the North Pole."; R_2: "Bart Simpson: Oh, please. (TO LISA) There\'s only one fat guy that brings us presents and his name ain\'t Santa."; R_1: "Marge Simpson: A pony? Oh, Lisa. You\'ve asked for that for the last three years and I keep telling you Santa can\'t fit a pony into his sleigh. Can\'t you take a hint?"',
 'R_3: "Bart Simpson: Good one, Dad."; R_2: "Homer Simpson: Okay, kids. Prepare to be dazzled. (CALLING) Marge! Turn on the juice!"; R_1: "Homer Simpson: What do you think, kids?"',
 'R_3: "Homer Simpson: What is it, Flanders?"; R_2: "Ned Flanders: Do you think this looks okay?"; R_1: "Mechanical Santa: Ho ho ho. Ho ho ho. Ho ho ho. Ho ho ho."',
 'R_1: "Marge Simpson: Kids, you want to go Christmas shopping?"',
 'R_3: "Bart Simpson: Ow! Quit it!"; R_2: "Homer Simpson: Hey! What\'s with this?"; R_1: "Bart Simpson: Ow! Quit it. It used to be a real boss tattoo."']

In [None]:
# Persist the full list of labelled records with joblib; the value displayed
# below is dump()'s return value, the list of file names it wrote.
dump(filtered_data, constants.PROCESSED_QA_PATH)

['/content/drive/MyDrive/docs/keepForever/mipt/nlp/hw1_4sem/code/ml/data/processed/qa_pairs.joblib']

## Создание словаря для инференса

In [None]:
# Keep only the records labelled 1 and drop the label field from each of them,
# leaving context/answer pairs.
lisa_qa_pairs = [
    {field: value for field, value in record.items() if field != constants.LABEL_COL}
    for record in filtered_data
    if record[constants.LABEL_COL] == 1
]
# Collect just the answer strings of those pairs, in the same order.
lisa_answers = [pair[constants.TARGET_CHAR_ANSWER_COL] for pair in lisa_qa_pairs]

In [None]:
# Spot-check the first two pairs: the label key should no longer be present.
lisa_qa_pairs[0:2]

[{'premise_updated_col': 'R_3: "Marge Simpson: All right, children. Let me have those letters. I\'ll send them to Santa\'s workshop at the North Pole."; R_2: "Bart Simpson: Oh, please. (TO LISA) There\'s only one fat guy that brings us presents and his name ain\'t Santa."; R_1: "Marge Simpson: A pony? Oh, Lisa. You\'ve asked for that for the last three years and I keep telling you Santa can\'t fit a pony into his sleigh. Can\'t you take a hint?"',
  'target_char_answer_col': 'R_0: "Lisa Simpson: But I really want a pony and I\'ve been really, really good this year."'},
 {'premise_updated_col': 'R_3: "Bart Simpson: Good one, Dad."; R_2: "Homer Simpson: Okay, kids. Prepare to be dazzled. (CALLING) Marge! Turn on the juice!"; R_1: "Homer Simpson: What do you think, kids?"',
  'target_char_answer_col': 'R_0: "Lisa Simpson: Nice try, Dad."'}]

In [None]:
# Spot-check the first five answer strings.
lisa_answers[0:5]

['R_0: "Lisa Simpson: But I really want a pony and I\'ve been really, really good this year."',
 'R_0: "Lisa Simpson: Nice try, Dad."',
 'R_0: "Lisa Simpson: (ADMIRING) Ooh."',
 'R_0: "Lisa Simpson: I do!"',
 'R_0: "Lisa Simpson: But Mom had to spend all the Christmas money having it surgically removed."']

In [None]:
# Save the label-free context/answer pairs with joblib.
dump(lisa_qa_pairs, constants.TARGET_CHAR_PROCESSED_QA_PATH)

['/content/drive/MyDrive/docs/keepForever/mipt/nlp/hw1_4sem/code/ml/data/processed/target_char_qa_pairs.joblib']

In [None]:
# Read the file back and show its first two entries to confirm the round trip.
# NOTE: joblib files are pickle-based, so load() should only be used on trusted files.
load(constants.TARGET_CHAR_PROCESSED_QA_PATH)[0:2]

[{'premise_updated_col': 'R_3: "Marge Simpson: All right, children. Let me have those letters. I\'ll send them to Santa\'s workshop at the North Pole."; R_2: "Bart Simpson: Oh, please. (TO LISA) There\'s only one fat guy that brings us presents and his name ain\'t Santa."; R_1: "Marge Simpson: A pony? Oh, Lisa. You\'ve asked for that for the last three years and I keep telling you Santa can\'t fit a pony into his sleigh. Can\'t you take a hint?"',
  'target_char_answer_col': 'R_0: "Lisa Simpson: But I really want a pony and I\'ve been really, really good this year."'},
 {'premise_updated_col': 'R_3: "Bart Simpson: Good one, Dad."; R_2: "Homer Simpson: Okay, kids. Prepare to be dazzled. (CALLING) Marge! Turn on the juice!"; R_1: "Homer Simpson: What do you think, kids?"',
  'target_char_answer_col': 'R_0: "Lisa Simpson: Nice try, Dad."'}]

In [None]:
# Save the plain list of answer strings with joblib.
dump(lisa_answers, constants.TARGET_CHAR_PROCESSED_ANSWERS_PATH)

['/content/drive/MyDrive/docs/keepForever/mipt/nlp/hw1_4sem/code/ml/data/processed/target_char_answers.joblib']

In [None]:
# Read the answers file back and show its first two entries to confirm the round trip.
# NOTE: joblib files are pickle-based, so load() should only be used on trusted files.
load(constants.TARGET_CHAR_PROCESSED_ANSWERS_PATH)[0:2]

['R_0: "Lisa Simpson: But I really want a pony and I\'ve been really, really good this year."',
 'R_0: "Lisa Simpson: Nice try, Dad."']