In [1]:
import os
import re
import pandas as pd

# Directory that is scanned for the raw '.csv' files to load.
raw_data_dir = '../data/raw/'
# Destination CSV file for the generated context/response pairs.
output_path = '../data/processed/context_answer.csv'
# Value of the 'name' column whose lines are used as responses.
character = 'House'

## Загружаем данные

In [2]:
# Load every CSV file from the raw-data directory and combine them into one DataFrame.

# os.listdir returns entries in arbitrary order. Sort the paths so that the row order
# of the concatenated frame (and therefore which line precedes which) is reproducible.
all_files = sorted(
    os.path.join(raw_data_dir, f)
    for f in os.listdir(raw_data_dir)
    if f.endswith('.csv')
)
if not all_files:
    # pd.concat would raise a ValueError anyway ("No objects to concatenate");
    # raise the same exception type with a message that names the real problem.
    raise ValueError(f"No .csv files found in {raw_data_dir!r}")

# Each file is decoded as ISO-8859-1.
df_list = [pd.read_csv(file, encoding='ISO-8859-1') for file in all_files]
full_text = pd.concat(df_list, ignore_index=True)

full_text

Unnamed: 0,name,line
0,House,Pericardial effusion.
1,Taub,Which wasn't there last night when we did the...
2,Foreman,"She's getting worse, and there's no sign of a..."
3,Chase,Or maybe a cancer.
4,Foreman,"We could use your opinion, House, or at least..."
...,...,...
75307,House,[loudly] I will not have sex with you! Not ag...
75308,Cuddy,Stacy's husband is going to need close monito...
75309,House,Did she say yes?
75310,Cuddy,She said only if it was okay with you. [House...


## Очищаем текст

In [3]:
# Keep only rows where both the speaker name and the spoken line are present,
# then renumber the index from zero.
full_clean_text = full_text.dropna(subset=['name', 'line']).reset_index(drop=True)


def _clean_line(raw_line):
    """Remove [bracketed] and (parenthesised) spans, then collapse whitespace runs to single spaces."""
    without_asides = re.sub(r'\[.*?\]|\(.*?\)', '', raw_line)
    return ' '.join(without_asides.split())


# Overwrite the 'line' column with its cleaned text.
full_clean_text.loc[:, 'line'] = full_clean_text['line'].apply(_clean_line)

full_clean_text

Unnamed: 0,name,line
0,House,Pericardial effusion.
1,Taub,Which wasn't there last night when we did the ...
2,Foreman,"She's getting worse, and there's no sign of a ..."
3,Chase,Or maybe a cancer.
4,Foreman,"We could use your opinion, House, or at least ..."
...,...,...
75306,House,I will not have sex with you! Not again! Miser...
75307,Cuddy,Stacy's husband is going to need close monitor...
75308,House,Did she say yes?
75309,Cuddy,She said only if it was okay with you. Yes or no?


## Создаем пары 'контекст-ответ' для выбранного персонажа.

In [4]:
# Build (context, response) pairs: for every line spoken by `character`, the context
# is the line found in the row immediately before it.
# NOTE(review): the preceding row is used regardless of who speaks it or which source
# file it came from, so a context can be the character's own previous line or the
# last line of a different file. Confirm this is intended.
responses = full_clean_text['line']
contexts = responses.shift(1)  # previous row's line; NaN for the very first row

keep = (
    full_clean_text['name'].eq(character)
    & contexts.fillna('').ne('')    # skip a missing or empty context
    & responses.fillna('').ne('')   # skip an empty response
)

# Building from a dict of Series guarantees the 'context' and 'response' columns
# exist even when no row matches.
pairs_df = pd.DataFrame(
    {'context': contexts[keep], 'response': responses[keep]}
).reset_index(drop=True)

pairs_df

Unnamed: 0,context,response
0,"We could use your opinion, House, or at least ...",Jack Cannon is not dead. It's worse. Ten books...
1,People who want to sell you the 11th.,She was gonna kill herself. This is the final ...
2,Is there anything in it that pertains to our c...,Yeah. I know why she wanted to kill herself.
3,Yeah. I know why she wanted to kill herself.,"His mentor, Helen Rutherford, has contracted a..."
4,How do we know this mentor character's really ...,Helen dies halfway through the book. Shoots he...
...,...,...
20864,"No, about me. I'm not over you. You were, you ...","So I'm the guy, but you want the other guy, wh..."
20865,"What's so great about you, you always think yo...",Okay.
20866,I want to run something by you.,I will not have sex with you! Not again! Miser...
20867,Stacy's husband is going to need close monitor...,Did she say yes?


## Сохраняем DataFrame в CSV-файл

In [5]:
# Write the pairs to CSV without the index column. Create the destination directory
# first so the write does not fail on a fresh checkout where it does not exist yet.
output_dir = os.path.dirname(output_path)
if output_dir:  # empty string when output_path is a bare filename
    os.makedirs(output_dir, exist_ok=True)
pairs_df.to_csv(output_path, index=False)
