In [1]:
import warnings
# Suppress all warnings for the rest of the session.
warnings.simplefilter('ignore')

import os
import re
import random

import numpy as np
import pandas as pd
# Raise the pandas column display width limit to 400 characters.
pd.set_option('max_colwidth', 400)

In [2]:
train = pd.read_csv('raw_data/train_dataset_v2.tsv', sep='\t')

# Split each id on '_' into four fields and copy three of them onto the
# frame; the third field (labelled 'A') is not copied.
id_parts = train['id'].str.split('_', expand=True)
id_parts.columns = ['movie', 'scene', 'A', 'movie_id']
for field in ('movie', 'scene', 'movie_id'):
    train[field] = id_parts[field].to_numpy()
train.head()

Unnamed: 0,id,content,character,emotions,movie,scene,movie_id
0,1171_0001_A_1,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,o2,0,1171,1,1
1,1171_0001_A_2,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,c1,0,1171,1,2
2,1171_0001_A_3,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,o2,0,1171,1,3
3,1171_0001_A_4,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,c1,0,1171,1,4
4,1171_0001_A_5,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道你当兵了。,o2,0,1171,1,5


In [3]:
test = pd.read_csv('raw_data/test_dataset.tsv', sep='\t')

# Split each id on '_' into four fields and copy three of them onto the
# frame; the third field (labelled 'A') is not copied.
id_parts = test['id'].str.split('_', expand=True)
id_parts.columns = ['movie', 'scene', 'A', 'movie_id']
for field in ('movie', 'scene', 'movie_id'):
    test[field] = id_parts[field].to_numpy()
test.head()

Unnamed: 0,id,content,character,movie,scene,movie_id
0,34170_0002_A_12,穿着背心的b1醒来，看看手机，三点了。,b1,34170,2,12
1,34170_0002_A_14,b1走出卧室。,b1,34170,2,14
2,34170_0003_A_16,b1拿着手机，点开计时功能。,b1,34170,3,16
3,34170_0003_A_17,b1站在淋浴头下面，水从b1的头和脸上冲刷而过。,b1,34170,3,17
4,34170_0003_A_18,b1摈着呼吸。,b1,34170,3,18


In [4]:
# Prefix each character code with its movie ("<movie>_<character>").
# NOTE: this overwrites `character` in place, so running the cell twice
# prefixes the movie twice.
for frame in (train, test):
    frame['character'] = frame['movie'].astype(str) + '_' + frame['character'].astype(str)

In [5]:
# Number of distinct movie-qualified characters in (train, test).
tuple(frame['character'].nunique() for frame in (train, test))

(607, 372)

In [6]:
# Pool of single characters from which gen_names() draws the two trailing
# characters of a generated name (presumably given-name characters).
FIRST_NAMES = '羿祥惠盛捷霞阳豪誉涵颖梅湘丹勇苗悦朝君杰毓乐曦瑶全恒裕帅馨秋山诗东雯紫木水骏昊艳宗国源莲子锦尔蕾兵天钰财桥轩桐海运坤信卿诚欣茂明晓月韬泳绮侦熙龙舟雨晴元峻程金宇启浩莉彤槐巧艺伟伊扬洋琪正森文鹏辉泽婷美超玉娴智敬奎强玄心高嵘思朗萱昆宸甜凌俊治云仕亭苹喜寅书华瑜晨益仁璇满贵利沁淳林伯晞嘉辰'
# Pool of single characters from which gen_names() draws the leading
# character of a generated name (presumably surnames, which come first in
# Chinese name order).
SECOND_NAMES = '李王张刘陈杨赵黄周吴徐孙胡朱高林何郭马罗梁宋郑谢韩唐冯于'

In [7]:
def gen_names():
    """Return a random three-character name.

    The result is one character from ``SECOND_NAMES`` followed by two
    characters from ``FIRST_NAMES``. The two ``FIRST_NAMES`` characters are
    drawn first (independently, so they may coincide), then the
    ``SECOND_NAMES`` character, all from the module-level ``random``
    generator.
    """
    trailing = [random.choice(FIRST_NAMES) for _ in range(2)]
    leading = random.choice(SECOND_NAMES)
    return leading + ''.join(trailing)

In [8]:
# Seed the generator so that "Restart & Run All" reproduces the same names.
random.seed(42)

train_characters = train['character'].unique()
test_characters = test['character'].unique()

# Assign every train character its own generated name. gen_names() draws
# independently and can return the same name twice; two characters sharing
# a name would become indistinguishable once the codes in `content` are
# replaced, so colliding draws are retried.
train_mapping = dict()
used_names = set()
for c in train_characters:
    name = gen_names()
    while name in used_names:
        name = gen_names()
    used_names.add(name)
    train_mapping[c] = name

In [9]:
# Look up the generated name for each row's movie-qualified character.
train_names = train['character'].map(train_mapping)
train['character_name'] = train_names
train.head(20)

Unnamed: 0,id,content,character,emotions,movie,scene,movie_id,character_name
0,1171_0001_A_1,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,1171_o2,0.0,1171,1,1,何仁晴
1,1171_0001_A_2,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,1171_c1,0.0,1171,1,2,刘昆诚
2,1171_0001_A_3,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,1171_o2,0.0,1171,1,3,何仁晴
3,1171_0001_A_4,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,1171_c1,0.0,1171,1,4,刘昆诚
4,1171_0001_A_5,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道你当兵了。,1171_o2,0.0,1171,1,5,何仁晴
5,1171_0001_A_6,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道你当兵了。,1171_c1,0.0,1171,1,6,刘昆诚
6,1171_0001_A_7,c1开心地点了点头。,1171_c1,10000.0,1171,1,7,刘昆诚
7,1171_0001_A_8,o2凑近c1小声：办入伍证审的时候，派出所的民警跟我说，你的亲生父亲还在劳改，但是你跟他划清了界限，改姓了你继父的姓，所以出身这一栏，我就给你填革干了，进了团不要跟别人说这件事，我也不会说的。,1171_o2,0.0,1171,1,8,何仁晴
8,1171_0001_A_9,o2凑近c1小声：办入伍证审的时候，派出所的民警跟我说，你的亲生父亲还在劳改，但是你跟他划清了界限，改姓了你继父的姓，所以出身这一栏，我就给你填革干了，进了团不要跟别人说这件事，我也不会说的。,1171_c1,0.0,1171,1,9,刘昆诚
9,1171_0001_A_10,c1再次微笑着点头，然后举手敬礼，但是手的形状却是弯的。,1171_c1,10000.0,1171,1,10,刘昆诚


In [12]:
def replace_text(text, movie, mapping):
    """Replace character codes in ``text`` with the names mapped for ``movie``.

    A character code is a lowercase ASCII letter followed by one or more
    ASCII digits (e.g. ``o2``, ``c1``, ``a10``). Each code is looked up in
    ``mapping`` under the key ``f'{movie}_{code}'``; codes without an entry
    are left unchanged.

    The whole digit run is matched as one token and the text is rewritten in
    a single pass, so a code such as ``a10`` is never treated as ``a1``
    followed by a stray ``0``, and inserted names are never rescanned.

    Args:
        text: The string to rewrite.
        movie: Movie identifier used as the key prefix.
        mapping: Dict from ``'<movie>_<code>'`` to replacement name.

    Returns:
        The text with every mapped code replaced by its name.
    """
    def _name_for(match):
        code = match.group(0)
        # Fall back to the code itself when this movie has no such character.
        return mapping.get(f'{movie}_{code}', code)

    return re.sub(r'[a-z][0-9]+', _name_for, text)

# Rewrite each row's content, resolving character codes against that row's movie.
train['content'] = [
    replace_text(content, movie, train_mapping)
    for content, movie in zip(train['content'], train['movie'])
]

In [13]:
# Preview the first 20 rows after the content rewrite.
train.iloc[:20]

Unnamed: 0,id,content,character,emotions,movie,scene,movie_id,character_name
0,1171_0001_A_1,天空下着暴雨，何仁晴正在给刘昆诚穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,1171_o2,0.0,1171,1,1,何仁晴
1,1171_0001_A_2,天空下着暴雨，何仁晴正在给刘昆诚穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,1171_c1,0.0,1171,1,2,刘昆诚
2,1171_0001_A_3,何仁晴一手拿着一个行李，一路小跑着把刘昆诚带到了文工团门口。,1171_o2,0.0,1171,1,3,何仁晴
3,1171_0001_A_4,何仁晴一手拿着一个行李，一路小跑着把刘昆诚带到了文工团门口。,1171_c1,0.0,1171,1,4,刘昆诚
4,1171_0001_A_5,何仁晴停下来接过刘昆诚手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道你当兵了。,1171_o2,0.0,1171,1,5,何仁晴
5,1171_0001_A_6,何仁晴停下来接过刘昆诚手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道你当兵了。,1171_c1,0.0,1171,1,6,刘昆诚
6,1171_0001_A_7,刘昆诚开心地点了点头。,1171_c1,10000.0,1171,1,7,刘昆诚
7,1171_0001_A_8,何仁晴凑近刘昆诚小声：办入伍证审的时候，派出所的民警跟我说，你的亲生父亲还在劳改，但是你跟他划清了界限，改姓了你继父的姓，所以出身这一栏，我就给你填革干了，进了团不要跟别人说这件事，我也不会说的。,1171_o2,0.0,1171,1,8,何仁晴
8,1171_0001_A_9,何仁晴凑近刘昆诚小声：办入伍证审的时候，派出所的民警跟我说，你的亲生父亲还在劳改，但是你跟他划清了界限，改姓了你继父的姓，所以出身这一栏，我就给你填革干了，进了团不要跟别人说这件事，我也不会说的。,1171_c1,0.0,1171,1,9,刘昆诚
9,1171_0001_A_10,刘昆诚再次微笑着点头，然后举手敬礼，但是手的形状却是弯的。,1171_c1,10000.0,1171,1,10,刘昆诚


In [14]:
# Candidate names for test characters: the distinct names used in train, in
# random order. Duplicates are dropped (first occurrence kept) so the same
# name cannot be handed to two different test characters.
train_res = list(dict.fromkeys(train_mapping.values()))
random.shuffle(train_res)

In [15]:
# Test characters reuse names that already appeared in train, taken in
# shuffled order: the i-th distinct test character gets train_res[i].
test_mapping = {
    character: train_res[position]
    for position, character in enumerate(test_characters)
}

In [16]:
# Look up the assigned name for each row's movie-qualified character.
test_names = test['character'].map(test_mapping)
test['character_name'] = test_names
test.head(10)

Unnamed: 0,id,content,character,movie,scene,movie_id,character_name
0,34170_0002_A_12,穿着背心的b1醒来，看看手机，三点了。,34170_b1,34170,2,12,杨亭萱
1,34170_0002_A_14,b1走出卧室。,34170_b1,34170,2,14,杨亭萱
2,34170_0003_A_16,b1拿着手机，点开计时功能。,34170_b1,34170,3,16,杨亭萱
3,34170_0003_A_17,b1站在淋浴头下面，水从b1的头和脸上冲刷而过。,34170_b1,34170,3,17,杨亭萱
4,34170_0003_A_18,b1摈着呼吸。,34170_b1,34170,3,18,杨亭萱
5,34170_0003_A_20,b1睁开了眼，喘了口气。,34170_b1,34170,3,20,杨亭萱
6,34170_0003_A_21,b1看了看手机，大概四分钟。,34170_b1,34170,3,21,杨亭萱
7,34170_0004_A_24,i3躺在被窝里熟睡。,34170_i3,34170,4,24,马舟宇
8,34170_0004_A_25,b1蹲在床边，拉着i3的手，轻声说道：满儿，爸爸今晚下班以后回来给你过生日。,34170_b1,34170,4,25,杨亭萱
9,34170_0004_A_26,b1蹲在床边，拉着i3的手，轻声说道：满儿，爸爸今晚下班以后回来给你过生日。,34170_i3,34170,4,26,马舟宇


In [17]:
# Rewrite each row's content, resolving character codes against that row's movie.
test['content'] = [
    replace_text(content, movie, test_mapping)
    for content, movie in zip(test['content'], test['movie'])
]

In [18]:
# Preview the first 20 rows after the content rewrite.
test.iloc[:20]

Unnamed: 0,id,content,character,movie,scene,movie_id,character_name
0,34170_0002_A_12,穿着背心的杨亭萱醒来，看看手机，三点了。,34170_b1,34170,2,12,杨亭萱
1,34170_0002_A_14,杨亭萱走出卧室。,34170_b1,34170,2,14,杨亭萱
2,34170_0003_A_16,杨亭萱拿着手机，点开计时功能。,34170_b1,34170,3,16,杨亭萱
3,34170_0003_A_17,杨亭萱站在淋浴头下面，水从杨亭萱的头和脸上冲刷而过。,34170_b1,34170,3,17,杨亭萱
4,34170_0003_A_18,杨亭萱摈着呼吸。,34170_b1,34170,3,18,杨亭萱
5,34170_0003_A_20,杨亭萱睁开了眼，喘了口气。,34170_b1,34170,3,20,杨亭萱
6,34170_0003_A_21,杨亭萱看了看手机，大概四分钟。,34170_b1,34170,3,21,杨亭萱
7,34170_0004_A_24,马舟宇躺在被窝里熟睡。,34170_i3,34170,4,24,马舟宇
8,34170_0004_A_25,杨亭萱蹲在床边，拉着马舟宇的手，轻声说道：满儿，爸爸今晚下班以后回来给你过生日。,34170_b1,34170,4,25,杨亭萱
9,34170_0004_A_26,杨亭萱蹲在床边，拉着马舟宇的手，轻声说道：满儿，爸爸今晚下班以后回来给你过生日。,34170_i3,34170,4,26,马舟宇


In [19]:
# Make sure the output directory exists first: to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('./data', exist_ok=True)

# Persist both frames (with character_name and rewritten content), without the index.
train.to_csv('./data/train_with_names.csv', index=False)
test.to_csv('./data/test_with_names.csv', index=False)