In [1]:
import os
import re
import operator
import pandas as pd
from tqdm.notebook import tqdm
from googletrans import Translator
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

In [2]:
# Shared resources used by the rest of the notebook: the Google Translate
# client and the sentence-embedding model used for semantic similarity.
translator = Translator(service_urls=['translate.google.co.in'])
model = SentenceTransformer('all-MiniLM-L6-v2')

# Directory holding the input files. No trailing separator is hard-coded:
# os.path.join() adds the right one for the current platform, whereas the
# previous "subs\\" only produced a valid path on Windows.
path = "subs"
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the combined frame stable between runs.
all_files = sorted(os.listdir(path))

In [3]:
# Read every file listed in all_files and stack them into one frame, tagging
# each row with the name of the file it came from.
# The per-file frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration, which is quadratic in the number of files.
frames = []
for file in tqdm(all_files):
    file_path = os.path.join(path, file)
    df = pd.read_csv(file_path)
    df["file_name"] = file
    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory contained no files (the same result the incremental loop gave).
new_df = pd.concat(frames) if frames else pd.DataFrame([])

HBox(children=(FloatProgress(value=0.0, max=6940.0), HTML(value='')))




In [3]:
def jaccard(a, b):
    """Return the Jaccard similarity of two collections of hashable items.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is |a ∩ b| / |a ∪ b| as a float in [0.0, 1.0].

    When both collections are empty the union is empty as well; 0.0 is
    returned in that case instead of raising ZeroDivisionError.
    """
    a = set(a)
    b = set(b)
    union_size = len(a | b)
    if union_size == 0:
        # Nothing to compare: treat as "no overlap" rather than dividing by 0.
        return 0.0
    return len(a & b) / union_size

In [5]:
# Removing duplicates

dedup_df = new_df.drop_duplicates()
# reset_index() (without drop=True) keeps the previous index as an 'index'
# column; later cells drop that column by name, so it is kept here.
dedup_df = dedup_df.reset_index()

# Removing mappings with a low quality score: both the semantic score and the
# token-overlap score must exceed 0.2. (A separate "semantic_score > 0.01"
# pass is unnecessary because it is implied by this condition.)

quality_mask = operator.and_(dedup_df['semantic_score'] > 0.2, dedup_df['score'] > 0.2)
dedup_df = dedup_df[quality_mask]

# Removing en_text with 3 or fewer characters (measured on str(value)).
# .copy() makes the result an independent frame so that the column
# assignments below do not write into a slice of the unfiltered data.

dedup_df = dedup_df[dedup_df['en_text'].map(str).map(len) > 3].copy()

# Remove markup inside {} and <> in jp_text and en_text, and text inside
# full-width parentheses in jp_text.
# The patterns are non-greedy: a greedy ".*" would span from the first opening
# delimiter to the LAST closing one on the line and delete the real text that
# sits between two tags (e.g. "{\i1}Hello{\i0}" would become "").

tag_patterns = {
    'jp_text': [r"\{.*?\}", r"<.*?>", r"（.*?）"],
    'en_text': [r"\{.*?\}", r"<.*?>"],
}
for column, patterns in tag_patterns.items():
    for pattern in patterns:
        dedup_df[column] = [re.sub(pattern, "", x) for x in dedup_df[column]]

# Replacing the literal two-character sequence \N (backslash + N) with a real
# newline in en_text. The match is requested as a plain literal explicitly:
# the original call passed the regex r"\\N" and relied on the default of the
# `regex` argument, which is False from pandas 2.0 on, so the pattern would
# silently stop matching there.
dedup_df['en_text'] = dedup_df['en_text'].str.replace("\\N", "\n", regex=False)

In [6]:
# Trim leading/trailing whitespace (stray newlines included) on both text columns

for text_col in ('en_text', 'jp_text'):
    dedup_df[text_col] = dedup_df[text_col].str.strip()

# Label each row with a 'type' describing which side still spans several lines.
# Lookup key: ("\n" in en_text, "\n" in jp_text)
#   (True,  True)  -> 0 : both sides are multi-line
#   (True,  False) -> 1 : only en_text is multi-line
#   (False, True)  -> 2 : only jp_text is multi-line
#   (False, False) -> 3 : both sides are single-line

type_by_newlines = {
    (True, True): 0,
    (True, False): 1,
    (False, True): 2,
    (False, False): 3,
}
typ = [
    type_by_newlines[("\n" in en_line, "\n" in jp_line)]
    for en_line, jp_line in zip(dedup_df['en_text'], dedup_df['jp_text'])
]

dedup_df['type'] = typ

# Drop single-token English lines unless their score is exactly 1.
# The token lists are only needed to count words, so that helper column is
# removed again once the rows have been filtered.

dedup_df['en_text_words'] = dedup_df['en_text'].apply(word_tokenize)
dedup_df['word_count'] = dedup_df['en_text_words'].apply(len)
single_word_mismatch = (dedup_df['word_count'] == 1) & (dedup_df['score'] != 1)
drop_df = dedup_df.loc[single_word_mismatch]
dedup_df = dedup_df.drop(index=drop_df.index).drop(columns=["en_text_words"])

In [8]:
# The cleaned frame was cached once with:
#     dedup_df.to_csv("dedup_df.csv")
# Reload that cache and discard every row that has a missing value.
dedup_df = pd.read_csv("dedup_df.csv").dropna()

In [9]:
# Type 1 rows (newline in en_text only), without the two leftover index
# columns 'index' and 'Unnamed: 0' that came along in the cached CSV.
is_type_one = dedup_df["type"] == 1
type_one_df = dedup_df.loc[is_type_one].drop(columns=['index', 'Unnamed: 0'])
type_one_df.head()

Unnamed: 0,en_text,jp_text,google_translated,score,semantic_score,file_name,type,word_count
0,It's that dream again.\nJust who is that guy?,またあの夢だ　誰なんだ　あの人は,Another dream that is that dream,0.3,0.59504,07-Ghost-1.csv,1,12
5,"Chairman Miroku, I've heard that the level \no...",ミロク理事長　今年の卒業生はレベルが高いと伺いましたぞ,Miroku President's graduates of this year have...,0.36,0.834872,07-Ghost-1.csv,1,22
11,I'm going to fight for the empire and protect ...,俺　帝国のために戦って　家族を守ってみせる,I will fight for the empire and protect my family,0.363636,0.624846,07-Ghost-1.csv,1,24
16,"After my judgment, I was taken in by Miroku.\n...",家族の愛情とか知らない,I do not know the loving of my family,0.26087,0.560514,07-Ghost-1.csv,1,25
28,If this guy is not defeated or if you \nabando...,皆で力を合わせないと　本当にやられてしまいますよ,It will really be done if you all do not match...,0.217391,0.370212,07-Ghost-1.csv,1,17


In [10]:
# Type 1 rows pair a two-line en_text with a single-line jp_text.
# For every such row, each English line is embedded and compared (cosine
# similarity) with the embedding of the machine translation of jp_text:
#   * if one line is strictly more similar than the other AND more similar
#     than the row's stored semantic_score, only that line is kept, its
#     Jaccard score / similarity / word count are recomputed and the row is
#     labelled type 3;
#   * otherwise the row is kept with both English lines joined together.
# Rows whose en_text does not split into exactly two lines are skipped.
type_one_split_list = []
for i, row in tqdm(type_one_df.iterrows(), total = len(type_one_df)):
    try:
        first, second = row['en_text'].split("\n")
    except (ValueError, AttributeError):
        # ValueError: not exactly two lines; AttributeError: value is not a
        # string. A bare `except` would also swallow KeyboardInterrupt and make
        # this long loop impossible to stop cleanly.
        continue
    translated_text = row['google_translated']
    file_name = row['file_name']
    embeddings_translated = model.encode(translated_text, convert_to_tensor=True)
    embeddings1 = model.encode(first, convert_to_tensor=True)
    embeddings2 = model.encode(second, convert_to_tensor=True)
    cosine_scores1 = util.pytorch_cos_sim(embeddings_translated, embeddings1)
    cosine_scores2 = util.pytorch_cos_sim(embeddings_translated, embeddings2)
    original_score = row['semantic_score']
    translated_tokenized = word_tokenize(translated_text.lower())

    if cosine_scores1 > cosine_scores2 and cosine_scores1 > original_score:
        # The first English line alone is the better match.
        en_tokenized = word_tokenize(first.lower())
        score = jaccard(translated_tokenized, en_tokenized)
        temp_list = [first, row['jp_text'], translated_text, score, cosine_scores1.item(), file_name, 3, len(en_tokenized)]

    elif cosine_scores2 > cosine_scores1 and cosine_scores2 > original_score:
        # The second English line alone is the better match.
        en_tokenized = word_tokenize(second.lower())
        score = jaccard(translated_tokenized, en_tokenized)
        temp_list = [second, row['jp_text'], translated_text, score, cosine_scores2.item(), file_name, 3, len(en_tokenized)]

    else:
        # Neither line alone improves on the stored score: keep both lines.
        # They are joined with a single space; removing the newline without a
        # separator would fuse the last word of the first line with the first
        # word of the second ("again.\nJust" -> "again.Just").
        # Work on a copy so the row yielded by iterrows() is not mutated.
        kept_row = row.copy()
        kept_row["en_text"] = " ".join(
            part for part in (first.strip(), second.strip()) if part
        )
        temp_list = list(kept_row)

    type_one_split_list.append(temp_list)

# Label the columns right away (every entry follows the column order of
# type_one_df) so the frame is well-formed even when no row was collected.
type_one_split_df = pd.DataFrame(type_one_split_list, columns=type_one_df.columns)

HBox(children=(FloatProgress(value=0.0, max=16913.0), HTML(value='')))




In [4]:
# Reload the cached cleaned frame and discard rows with missing values.
dedup_df = pd.read_csv("dedup_df.csv").dropna()
#clean_type_2_df = pd.read_csv("type_2.csv")

# Type 2 rows (newline in jp_text only), renumbered from 0. The old index is
# discarded directly instead of being materialised as a 'level_0' column and
# dropped afterwards; the two leftover index columns from the CSV are removed.
is_type_two = dedup_df["type"] == 2
type_two_df = (
    dedup_df.loc[is_type_two]
    .reset_index(drop=True)
    .drop(columns=['index', 'Unnamed: 0'])
)
type_two_df.head()

Unnamed: 0,en_text,jp_text,google_translated,score,semantic_score,file_name,type,word_count
0,Are you crazy?,って お前ここ男子トイレだぞ\nバッカじゃねえの？,You are a boy's toilet here\nAren't you stupid?,0.272727,0.277794,Accel_World-1.csv,2,4
1,Um...,こんにちは\n君…,Hello\n ...,0.333333,0.462535,Accel_World-1.csv,2,2
2,Accelerated?,加速？\nそうだ,Accelerated?,1.0,1.0,Accel_World-1.csv,2,2
3,But this is your chance.,これはチャンスなのだよ\nえっ？,This is a chance\n?,0.375,0.674521,Accel_World-1.csv,2,6
4,It's a great chance.,おとなしく殴られますよ\nせっかくのチャンスですから,I will be beaten\nBecause it is a great opport...,0.230769,0.405899,Accel_World-1.csv,2,6


In [7]:
# Type 2 rows pair a single-line en_text with a two-line jp_text.
# The stored machine translation (google_translated) is split into the same
# two lines; each translated line is embedded and compared (cosine similarity)
# with the embedding of en_text:
#   * if one line is strictly more similar than the other AND more similar
#     than the row's stored semantic_score, only that Japanese line and its
#     translation are kept, the Jaccard score / similarity are recomputed and
#     the row is labelled type 3;
#   * otherwise the row is kept with the newline removed from jp_text.
# Rows where jp_text or google_translated does not split into exactly two
# lines are skipped. The translations come from the precomputed column; the
# translator client is not called here.
type_two_split_list = []
no = 0  # suffix counter for the output file name
for i, row in tqdm(type_two_df.iterrows(), total = len(type_two_df)):
    try:
        first_translated_text, second_translated_text = row['google_translated'].split("\n")
        first, second = row['jp_text'].split("\n")
    except (ValueError, AttributeError):
        # ValueError: not exactly two lines; AttributeError: value is not a
        # string. A bare `except` would also swallow KeyboardInterrupt and make
        # this long loop impossible to stop cleanly.
        continue
    en_text = row['en_text']
    file_name = row['file_name']
    embeddings_text = model.encode(en_text, convert_to_tensor=True)
    embeddings1 = model.encode(first_translated_text, convert_to_tensor=True)
    embeddings2 = model.encode(second_translated_text, convert_to_tensor=True)
    cosine_scores1 = util.pytorch_cos_sim(embeddings_text, embeddings1)
    cosine_scores2 = util.pytorch_cos_sim(embeddings_text, embeddings2)
    original_score = row['semantic_score']
    en_tokenized = word_tokenize(en_text.lower())

    if cosine_scores1 > cosine_scores2 and cosine_scores1 > original_score:
        # The first Japanese line alone is the better match.
        first_tokenized = word_tokenize(first_translated_text.lower())
        score = jaccard(first_tokenized, en_tokenized)
        temp_list = [row['en_text'], first, first_translated_text, score, cosine_scores1.item(), file_name, 3, len(en_tokenized)]

    elif cosine_scores2 > cosine_scores1 and cosine_scores2 > original_score:
        # The second Japanese line alone is the better match.
        second_tokenized = word_tokenize(second_translated_text.lower())
        score = jaccard(second_tokenized, en_tokenized)
        temp_list = [row['en_text'], second, second_translated_text, score, cosine_scores2.item(), file_name, 3, len(en_tokenized)]

    else:
        # Neither line alone improves on the stored score: keep the whole
        # jp_text with the line break removed. Work on a copy so the row
        # yielded by iterrows() is not mutated.
        kept_row = row.copy()
        kept_row["jp_text"] = kept_row["jp_text"].replace("\n","")
        temp_list = list(kept_row)

    type_two_split_list.append(temp_list)

# Persist the result; with no == 1 the file is written as "type_2_1.csv".
type_two_split_df = pd.DataFrame(type_two_split_list)
no += 1
type_two_split_df.to_csv("type_2_"+str(no)+".csv")

HBox(children=(FloatProgress(value=0.0, max=85600.0), HTML(value='')))




In [26]:
# Type 3 rows (no newline on either side) need no re-scoring; only the two
# leftover index columns are removed.
is_type_three = dedup_df["type"] == 3
type_three_df = dedup_df.loc[is_type_three].drop(columns=['index', 'Unnamed: 0'])

In [46]:
# Give both re-scored frames the column labels of type_three_df so that the
# three frames line up when they are stacked.
shared_columns = type_three_df.columns
for split_df in (type_one_split_df, type_two_split_df):
    split_df.columns = shared_columns

In [48]:
# Stack the three parts row-wise into the final dataset and show its size.
frames_to_merge = [type_one_split_df, type_two_split_df, type_three_df]
clean_df = pd.concat(frames_to_merge, axis="index")
clean_df.shape

(475038, 8)

In [49]:
# Write the final dataset to disk; the row index is not written.
clean_df.to_csv(path_or_buf="clean_tl_dataset.csv", index=False)

In [56]:
# Eyeball five random rows: English line, Japanese line, machine translation.
preview_columns = ['en_text', 'jp_text', 'google_translated']
clean_df.sample(n=5)[preview_columns]

Unnamed: 0,en_text,jp_text,google_translated
249842,My cigarette says that one.,俺のタバコは あの店を選んだぜ,My cigarette chose that store
149436,What is the one thing I don't want to let go?,手放したくないもんはどれ？,Which do you don't want to let go?
338844,I can't move at all.,まったく 何やってるんだ,I'm doing anything at all
389797,"I'll defeat Kaito, Rook, and all you freaks. T...",カイトもルークも　全員ぶっ倒し 俺がファイ・ブレインになる。,All kites and Luke are tired and I become a fi...
258835,"Write ""contradiction"" three times. contradiction",三回ずつ書け,Write three times
