In [115]:
import pandas as pd
import numpy as np

# Load the raw Chinese and Korean annotation sheets (header rows are fixed up in later cells).
df_zh, df_ko = pd.read_csv('zh.csv'), pd.read_csv('ko.csv')

In [116]:
def drop_pairs(df):
    """Drop every two-row pair in which at least one row has malformed brackets.

    Rows are treated as consecutive pairs by position: (0, 1), (2, 3), ...
    A pair is removed when either of its rows has the string ``'TRUE'`` in the
    ``'Malformed Brackets'`` column; otherwise both rows are kept.

    Pairing is positional, so the result does not depend on the frame's index
    labels (the index of the returned frame is not reset).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Malformed Brackets'`` column and an even number of rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` belonging to pairs with no flagged row.

    Raises
    ------
    ValueError
        If ``df`` has an odd number of rows, i.e. the last row has no partner.
    """
    n_rows = len(df)
    if n_rows % 2:
        # A dangling row means the pair alignment is off; fail loudly rather
        # than with an opaque lookup error on the final iteration.
        raise ValueError(
            f"drop_pairs expects an even number of rows, got {n_rows}"
        )

    # True where the row itself is flagged as malformed.
    malformed = df['Malformed Brackets'].eq('TRUE').to_numpy()

    # One entry per pair: True if either member of the pair is flagged.
    pair_is_bad = malformed.reshape(-1, 2).any(axis=1)

    # Expand the per-pair verdict back to one entry per row and invert it.
    keep = ~np.repeat(pair_is_bad, 2)

    return df[keep]

In [117]:
# Row 3 of the raw Korean sheet supplies the column names; the first six rows
# are then discarded and pairs containing a malformed row are filtered out.
df_ko.columns = df_ko.iloc[3]
df_ko = (
    df_ko.iloc[6:]
    .reset_index(drop=True)
    .pipe(drop_pairs)
    .reset_index(drop=True)
)

In [118]:
# Row 0 of the raw Chinese sheet supplies the column names; the first three rows
# are then discarded and pairs containing a malformed row are filtered out.
df_zh.columns = df_zh.iloc[0]
df_zh = (
    df_zh.iloc[3:]
    .reset_index(drop=True)
    .pipe(drop_pairs)
    .reset_index(drop=True)
)

In [122]:
def f(df):
    """Collapse each consecutive pair of rows into a single wide row.

    For rows labelled ``i`` (even) and ``i + 1`` in ``df``:

    * ``idiom`` / ``meaning`` come from row ``i`` ('Idiom', 'Meaning');
    * ``s_f_brkt`` / ``t_f_brkt`` are row ``i``'s 'Source ' / 'Translation';
    * ``s_l_brkt`` / ``t_l_brkt`` are row ``i + 1``'s 'Source ' / 'Translation'.

    Note the trailing space in the 'Source ' column name. Lookups are by index
    label, so ``df`` is expected to carry a 0..n-1 index with an even length.
    NOTE(review): a later cell rebinds the name ``f`` to a different function.
    """
    out_cols = ['idiom', 'meaning', 's_f_brkt', 's_l_brkt', 't_f_brkt', 't_l_brkt']
    paired = [
        [
            df.loc[first, 'Idiom'],
            df.loc[first, 'Meaning'],
            df.loc[first, 'Source '],
            df.loc[first + 1, 'Source '],
            df.loc[first, 'Translation'],
            df.loc[first + 1, 'Translation'],
        ]
        for first in range(0, len(df), 2)
    ]
    return pd.DataFrame(paired, columns=out_cols)

# One wide row per row pair, for the Korean and Chinese sheets respectively.
df_ko_agg, df_zh_agg = f(df_ko), f(df_zh)

In [123]:
def f(df):
    """Collapse each consecutive pair of rows into one row of source sentences.

    For rows labelled ``i`` (even) and ``i + 1`` in ``df``:

    * ``idiom`` / ``meaning`` are taken from row ``i``;
    * ``s_f_brkt`` is the 'source' of row ``i`` when its 'label' is
      ``'figurative'``, otherwise the 'source' of row ``i + 1``;
    * ``s_l_brkt`` is the 'source' of row ``i + 1`` when its 'label' is
      ``'literal'``, otherwise the 'source' of row ``i``.

    Lookups are by index label, so ``df`` is expected to carry a 0..n-1 index
    with an even length.
    """
    collected = []
    for top in range(0, len(df), 2):
        bottom = top + 1

        # Pick which member of the pair supplies each sentence.
        if df.loc[top, 'label'] == 'figurative':
            fig_row = top
        else:
            fig_row = bottom
        if df.loc[bottom, 'label'] == 'literal':
            lit_row = bottom
        else:
            lit_row = top

        collected.append([
            df.loc[top, 'idiom'],
            df.loc[top, 'meaning'],
            df.loc[fig_row, 'source'],
            df.loc[lit_row, 'source'],
        ])

    out_cols = ['idiom', 'meaning', 's_f_brkt', 's_l_brkt']
    return pd.DataFrame(collected, columns=out_cols)
# Read the base sheet and immediately collapse its row pairs with the `f` defined just above.
base = f(pd.read_csv("base.csv"))

In [124]:
import re

def no_brkt(df, col_br, col_nobr):
    """Add column ``col_nobr``: the text of ``col_br`` with ``[[`` and ``]]`` removed.

    All ``[[`` markers are stripped first, then all ``]]`` markers. ``df`` is
    modified in place and the same object is returned.
    """
    cleaned = df[col_br]
    for marker in (r'\[\[', r'\]\]'):
        cleaned = cleaned.str.replace(marker, '', regex=True)
    df[col_nobr] = cleaned
    return df

def in_brkt(df, col_br, col_in_br):
    """Add column ``col_in_br``: the ``[[...]]`` contents of ``col_br``, space-joined.

    Each value of ``col_br`` is scanned for non-greedy ``[[...]]`` spans; the
    captured texts are joined with single spaces (empty string when there are
    none). ``df`` is modified in place and the same object is returned.
    """
    def _bracketed_text(text):
        spans = re.findall(r'\[\[(.*?)\]\]', text)
        return ' '.join(spans)

    df[col_in_br] = df[col_br].apply(_bracketed_text)
    return df

# Derive bracket-free source sentences (s_f, s_l) and the bracketed span of the
# s_f_brkt sentence (s_a). The helpers add columns to `base` in place.
base = (
    base
    .pipe(no_brkt, 's_f_brkt', 's_f')
    .pipe(no_brkt, 's_l_brkt', 's_l')
    .pipe(in_brkt, 's_f_brkt', 's_a')
)

In [110]:
# The helpers mutate their argument and return it, so df_ko_nobr ends up being
# the same object as df_ko_agg with the derived columns added.
df_ko_nobr = (
    df_ko_agg
    # source side
    .pipe(no_brkt, 's_f_brkt', 's_f')
    .pipe(no_brkt, 's_l_brkt', 's_l')
    .pipe(in_brkt, 's_f_brkt', 's_a')
    # translation side
    .pipe(no_brkt, 't_f_brkt', 't_f')
    .pipe(no_brkt, 't_l_brkt', 't_l')
    .pipe(in_brkt, 't_f_brkt', 't_f_a')
    .pipe(in_brkt, 't_l_brkt', 't_l_a')
)

In [125]:
# The helpers mutate their argument and return it, so df_zh_nobr ends up being
# the same object as df_zh_agg with the derived columns added.
df_zh_nobr = (
    df_zh_agg
    # source side
    .pipe(no_brkt, 's_f_brkt', 's_f')
    .pipe(no_brkt, 's_l_brkt', 's_l')
    .pipe(in_brkt, 's_f_brkt', 's_a')
    # translation side
    .pipe(no_brkt, 't_f_brkt', 't_f')
    .pipe(no_brkt, 't_l_brkt', 't_l')
    .pipe(in_brkt, 't_f_brkt', 't_f_a')
    .pipe(in_brkt, 't_l_brkt', 't_l_a')
)

In [112]:
# Attach each language's translation columns to the base sentences by idiom.
# The left join followed by dropna() discards every row that contains a NaN,
# which includes base idioms that have no match in that language.
df_ko_res = pd.merge(
    base,
    df_ko_nobr[['idiom', 't_f_brkt', 't_l_brkt', 't_f', 't_l', 't_f_a', 't_l_a']],
    how='left',
    on='idiom',
)
df_ko_res = df_ko_res.dropna().reset_index(drop=True)

df_zh_res = pd.merge(
    base,
    df_zh_nobr[['idiom', 't_f_brkt', 't_l_brkt', 't_f', 't_l', 't_f_a', 't_l_a']],
    how='left',
    on='idiom',
)
df_zh_res = df_zh_res.dropna().reset_index(drop=True)

In [113]:
# Keep only rows present in both language results, matching on the shared
# source-side columns; the overlapping translation columns receive the
# '_ko' / '_zh' suffixes. The result is written to inner.csv.
inner = df_ko_res.merge(
    df_zh_res,
    how='inner',
    on=['idiom', 'meaning', 's_f_brkt', 's_l_brkt', 's_a', 's_f', 's_l'],
    suffixes=('_ko', '_zh'),
)
inner.to_csv("inner.csv", index=False)