In [241]:
import pandas as pd
import numpy as np

df_zh = pd.read_csv('zh.csv')
df_ko = pd.read_csv('ko.csv')

In [242]:
def mark_completed(df):
    # Create a vector to flag rows to keep
    completed = np.zeros(len(df), dtype=bool)
    # Iterate over pairs of rows
    for i in range(0, len(df), 2):
        # If either row in the pair has 'Malformed Brackets' == 'FALSE', set keep to False for both
        if ((df.loc[i, 'Malformed Brackets'] == 'FALSE') and (df.loc[i+1, 'Malformed Brackets'] == 'FALSE')) \
                or (df.loc[i, 'Discard'] == 'TRUE') or (df.loc[i+1, 'Discard'] == 'TRUE') \
                or (df.loc[i, 'Equivalent idiom'] == 'TRUE') or (df.loc[i+1, 'Equivalent idiom'] == 'TRUE'):
            completed[i] = True
            completed[i+1] = True
    # Apply the mask and return the filtered dataframe
    return df[completed]

In [243]:
df_ko.columns = df_ko.iloc[3]
df_ko = df_ko[6::].reset_index(drop=True)
df_ko = mark_completed(df_ko).reset_index(drop=True)

In [245]:
len(df_ko)

506

In [244]:
df_zh.columns = df_zh.iloc[0]
df_zh = df_zh[3::].reset_index(drop=True)
df_zh = mark_completed(df_zh).reset_index(drop=True)

In [184]:
def f(df):
    res = []
    for i in range(0, len(df), 2):
        equiv_idiom = df.loc[i, 'Equivalent idiom'] == 'TRUE' or df.loc[i + 1, 'Equivalent idiom'] == 'TRUE'
        discard = df.loc[i, 'Discard'] == 'TRUE' or df.loc[i + 1, 'Discard'] == 'TRUE'
        res.append([
            df.loc[i, 'Idiom'],             # idiom
            df.loc[i, 'Meaning'],           # meaning
            df.loc[i, 'Source '],            # s_f_brkt  
            'figurative', 
            df.loc[i, 'Translation'],       # t_f_brkt
            equiv_idiom,
            discard
        ])
        res.append([
            df.loc[i, 'Idiom'],             # idiom
            df.loc[i, 'Meaning'],           # meaning
            df.loc[i + 1, 'Source '],        # s_l_brkt
            'literal',
            df.loc[i + 1, 'Translation'],   # t_l_brkt
            equiv_idiom,
            discard
        ])
    return pd.DataFrame(res, 
            columns=['idiom', 'meaning', 'source', 'label', 'translation', 'equiv_idiom', 'discard'])

df_ko_agg = f(df_ko)
df_zh_agg = f(df_zh)

In [185]:
df_merged = pd.merge(df_ko_agg, df_zh_agg, how='outer', on=['idiom', 'meaning', 'source', 'label'], suffixes=['_ko', '_zh'])

In [186]:
df_base = pd.read_csv('base.csv')

In [187]:
df_true_merged = pd.merge(df_base, df_merged, how='outer', on=['idiom', 'meaning', 'label', 'source'])

In [188]:
df_true_merged[['equiv_idiom_zh', 'discard_zh', 'equiv_idiom_ko', 'discard_ko']] = df_true_merged[['equiv_idiom_zh', 'discard_zh', 'equiv_idiom_ko', 'discard_ko']].fillna(False)
df_true_merged = df_true_merged[(~df_true_merged['equiv_idiom_zh']) & (~df_true_merged['discard_zh'])
            & (~df_true_merged['equiv_idiom_ko']) & (~df_true_merged['discard_ko'])]

In [189]:
df_true_merged.reset_index(drop=True, inplace=True)

# Create the pairs of indices
pairs_indices = [(i, i+1) for i in range(0, len(df_true_merged), 2)]

# Select 50 random pairs of indices
np.random.seed(42)
selected_pairs = np.random.choice(len(pairs_indices), size=100, replace=False)

# Now, let's get the actual indices for these selected pairs
selected_indices = [idx for pair in selected_pairs for idx in pairs_indices[pair]]

# Select these indices from the dataframe
sample_df_true_merged = df_true_merged.loc[selected_indices]

In [206]:
df_merged.columns

Index(['idiom', 'meaning', 'source', 'label', 'translation_ko',
       'equiv_idiom_ko', 'discard_ko', 'translation_zh', 'equiv_idiom_zh',
       'discard_zh'],
      dtype='object')

# End

In [208]:
# You can perform a set difference with these two dataframes:
df_all = pd.merge(df_merged, sample_df_true_merged, how='outer', indicator=True, on=['idiom', 'meaning', 'label', 'source'])
df1 = df_all[df_all['_merge'] == 'left_only'].reset_index(drop=True).drop('_merge', axis=1)
df1 = df1.rename(columns={
    'translation_ko_x': 'translation_ko',
    'equiv_idiom_ko_x': 'equiv_idiom_ko', 
    'discard_ko_x': 'discard_ko', 
    'translation_zh_x': 'translation_zh',
    'equiv_idiom_zh_x': 'equiv_idiom_zh', 
    'discard_zh_x': 'discard_zh',
})[['idiom', 'meaning', 'source', 'label', 'translation_ko',
       'equiv_idiom_ko', 'discard_ko', 'translation_zh', 'equiv_idiom_zh',
       'discard_zh']]
df3 = sample_df_true_merged.copy()

In [210]:
def is_truthy(value):
    if pd.isna(value):
        return False
    else:
        return bool(value)
def compute(row):
    row['complete_ko'] = (is_truthy(row['translation_ko'])) \
        or row['equiv_idiom_ko'] or row['discard_ko']
    row['complete_zh'] = (is_truthy(row['translation_zh'])) \
        or row['equiv_idiom_zh'] or row['discard_zh']
    return row
df1[['equiv_idiom_zh', 'discard_zh', 'equiv_idiom_ko', 'discard_ko']] = \
    df1[['equiv_idiom_zh', 'discard_zh', 'equiv_idiom_ko', 'discard_ko']].fillna(False)
df1 = df1.apply(compute, axis=1)

In [211]:
# You can perform a set difference with these two dataframes:
df_all = pd.merge(df_base, pd.concat([df1, df3]), how='outer', indicator=True)
# The set difference (elements in df1 but not in df2)
df2 = df_all[df_all['_merge'] == 'left_only'].reset_index(drop=True).drop('_merge', axis=1)

In [213]:
def reverse_pairs(df):
    pairs = [df.iloc[i:i+2] for i in range(0, len(df), 2)]
    reversed_pairs = pairs[::-1]
    df_reversed = pd.concat(reversed_pairs).reset_index(drop=True)
    return df_reversed

df3 = reverse_pairs(df3)

In [215]:
df2[['equiv_idiom_ko', 'discard_ko', 'equiv_idiom_zh',
    'discard_zh', 'complete_ko', 'complete_zh']] = False
df3[['equiv_idiom_ko', 'discard_ko', 'equiv_idiom_zh', 
     'discard_zh', 'complete_ko', 'complete_zh']] = False

In [229]:
import random

def randomize_pairs(df):
    pairs = [df.iloc[i:i+2] for i in range(0, len(df), 2)]
    pairs_copy = pairs.copy()
    random.seed(42)
    random.shuffle(pairs_copy)
    df_shuffled = pd.concat(pairs_copy).reset_index(drop=True)
    return df_shuffled

In [224]:
df2

Unnamed: 0,idiom,meaning,label,source,translation_ko,equiv_idiom_ko,discard_ko,translation_zh,equiv_idiom_zh,discard_zh,complete_ko,complete_zh
0,end of story,there is nothing more to add to the matter un...,figurative,"I told him I wasn't interested, and [[that's t...",,False,False,,False,False,False,False
1,end of story,there is nothing more to add to the matter un...,literal,"She closed the book and said, '[[that's the en...",,False,False,,False,False,False,False
2,every nook and cranny,to look for something everywhere,figurative,"To find the perfect gift, [[they searched ever...",,False,False,,False,False,False,False
3,every nook and cranny,to look for something everywhere,literal,"During the deep cleaning, [[they searched ever...",,False,False,,False,False,False,False
4,face the music,receive punishment,figurative,"After making a mistake at work, [[they faced t...",,False,False,,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
413,tore your hair out,to be extremely agitated or distressed about s...,literal,You accidentally [[tore your hair out]] while ...,,False,False,,False,False,False,False
414,treasure trove,a priceless or valuable discovery,figurative,"When they found the old library, [[they discov...",,False,False,,False,False,False,False
415,treasure trove,a priceless or valuable discovery,literal,"While digging in the backyard, [[they discover...",,False,False,,False,False,False,False
416,turn back on,choose not to speak to,figurative,[[She turned back on the situation]] when she ...,,False,False,,False,False,False,False


In [237]:
df_two = pd.concat([df1, df2]).reset_index(drop=True)
df_two.sort_values(by='complete_zh', ascending=False, inplace=True, kind='stable')

# Split DataFrame
df_true = df_two[df_two['complete_zh'] == True]
df_false = df_two[df_two['complete_zh'] == False]

# Sort DataFrame with true values by 'idiom'
df_true.sort_values(by='idiom', inplace=True, kind='stable')

# Shuffle DataFrame with false values
df_false = randomize_pairs(df_false)

# Concatenate two DataFrames
df_two = pd.concat([df_true, df_false], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_true.sort_values(by='idiom', inplace=True, kind='stable')


In [240]:
df = pd.concat([df_two, df3], ignore_index=True)
df.to_csv("ko_zh_1024.csv", index=False)