# Find new words in vocab list

In [12]:
import pandas as pd
from googletrans import Translator

## Step 1
Copy data from the vocab website of your choice and paste it into `new_data.csv`. Take copyright into account: only scrape when it is allowed.

## Step 2 Prepare old list and new list

In [13]:
# Known vocabulary: an Excel sheet whose "Term" column is read later on.
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# own file before running the notebook elsewhere.
old_vocab = pd.read_excel(r'C:\Users\konin\OneDrive\Ewout\Totaallijst Chinees.xlsx')

# Candidate vocabulary pasted from a website; its "Term" column is read later on.
# on_bad_lines='skip' silently drops rows the CSV parser cannot handle, which is
# useful with messy data such as pasted HTML.
new_vocab = pd.read_csv('new_data.csv', on_bad_lines='skip')

## Step 3 Define functions

In [14]:
def save_old_vocab_into_list(df):
    """Return the values of the ``Term`` column of ``df`` as a plain Python list.

    Args:
        df: DataFrame with a ``Term`` column.

    Returns:
        list: one entry per row, in row order.
    """
    term_column = df["Term"]
    known_terms = term_column.tolist()
    return known_terms

def save_new_vocab_into_list(df):
    """Extract the Chinese characters from every row of ``df["Term"]``.

    For each row, every character in the CJK Unified Ideographs block
    (U+4E00 through U+9FFF, both inclusive) is kept, in order of first
    appearance; everything else (Latin text, markup, punctuation) is dropped.

    Args:
        df: DataFrame with a ``Term`` column holding the raw, possibly messy text.

    Returns:
        list[str]: the distinct extracted strings, in order of first appearance.
        A row without any Chinese character yields ``""``, which therefore
        appears at most once in the result.

    Notes:
        * A character is kept only once per row, so a repeated character
          collapses (a row containing "谢谢" yields "谢").
          NOTE(review): presumably meant to de-duplicate rows that repeat the
          term -- confirm this is wanted for reduplicated words.
        * Non-string cells (e.g. NaN from empty CSV fields) are skipped.
    """
    topic_ch = []

    # Go through all the rows of the frame that was passed in (not a global).
    for current_topic_row in df["Term"]:
        # A float NaN cannot be iterated character by character.
        if not isinstance(current_topic_row, str):
            continue

        topic_row_ch = ""

        # Look at each character and keep it if it is Chinese and not yet kept.
        for char in current_topic_row:
            if u'\u4e00' <= char <= u'\u9fff' and char not in topic_row_ch:
                topic_row_ch += char

        # Add only strings that were not collected from an earlier row.
        if topic_row_ch not in topic_ch:
            topic_ch.append(topic_row_ch)

    return topic_ch

def find_abs_differences(new_word_list, old_word_list):
    """Return the new words that do not occur anywhere in the known vocabulary.

    A new word counts as known when it equals an old entry or is a substring
    of a single old entry (same characters, same order). Matching is done per
    old entry, so a word is never considered known merely because it straddles
    the boundary between two unrelated old entries.

    Args:
        new_word_list: iterable of candidate words.
        old_word_list: iterable of already-known entries.

    Returns:
        list[str]: the unknown words, de-duplicated, in the order in which
        they first appear in ``new_word_list``.

    Notes:
        Empty strings and non-string values (e.g. NaN from blank spreadsheet
        cells) are ignored on both sides, so no positional slicing is needed
        to get rid of an empty entry.
    """
    old_words = [entry for entry in old_word_list if isinstance(entry, str) and entry]

    unknown_words = []
    already_checked = set()
    for word in new_word_list:
        if not isinstance(word, str) or not word or word in already_checked:
            continue
        already_checked.add(word)

        # Substring test against each old entry separately.
        if not any(word in old_entry for old_entry in old_words):
            unknown_words.append(word)

    return unknown_words

def translate_new_vocab(abs_difference_list, translator=None):
    """Translate every term and return a ``{term: translation}`` dictionary.

    Args:
        abs_difference_list: iterable of terms to translate.
        translator: object exposing ``translate(term)`` whose result has a
            ``.text`` attribute. When omitted, a new googletrans
            ``Translator()`` is created, so the function no longer depends on
            a module-level ``translator`` variable having been defined first.

    Returns:
        dict: maps each term to the ``.text`` of its translation, using the
        translator's default settings. Duplicate terms collapse to one key.
    """
    if translator is None:
        translator = Translator()

    return {term: translator.translate(term).text for term in abs_difference_list}

## Step 4 Call functions to find new words

In [15]:
# Turn both data frames into plain word lists.
old_word_list = save_old_vocab_into_list(old_vocab)
new_word_list = save_new_vocab_into_list(new_vocab)

# Keep only the words from the new list that are not already known.
abs_difference_list = find_abs_differences(new_word_list, old_word_list)

## Step 5 Translate results

In [9]:
# Create the googletrans client, then translate every new word.
translator = Translator()
my_dict = translate_new_vocab(abs_difference_list)

## Step 6 Final result

In [10]:
# Show the new words together with their translations.
my_dict

{'水饺': 'Dumplings',
 '红烧肉': 'Saffron',
 '拼盘': 'Platter',
 '果汁': 'juice',
 '小费': 'tip',
 '蒜': 'garlic',
 '早餐': 'breakfast',
 '饱': 'full',
 '煮蛋': 'Cooked egg',
 '面馆': 'Noodle House',
 '味道': 'smell',
 '茶叶': 'tea',
 '盐': 'Salt',
 '泡咖啡': 'Coffee',
 '苦瓜': 'Momordica charantia',
 '百事可乐': 'Pepsi',
 '海鲜': 'seafood',
 '牛肉面': 'beef noodles',
 '春卷': 'Spring roll',
 '白饭': 'Rice',
 '煎蛋': 'Fried egg',
 '豌豆': 'pea',
 '煤气炉': 'gas stove',
 '回锅肉': 'Return meat',
 '豆浆': 'Soy milk',
 '蒸笼': 'Steamer',
 '炉子': 'stove',
 '番茄酱': 'ketchup',
 '吃饭': 'Have a meal',
 '咸': 'salty',
 '买单': 'Pay',
 '油腻': 'Greasy',
 '橘子': 'tangerinr',
 '茉莉花茶': 'Jasmine tea',
 '土豆': 'Potato',
 '奶昔': 'milkshake',
 '辣椒': 'chili',
 '豆腐': 'Tofu',
 '花菜': 'Cauliflower',
 '番茄': 'tomato',
 '烧烤': 'barbecue',
 '白菜': 'Chinese cabbage',
 '咖啡馆': 'coffee shop',
 '辣椒酱': 'chili sauce',
 '盐水鸭': 'Salted Duck',
 '红葡萄酒': 'red wine',
 '吃素的人': 'Vegetarian',
 '炖': 'stew',
 '零食': 'snack',
 '芦笋': 'asparagus',
 '好吃': 'good to eat',
 '洋葱': 'onion',
 '冷盘': 'Cold di

In [16]:
# Show the untranslated list of new words.
abs_difference_list

['水饺',
 '红烧肉',
 '早餐',
 '饱',
 '面馆',
 '味道',
 '茶叶',
 '盐',
 '泡咖啡',
 '苦瓜',
 '百事可乐',
 '海鲜',
 '牛肉面',
 '白饭',
 '豌豆',
 '煤气炉',
 '回锅肉',
 '番茄酱',
 '吃饭',
 '咸',
 '买单',
 '油腻',
 '橘子',
 '土豆',
 '奶昔',
 '豆腐',
 '花菜',
 '番茄',
 '烧烤',
 '白菜',
 '咖啡馆',
 '盐水鸭',
 '红葡萄酒',
 '吃素的人',
 '炖',
 '零食',
 '好吃',
 '洋葱',
 '冷盘',
 '花椒',
 '绿茶',
 '家常便饭',
 '拿手菜',
 '糖醋肉',
 '新鲜',
 '馊',
 '火锅',
 '服务员',
 '杏子',
 '生姜',
 '汤',
 '口渴',
 '青椒',
 '冰块',
 '雪碧',
 '菜单',
 '胡椒',
 '茄子',
 '中餐',
 '面包',
 '炸',
 '火腿',
 '粤菜',
 '尝',
 '芹菜',
 '番茄炒蛋',
 '吃不来',
 '蔬菜',
 '锅',
 '刀',
 '素菜',
 '油饼',
 '做饭',
 '饭厅',
 '豆芽',
 '胡萝卜',
 '勺子',
 '账单',
 '竹笋',
 '菠萝',
 '水果',
 '巧克力',
 '麻婆豆腐',
 '芝麻油',
 '电炉',
 '橙汁',
 '四季豆',
 '三明治',
 '闻',
 '包子',
 '糖果',
 '黄油',
 '玉米',
 '香菜',
 '馒头',
 '冰淇淋',
 '气味',
 '烤鸭',
 '锅贴',
 '木莓',
 '热饮',
 '粮食',
 '汉堡',
 '蜂蜜',
 '馄饨',
 '泡茶',
 '自己来',
 '西餐',
 '披萨',
 '四川菜',
 '白酒',
 '虾',
 '猪肉',
 '筷子',
 '锅铲',
 '食堂',
 '宫保鸡丁',
 '午餐',
 '难吃',
 '牛排',
 '椰子',
 '萝卜',
 '奶酪',
 '草莓',
 '炒面',
 '快餐',
 '喝醉',
 '请客',
 '晚餐',
 '餐厅',
 '荔枝',
 '油条',
 '花生米',
 '点菜',
 '汽水',
 '茅台',
 '食谱',
 '桃子',
 '餐馆',
 '饼干