# Data permutation and combination for Chinese Name

# Loading the dataset 

In [332]:
import pandas as pd
import numpy as np 
import itertools
import functools

data = pd.read_csv('https://raw.githubusercontent.com/hankcs/HanLP/master/data/dictionary/custom/%E4%BA%BA%E5%90%8D%E8%AF%8D%E5%85%B8.txt', header = None)
#data = pd.read_csv('https://raw.githubusercontent.com/fung1091/profile/master/worksample/combin.txt')
data.columns = ["a"]
#print (data.head())
len(data)

50190

# Data cleaning and transfer into dataframe

In [333]:
splitted = data['a'].apply(lambda x: pd.Series(list(x)))
splitted.columns = ['B'+str(x) for x in splitted.columns]
df2 = data.join(splitted)
df2 = pd.DataFrame(df2.iloc[1:50191, 1:4])


print(df2.head())
len (df2)

  B0 B1 B2
1  丁  一  宇
2  丁  一  平
3  丁  万  明
4  丁  世  伟
5  丁  世  芳


50189

In [334]:
# character of last name
#df3 = pd.DataFrame(df2.iloc[:, 0:1])
#df3 = df3['B0'].unique()
#df3 = pd.DataFrame({'B0':df3})
#print (df3.head())
#len (df3)
#print(df3.to_csv(r'last_name.txt', header=None, index=None, sep=' ', mode='a'))

## Extract the last name and count the frequency, then remove less than 5 

In [335]:
# character of last name
df3 = pd.DataFrame(df2.iloc[:, 0:1])
df3['count'] = 1
df3 = df3.groupby(['B0'], as_index=False).count()
df3 = df3.sort_values(by=['count'], ascending=False) # sort the largest value
df3 = df3[df3['count'] > 5] # remove the number less than 1
df3 = pd.DataFrame(df3.iloc[:, 0:1]) # remove counter column
print (df3.head())

#len (df3)
# Export dataframe to txt file
print(df3.to_csv(r'last_name.txt', header=None, index=None, sep=' ', mode='a'))

    B0
568  王
449  李
350  张
875  陈
126  刘
None


The number of first name:

In [336]:
len(df3)

528

## Extract first character of first name and count the frequency, then remove less than 12

In [337]:
# first character of first name
df4 = pd.DataFrame(df2.iloc[:, 1:2])
df4['count'] = 1
df4 = df4.groupby(['B1'], as_index=False).count()
df4 = df4.sort_values(by=['count'], ascending=False) # sort the largest value
df4 = df4[df4['count'] > 12] # remove the number less than 1
df4 = pd.DataFrame(df4.iloc[:, 0:1])
print (df4.head())
len (df4)

     B1
1010  晓
955   文
630   尔
751   建
628   小


679

## Extract second character of first name and count the frequency, then remove less than 12

#### Drop symbol function

In [338]:
# Drop symbol function 

def drop(df, column):

    
    #drop('·') & - symbol and empty 
    indexNames = df5[df5['B2'] == ' '].index
 
    # Delete these row indexes from dataFrame
    df5.drop(indexNames , inplace=True)

    indexNames1 = df5[df5['B2'] == '·'].index
 
    # Delete these row indexes from dataFrame
    df5.drop(indexNames1 , inplace=True)

    indexNames2 = df5[df5['B2'] == '-'].index
 
    # Delete these row indexes from dataFrame
    df5.drop(indexNames2 , inplace=True)

    indexNames2 = df5[df5['B2'] == '—'].index
 
    # Delete these row indexes from dataFrame
    df5.drop(indexNames2 , inplace=True)

In [339]:
# last name
drop(df3, 'B0')
len(df3)

528

In [340]:
# first character of first name
drop(df4, 'B1')
len(df4)

679

In [341]:
# second character of first name
drop(df5, 'B2')
len(df5)

524

In [342]:
if df5.empty:
    print('DataFrame is empty!')
else:
    print('DataFrame is no empty!')

DataFrame is no empty!


In [343]:
# check empty rows
def emptyrows(df):
    if df.empty:
        print('DataFrame is empty!')
    else:
        print('DataFrame is no empty!')
        
emptyrows(df3)
emptyrows(df4)
emptyrows(df5)


DataFrame is no empty!
DataFrame is no empty!
DataFrame is no empty!


In [344]:
Totalnum = len(df4)*len(df5)

print ("Total number of first name is {}".format(Totalnum))

Total number of first name is 355796


## Export to txt for translation

In [345]:
print(df3.to_csv(r'last.txt', header=None, index=None, sep=' ', mode='a'))
print(df4.to_csv(r'firstname_12_one.txt', header=None, index=None, sep=' ', mode='a'))
print(df5.to_csv(r'firstname_12_second.txt', header=None, index=None, sep=' ', mode='a'))

None
None
None


# Translate from simple chinese to traditional chinese
From txt file to txt file

In [346]:
from hanziconv import HanziConv
import pandas as pd

def Translate_value(txt_org, txt_new, num):
    with open(txt_org, 'r') as f:
        myNames = [line.strip() for line in f]
        str1 = ''.join(myNames)
        z = HanziConv.toTraditional(str1)
        x= num # last name = 1, first name =2
        res=[z[y-x:y] for y in range(x, len(z)+x,x)]
        with open(txt_new, 'w') as f:
            for item in res:
                f.write("%s\n" % item)

In [347]:
#last_name.txt
Translate_value('last.txt','last_TC.txt', 1)
#first character of first name
Translate_value('firstname_12_one.txt','firstname_12_one_TC.txt', 1)
#second character of first name
Translate_value('firstname_12_second.txt','firstname_12_second_TC.txt', 1)

In [348]:
d = {'last_TC.txt','firstname_12_one_TC.txt','firstname_12_second_TC.txt'}

for i in d:
    dffr = pd.read_csv(i, sep=" ", header=None,)
    print (dffr.head())

   0
0  曉
1  文
2  爾
3  建
4  小
   0
0  王
1  李
2  張
3  陳
4  劉
   0
0  華
1  斯
2  平
3  爾
4  明


# Testing accuracy

In [349]:
def Testing(txt_org, txt_new):
    # Convert to String
    with open(txt_org, 'r') as f:
        myNames2 = [line.strip() for line in f]
        str2 = ''.join(myNames2)
        #print (myNames2)
    with open(txt_new, 'r') as f:
        myNames3 = [line.strip() for line in f]
        str3 = ''.join(myNames3)
        #print (myNames3)
    print (HanziConv.same(str2, str3))
    compare_different(str2, str3)
    

In [350]:
# Compare strings
def compare_different(question_old, question_new):
    ALL_CHAR = set()
    if question_old != question_new:
        print("========")
        set_old = set(question_old)
        set_new = set(question_new)
        print(set_old-set_new)
        print(set_new-set_old)
        for element in set_new-set_old:
            ALL_CHAR.add(element)
        print("========")

In [351]:
# last_name
# 'last.txt','last_TC.txt'
Testing('last.txt','last_TC.txt')

True
{'畅', '韩', '纽', '毕', '劳', '钟', '孙', '闵', '内', '谈', '东', '寿', '余', '马', '阳', '宫', '岳', '银', '刘', '维', '麦', '汉', '库', '蔺', '晋', '诺', '宾', '纪', '里', '云', '蓝', '巩', '乐', '爱', '齐', '满', '邝', '姜', '单', '练', '钮', '铁', '达', '诸', '胡', '权', '蒋', '迟', '凤', '乌', '陆', '严', '陈', '费', '时', '赵', '松', '乔', '凯', '怀', '顾', '缪', '区', '庄', '琼', '别', '涂', '卜', '盖', '丛', '凌', '贾', '储', '荣', '赛', '绍', '约', '冯', '苏', '应', '纳', '汤', '叶', '宁', '杨', '连', '关', '温', '萧', '农', '伦', '邬', '杰', '于', '谢', '贡', '帅', '萨', '郁', '兴', '亚', '项', '邹', '蒙', '吕', '赖', '广', '庞', '吴', '习', '郑', '贝', '让', '经', '舍', '车', '卫', '欧', '华', '闻', '简', '向', '迈', '邓', '国', '谷', '许', '窦', '鲁', '钱', '娄', '闫', '丰', '范', '莱', '优', '谌', '才', '宝', '千', '尔', '荆', '兰', '泽', '贺', '万', '卢', '买', '强', '龙', '楼', '骆', '饶', '黄', '热', '党', '曲', '阎', '聂', '回', '克', '扎', '朴', '边', '师', '游', '扬', '奥', '门', '沈', '丽', '张', '罗', '鲍', '来', '朱', '阙', '励', '圣', '韦', '颜', '图', '谭', '龚', '玛'}
{'歐', '習', '許', '關', '維', '濛', '饒', '馬', '榖', '闕', '陽', '麯', '簡', '熱'

In [352]:
# first character of first name
# 'firstname_12_one.txt','firstname_12_one_TC.txt'
Testing('firstname_12_one.txt','firstname_12_one_TC.txt')

True
{'畅', '梦', '冬', '顿', '澜', '继', '纯', '劳', '钟', '锦', '内', '顺', '树', '东', '寿', '阳', '风', '马', '聪', '钧', '谦', '岳', '银', '维', '则', '麦', '远', '汉', '库', '悦', '晋', '诺', '园', '纪', '宾', '里', '云', '历', '乐', '济', '胜', '欢', '爱', '传', '为', '齐', '炜', '宪', '满', '韵', '铁', '达', '会', '权', '晓', '凤', '铭', '军', '显', '志', '瑶', '灵', '晖', '陈', '时', '松', '轶', '乔', '钰', '凯', '怀', '联', '鹏', '坚', '潇', '焕', '发', '艳', '咏', '庄', '琼', '娅', '圆', '彦', '兹', '卜', '导', '伟', '劲', '勋', '凌', '荣', '绍', '赛', '诚', '滨', '腾', '莲', '镇', '润', '隽', '苏', '绪', '应', '纳', '贤', '义', '宁', '叶', '杨', '献', '连', '农', '伦', '书', '洁', '鸿', '杰', '业', '从', '帅', '萨', '开', '兴', '刚', '亚', '礼', '桥', '静', '蒙', '轩', '广', '枫', '贝', '骏', '跃', '涛', '经', '辉', '卫', '钦', '华', '向', '冲', '长', '国', '莹', '谷', '红', '艺', '鲁', '剑', '灿', '姗', '丰', '莱', '贵', '钊', '才', '宝', '千', '尔', '峥', '家', '兰', '鸣', '合', '泽', '飞', '选', '岛', '钢', '贺', '万', '买', '强', '复', '当', '庆', '龙', '硕', '进', '学', '玛', '竞', '岗', '黄', '诗', '烨', '双', '启', '鹤', '克', '丝', '扎', '颖', '运', '芸', '秋',

In [353]:
# second character of first name
# 'firstname_12_second.txt','firstname_12_second_TC.txt'
Testing('firstname_12_second.txt','firstname_12_second_TC.txt')

True
{'顿', '冬', '纯', '钟', '锦', '内', '顺', '东', '寿', '余', '风', '阳', '聪', '马', '钧', '谦', '银', '维', '汉', '库', '远', '诺', '园', '宾', '里', '声', '云', '图', '乐', '济', '胜', '欢', '传', '为', '齐', '宪', '满', '达', '会', '权', '晓', '凤', '铭', '举', '军', '志', '瑶', '灵', '晖', '时', '松', '乔', '钰', '凯', '怀', '宽', '鹏', '坚', '璇', '焕', '发', '艳', '韬', '庄', '琼', '营', '娅', '圆', '彦', '兹', '盖', '伟', '勋', '凌', '荣', '诚', '滨', '莲', '镇', '苏', '纳', '贤', '龄', '义', '宁', '叶', '杨', '连', '农', '伦', '书', '洁', '鸿', '杰', '业', '帅', '萨', '开', '兴', '刚', '亚', '礼', '桥', '静', '蒙', '轩', '广', '妈', '贝', '跃', '涛', '俭', '辉', '卫', '钦', '华', '长', '国', '莹', '红', '称', '艺', '标', '鲁', '剑', '灿', '姗', '丰', '范', '莱', '贵', '贞', '钊', '才', '宝', '尔', '家', '兰', '鸣', '泽', '飞', '万', '强', '庆', '龙', '进', '学', '楼', '儿', '诗', '尧', '双', '启', '克', '丝', '侠', '扎', '颖', '乡', '运', '芸', '秋', '娇', '锋', '师', '扬', '奥', '岚', '丽', '罗', '逊', '仪', '来', '玮', '实', '圣', '栋', '韦', '征', '岭', '玛'}
{'為', '義', '維', '濛', '馬', '陽', '齡', '歡', '蓮', '瑩', '豐', '徵', '濤', '瓊', '餘', '瑪', '莊', '韋'

# 2nd testing

In [354]:
# last name 
a = 'last.txt'
b = 'last_TC.txt'

# first character of first name
#a = 'firstname_12_one.txt'
#b = 'firstname_12_one_TC.txt'

# second character of first name
#a = 'firstname_12_second.txt'
#b = 'firstname_12_second_TC.txt'


dff = pd.read_csv(a, sep=" ", header=None, names=["a"])
dff2 = pd.read_csv(b, sep=" ", header=None, names=["b"])


df_all = pd.concat([dff, dff2], axis='columns',)

def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
    
print_full(df_all.loc[df_all['a'] != df_all['b']])

     a  b
2    张  張
3    陈  陳
4    刘  劉
5    杨  楊
6    黄  黃
7    吴  吳
8    马  馬
9    赵  趙
12   朱  硃
14   胡  鬍
15   孙  孫
16   罗  羅
18   郑  鄭
23   谢  謝
24   许  許
26   苏  蘇
27   韩  韓
32   叶  葉
33   冯  馮
34   沈  瀋
39   吕  呂
43   蒋  蔣
46   邓  鄧
47   克  剋
54   卢  盧
55   贾  賈
61   姜  薑
63   余  餘
64   范  範
70   于  於
74   乔  喬
75   钟  鍾
76   陆  陸
80   华  華
83   万  萬
90   谭  譚
92   欧  歐
93   严  嚴
97   钱  錢
99   贺  賀
100  齐  齊
104  邹  鄒
108  温  溫
110  顾  顧
112  鲁  魯
113  奥  奧
115  凯  凱
116  萨  薩
117  韦  韋
118  汤  湯
120  龙  龍
121  杰  傑
123  龚  龔
130  麦  麥
131  约  約
135  贝  貝
139  亚  亞
144  玛  瑪
145  鲍  鮑
148  迈  邁
150  乌  烏
151  向  嚮
153  兰  蘭
154  赖  賴
159  蓝  藍
166  维  維
167  闫  閆
168  关  關
170  达  達
180  聂  聶
188  费  費
189  岳  嶽
190  庄  莊
192  莱  萊
199  连  連
200  萧  蕭
201  庞  龐
205  毕  畢
208  颜  顔
209  库  庫
213  朴  樸
216  蒙  濛
222  凌  淩
227  爱  愛
228  纪  紀
233  卫  衛
234  游  遊
235  谷  榖
239  扎  紮
242  宁  寜
246  诺  諾
248  盖  蓋
249  圣  聖
251  窦  竇
252  骆  駱
255  车  車
260  劳  勞
262  涂  塗
263  纳  納


# Change and Modify character

In [355]:
# last name
i = {"於","薑","剋","瀋","纔","裏","麯","韆","彆"}

# first character of first name
#i = {"誌","剋","裏","傢","鞦","嚮","鼕","淩","濛","纔","榖","蔔","蔔","蔔"}

# second character of first name
#i = {"剋","裏","鬆","纔","誌","鞦","餘","淩","鼕","濛","範"}

for c in i:
    df_all["b"] = df_all["b"].replace(c,df_all["a"],regex=True)
print_full(df_all)

     a  b
0    王  王
1    李  李
2    张  張
3    陈  陳
4    刘  劉
5    杨  楊
6    黄  黃
7    吴  吳
8    马  馬
9    赵  趙
10   周  周
11   林  林
12   朱  硃
13   徐  徐
14   胡  鬍
15   孙  孫
16   罗  羅
17   郭  郭
18   郑  鄭
19   金  金
20   阿  阿
21   何  何
22   高  高
23   谢  謝
24   许  許
25   宋  宋
26   苏  蘇
27   韩  韓
28   梁  梁
29   蔡  蔡
30   安  安
31   唐  唐
32   叶  葉
33   冯  馮
34   沈  沈
35   曹  曹
36   潘  潘
37   董  董
38   程  程
39   吕  呂
40   袁  袁
41   汪  汪
42   丁  丁
43   蒋  蔣
44   艾  艾
45   方  方
46   邓  鄧
47   克  克
48   肖  肖
49   白  白
50   杜  杜
51   魏  魏
52   卡  卡
53   江  江
54   卢  盧
55   贾  賈
56   彭  彭
57   夏  夏
58   田  田
59   雷  雷
60   崔  崔
61   姜  姜
62   曾  曾
63   余  餘
64   范  範
65   石  石
66   洪  洪
67   斯  斯
68   秦  秦
69   姚  姚
70   于  于
71   巴  巴
72   布  布
73   戴  戴
74   乔  喬
75   钟  鍾
76   陆  陸
77   梅  梅
78   伊  伊
79   廖  廖
80   华  華
81   史  史
82   拉  拉
83   万  萬
84   武  武
85   任  任
86   文  文
87   孟  孟
88   熊  熊
89   莫  莫
90   谭  譚
91   薛  薛
92   欧  歐
93   严  嚴
94   康  康
95   邱  邱
96   侯  侯
97   钱  錢
98   陶  陶


In [356]:
# Export last name to txt file

print(df_all['b'].to_csv(r'last_final.txt', header=None, index=None, sep=' ', mode='a'))

None


In [320]:
# Extract to first character of first name
#df_first1 = df_all[['b']]
#df_first1.rename(columns={'b': 'B1'}, inplace=True)# change column name
df_first1.head()
#len (df_first1)

Unnamed: 0,B1
0,曉
1,文
2,爾
3,建
4,小


In [328]:
# Extract to second character of first name
#df_first2 = df_all[['b']]
#df_first2.rename(columns={'b': 'B2'}, inplace=True)# change column name
df_first2.head()
#len (df_first2)

Unnamed: 0,B2
0,華
1,斯
2,平
3,爾
4,明


In [329]:
# Export txt file
print(df_first2.to_csv(r'first_name_12_1char.txt', header=None, index=None, sep=' ', mode='a'))

None


# Combination of first name

In [321]:
# function of combination
def cartesian(df1, df2):
    rows = itertools.product(df1.iterrows(), df2.iterrows())   
    #print (rows)
    df = pd.DataFrame(left.append(right) for (_, left), (_, right) in rows)
    print (rows)
    return df.reset_index(drop=True)

In [324]:
combined_firstname_final = functools.reduce(cartesian, [df_first1, df_first2])

<itertools.product object at 0x1467132d0>


In [325]:
print(combined_firstname_final.head())
len(combined_firstname_final)

  B1 B2
0  曉  華
1  曉  斯
2  曉  平
3  曉  爾
4  曉  明


355796

In [326]:
# Remove space between 2 character
combined_firstname_final['B3'] = combined_firstname_final.B1.str.cat(combined_firstname_final.B2)
combined_firstname_F = pd.DataFrame(combined_firstname_final.iloc[:, 2:3])
combined_firstname_F.head()

Unnamed: 0,B3
0,曉華
1,曉斯
2,曉平
3,曉爾
4,曉明


In [330]:
# Export txt file
print(combined_firstname_F.to_csv(r'first_name_12_2char.txt', header=None, index=None, sep=' ', mode='a'))

None


# End 

# Make new dataset for 2 character

In [95]:
b = 'firstname_12_only2Character_TC.txt'

dff = pd.read_csv(b, sep=" ", header=None, names=["B2"])
dff.head()

Unnamed: 0,B2
0,華
1,斯
2,平
3,爾
4,明


In [96]:
# Export last name if only 2 character of name
print(df4.to_csv(r'middle.txt', header=None, index=None, sep=' ', mode='a'))

None


In [84]:
#last_name.txt
Translate_value('middle.txt','middle_TC.txt', 1)

In [97]:
c = 'middle_TC.txt'

dffr = pd.read_csv(c, sep=" ", header=None, names=["B1"])
dffr.head()

Unnamed: 0,B1
0,曉
1,文
2,爾
3,建
4,小


In [98]:
combined_firstname_final = functools.reduce(cartesian, [dffr, dff])

<itertools.product object at 0x148fad828>


In [100]:
# Remove space between 2 character
combined_firstname_final['B3'] = combined_firstname_final.B1.str.cat(combined_firstname_final.B2)
combined_firstname_final = pd.DataFrame(combined_firstname_final.iloc[:, 2:3])
combined_firstname_final.head()

Unnamed: 0,B3
0,曉華
1,曉斯
2,曉平
3,曉爾
4,曉明


In [102]:
# Export txt file
print(combined_firstname_final.to_csv(r'first_name_12_final.txt', header=None, index=None, sep=' ', mode='a'))

None


In [103]:
len (combined_firstname_final)

354748

In [None]:
#combined_all = functools.reduce(cartesian, [df3, combined_firstname])

In [165]:
# try
#Testing('CtextTry.txt','CtextTry2.txt')

#last_name.txt
#Testing('last_name.txt','last_name_TC.txt')
# 'firstname_12_only2Character.txt','firstname_12_only2Character_TC.txt')
a = 'last_name.txt'
b = 'last_name_TC.txt'

dff = pd.read_csv(a, sep=" ", header=None, names=["a"])
dff2 = pd.read_csv(b, sep=" ", header=None, names=["b"])
#dff3 = pd.read_csv(b, sep=" ", header=None, names=["a"])

df_all = pd.concat([dff, dff2], axis='columns',)
#df_all = pd.concat([df, df2], 
                   #axis='columns', keys=['First', 'Second'])
#display(df_all)

In [166]:
#df_all = df_all[df_all["a"] != df_all["b"]]
#display(df_all)

In [304]:
def diff_df(df1, df2, how="left"):
    """
      Find Difference of rows for given two dataframes
      this function is not symmetric, means
            diff(x, y) != diff(y, x)
      however
            diff(x, y, how='left') == diff(y, x, how='right')

      Ref: https://stackoverflow.com/questions/18180763/set-difference-for-pandas/40209800#40209800
    """
    if (df1.columns != df2.columns).any():
        raise ValueError("Two dataframe columns must match")

    if df1.equals(df2):
        return None
    elif how == 'right':
        return pd.concat([df2, df1, df1]).drop_duplicates(keep=False)
    elif how == 'left':
        return pd.concat([df1, df2, df2]).drop_duplicates(keep=False)
    else:
        raise ValueError('how parameter supports only "left" or "right keywords"')

In [305]:
diff_df(dff, dff3)

Unnamed: 0,a
0,华
3,尔
7,生
8,军
10,东
12,伟
13,龙
14,峰
15,强
16,辉


# first - first name
志  誌
克  剋
冬  鼕
凌  淩
蒙  濛
卜  蔔
千  韆

In [128]:
df_all['a'].isin(df_all['b'])

0    False
1    False
2    False
Name: a, dtype: bool

In [133]:
# Convert to String
# 'last_name.txt','last_name_TC.txt'

with open('last_name.txt', 'r') as f:
    myNames2 = [line.strip() for line in f]
    str2 = ''.join(myNames2)
    #print (myNames2)
with open('last_name_TC.txt', 'r') as f:
    myNames3 = [line.strip() for line in f]
    str3 = ''.join(myNames3)
    #print (myNames3)
HanziConv.same(str2, str3)
compare_different(str2, str3)

{'畅', '韩', '纽', '毕', '劳', '钟', '孙', '闵', '内', '谈', '东', '寿', '余', '马', '阳', '宫', '岳', '银', '刘', '维', '麦', '汉', '库', '蔺', '晋', '诺', '宾', '纪', '里', '云', '蓝', '巩', '乐', '爱', '齐', '满', '邝', '姜', '单', '练', '钮', '铁', '达', '诸', '权', '蒋', '迟', '凤', '乌', '陆', '严', '陈', '费', '时', '赵', '松', '乔', '凯', '怀', '顾', '缪', '区', '庄', '琼', '别', '涂', '卜', '盖', '丛', '凌', '贾', '储', '荣', '赛', '绍', '约', '冯', '苏', '应', '纳', '汤', '叶', '宁', '杨', '连', '关', '温', '萧', '农', '伦', '邬', '杰', '于', '谢', '贡', '帅', '萨', '郁', '兴', '亚', '项', '邹', '蒙', '吕', '赖', '广', '庞', '吴', '习', '郑', '贝', '让', '经', '舍', '车', '卫', '欧', '华', '闻', '简', '向', '迈', '邓', '国', '谷', '许', '窦', '鲁', '钱', '娄', '闫', '丰', '范', '莱', '优', '谌', '才', '宝', '千', '尔', '荆', '兰', '泽', '贺', '万', '卢', '买', '强', '龙', '楼', '骆', '饶', '黄', '热', '党', '曲', '阎', '聂', '回', '扎', '朴', '边', '师', '游', '扬', '奥', '门', '沈', '丽', '张', '罗', '鲍', '来', '阙', '励', '圣', '韦', '颜', '图', '谭', '龚', '玛'}
{'歐', '習', '許', '關', '維', '濛', '饒', '馬', '榖', '闕', '陽', '麯', '簡', '熱', '蔔', '暢', '區', '陸'

In [134]:
set_old = set(str2)
set_new = set(str3)

In [135]:
print(set_old-set_new)
ALL_CHAR = set()
for element in set_new-set_old:
    ALL_CHAR.add(element)

{'畅', '韩', '纽', '毕', '劳', '钟', '孙', '闵', '内', '谈', '东', '寿', '余', '马', '阳', '宫', '岳', '银', '刘', '维', '麦', '汉', '库', '蔺', '晋', '诺', '宾', '纪', '里', '云', '蓝', '巩', '乐', '爱', '齐', '满', '邝', '姜', '单', '练', '钮', '铁', '达', '诸', '权', '蒋', '迟', '凤', '乌', '陆', '严', '陈', '费', '时', '赵', '松', '乔', '凯', '怀', '顾', '缪', '区', '庄', '琼', '别', '涂', '卜', '盖', '丛', '凌', '贾', '储', '荣', '赛', '绍', '约', '冯', '苏', '应', '纳', '汤', '叶', '宁', '杨', '连', '关', '温', '萧', '农', '伦', '邬', '杰', '于', '谢', '贡', '帅', '萨', '郁', '兴', '亚', '项', '邹', '蒙', '吕', '赖', '广', '庞', '吴', '习', '郑', '贝', '让', '经', '舍', '车', '卫', '欧', '华', '闻', '简', '向', '迈', '邓', '国', '谷', '许', '窦', '鲁', '钱', '娄', '闫', '丰', '范', '莱', '优', '谌', '才', '宝', '千', '尔', '荆', '兰', '泽', '贺', '万', '卢', '买', '强', '龙', '楼', '骆', '饶', '黄', '热', '党', '曲', '阎', '聂', '回', '扎', '朴', '边', '师', '游', '扬', '奥', '门', '沈', '丽', '张', '罗', '鲍', '来', '阙', '励', '圣', '韦', '颜', '图', '谭', '龚', '玛'}


In [136]:
ALL_CHAR

{'亞',
 '來',
 '倫',
 '傑',
 '優',
 '儲',
 '內',
 '凱',
 '劉',
 '勞',
 '勵',
 '區',
 '叢',
 '吳',
 '呂',
 '喬',
 '單',
 '嚮',
 '嚴',
 '國',
 '圖',
 '塗',
 '壽',
 '奧',
 '婁',
 '孫',
 '宮',
 '寜',
 '寶',
 '嶽',
 '帥',
 '師',
 '庫',
 '廣',
 '張',
 '強',
 '彆',
 '愛',
 '應',
 '懷',
 '捨',
 '揚',
 '於',
 '時',
 '晉',
 '暢',
 '東',
 '楊',
 '榖',
 '榮',
 '樂',
 '樓',
 '樸',
 '權',
 '歐',
 '淩',
 '湯',
 '溫',
 '滿',
 '漢',
 '澤',
 '濛',
 '瀋',
 '烏',
 '熱',
 '爾',
 '瑪',
 '瓊',
 '畢',
 '盧',
 '竇',
 '範',
 '簡',
 '紀',
 '約',
 '納',
 '紐',
 '紮',
 '紹',
 '經',
 '維',
 '練',
 '繆',
 '纔',
 '羅',
 '習',
 '聖',
 '聞',
 '聶',
 '興',
 '荊',
 '莊',
 '華',
 '萊',
 '萬',
 '葉',
 '蓋',
 '蔔',
 '蔣',
 '蕭',
 '薑',
 '薩',
 '藍',
 '藺',
 '蘇',
 '蘭',
 '衛',
 '裏',
 '許',
 '談',
 '諶',
 '諸',
 '諾',
 '謝',
 '譚',
 '讓',
 '豐',
 '貝',
 '貢',
 '買',
 '費',
 '賀',
 '賈',
 '賓',
 '賴',
 '賽',
 '趙',
 '車',
 '農',
 '迴',
 '連',
 '遊',
 '達',
 '遲',
 '邁',
 '邊',
 '鄒',
 '鄔',
 '鄧',
 '鄭',
 '鄺',
 '鈕',
 '銀',
 '錢',
 '鍾',
 '鐵',
 '門',
 '閆',
 '閔',
 '閻',
 '闕',
 '關',
 '陳',
 '陸',
 '陽',
 '雲',
 '鞏',
 '韆',
 '韋',
 '韓',
 '項',
 '顔',
 '顧',
 '餘',
 '饒',
 '馬',
 '馮'

In [None]:
# Compare strings
def compare_different(question_old, question_new):
    ALL_CHAR = set()
    if question_old != question_new:
        print("========")
        set_old = set(question_old)
        set_new = set(question_new)
        print(set_old-set_new)
        print(set_new-set_old)
        for element in set_new-set_old:
            ALL_CHAR.add(element)
        print("========")

In [None]:
# Compare strings
compare_different(question_old, question_new):


In [None]:
#df4 = pd.DataFrame(df2.iloc[:, 1:2])
#df5 = pd.DataFrame(df2.iloc[:, 2:3])
#print (df4.head())
#print (df5.head())
#df4 = df4['B1'].unique()
#df8 = pd.DataFrame({'B1':df4})
#print (df8)
#df5 = df5['B2'].unique()
#df9 = pd.DataFrame({'B2':df5})
#print (df9)

In [None]:
def cartesian(df1, df2):
    rows = itertools.product(df1.iterrows(), df2.iterrows())   
    #print (rows)
    df = pd.DataFrame(left.append(right) for (_, left), (_, right) in rows)
    print (rows)
    return df.reset_index(drop=True)

In [None]:
#combined = functools.reduce(cartesian, [df7, df8, df9])

In [None]:
len (combined)

In [None]:
print(combined.tail())

# Compare dataframes

In [137]:
def diff_df(df1, df2, how="left"):
    """
      Find Difference of rows for given two dataframes
      this function is not symmetric, means
            diff(x, y) != diff(y, x)
      however
            diff(x, y, how='left') == diff(y, x, how='right')

      Ref: https://stackoverflow.com/questions/18180763/set-difference-for-pandas/40209800#40209800
    """
    if (df1.columns != df2.columns).any():
        raise ValueError("Two dataframe columns must match")

    if df1.equals(df2):
        return None
    elif how == 'right':
        return pd.concat([df2, df1, df1]).drop_duplicates(keep=False)
    elif how == 'left':
        return pd.concat([df1, df2, df2]).drop_duplicates(keep=False)
    else:
        raise ValueError('how parameter supports only "left" or "right keywords"')

In [None]:
def diff_df(df1, df2, how="left"):
    """
      Find Difference of rows for given two dataframes
      this function is not symmetric, means
            diff(x, y) != diff(y, x)
      however
            diff(x, y, how='left') == diff(y, x, how='right')

      Ref: https://stackoverflow.com/questions/18180763/set-difference-for-pandas/40209800#40209800
    """
    if (df1.columns != df2.columns).any():
        raise ValueError("Two dataframe columns must match")

    if df1.equals(df2):
        return None
    elif how == 'right':
        return pd.concat([df2, df1, df1]).drop_duplicates(keep=False)
    elif how == 'left':
        return pd.concat([df1, df2, df2]).drop_duplicates(keep=False)
    else:
        raise ValueError('how parameter supports only "left" or "right keywords"')

In [116]:
import sys
if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO

DF1 = StringIO("""id   Name   score                    isEnrolled           Comment
111  Jack   2.17                     True                 "He was late to class"
112  Nick   1.11                     False                "Graduated"
113  Zoe    NaN                     True                  " "
""")
DF2 = StringIO("""id   Name   score                    isEnrolled           Comment
111  Jack   2.17                     True                 "He was late to class"
112  Nick   1.21                     False                "Graduated"
113  Zoe    NaN                     False                "On vacation" """)
df1 = pd.read_table(DF1, sep='\s+', index_col='id')
df2 = pd.read_table(DF2, sep='\s+', index_col='id')
diff_df(df1, df2)

Unnamed: 0_level_0,Name,score,isEnrolled,Comment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
112,Nick,1.11,False,Graduated
113,Zoe,,True,


# Translate Simple chinese to traditional chinese

In [None]:
from hanziconv import HanziConv
import pandas as pd
print(HanziConv.toSimplified('繁簡轉換器'))
print(HanziConv.toTraditional('繁简转换器'))
HanziConv.same('繁簡轉換器', '繁简转换器')

In [None]:
data1 = pd.read_csv('https://raw.githubusercontent.com/fung1091/profile/master/worksample/combin.txt')
data1.columns = ["a"]
print (data1.head())

In [None]:
HanziConv.toTraditional(data1)

In [None]:
## Translate from simple chinese to traditional chinese by using list

In [None]:
with open('CtextTry.txt', 'r') as f:
    myNames = [line.strip() for line in f]
    print (myNames)

In [None]:
myNames

In [None]:
str1 = ''.join(myNames)
str1

In [None]:
z = HanziConv.toTraditional(str1)
z

In [None]:
import re
y = [x for x in re.split(r'(\w{5})', x) if x]
y

In [None]:
HanziConv.same(str1, x)

In [None]:
def itersplit_into_x_chunks(string,x=10): # we assume here that x is an int and > 0
    size = len(string)
    chunksize = size//x
    for pos in range(0, size, chunksize):
        yield string[pos:pos+chunksize]

In [None]:
yy = list(itersplit_into_x_chunks(z,x=3))
yy

In [None]:
z
x = 5
for y in range(x, len(z)+x,x):
    print (y)

In [None]:

x=5 
res=[z[y-x:y] for y in range(x, len(z)+x,x)]
print(res)

In [None]:
HanziConv.same(myNames, res)

In [None]:
HanziConv.same(myNames, yy)

In [None]:
myNames == myNames

In [None]:
with open('CtextTry2.txt', 'w') as f:
    for item in res:
        f.write("%s\n" % item)

In [None]:
with open("CtextTry1.txt", "w") as output:
    output.write(str(res))

In [None]:
with open('CtextTry.txt', 'r') as f:
    myNames2 = [line.strip() for line in f]
    print (myNames2)

In [None]:
with open('CtextTry2.txt', 'r') as f:
    myNames3 = [line.strip() for line in f]
    print (myNames3)

In [None]:
str2 = ''.join(myNames2)
str3 = ''.join(myNames3)

In [None]:
HanziConv.same(str2, str3)