# Extracting the Data for this app

In [2]:
#!conda install stopwordsiso

In [3]:
from pathlib import Path

import pandas as pd
import numpy as np
import sqlite3

# NLP
import jieba
import stopwordsiso #

In [4]:
DATA_DIR = Path('/home/jentlejames/Projects/Data/Chinese Automation/data')

In [5]:
DATA_DIR

PosixPath('/home/jentlejames/Projects/Data/Chinese Automation/data')

In [6]:
conn = sqlite3.connect('../db/ccrs.db')

# Characters

In [7]:
df_hanzi = pd.read_csv(DATA_DIR/'extracted'/'uniqueCharacters.csv',index_col=0)
df_hanzi['hanzi_index'] = df_hanzi.index + 1_000_000

In [8]:
# Filter by column
hanziColumns = ['char','cumulativeRawFrequency','kMandarin','English','kTotalStrokes','hanzi_index']
df_hanzi = df_hanzi[hanziColumns].copy()

# Rename for sql column standard
df_hanzi.columns = ['hanzi','raw_frequency','pinyin','definition','stroke_count','hanzi_index']
df_hanzi.sample(3)

Unnamed: 0,hanzi,raw_frequency,pinyin,definition,stroke_count,hanzi_index
1560,昏,94.970535,hūn,muddle-headed/twilight/to faint/to lose consci...,8,1001560
7916,禜,99.998307,yǒng,,15,1007916
4652,痿,99.879532,wěi,atrophy,13,1004652


In [9]:
df_hanzi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9933 entries, 0 to 9932
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   hanzi          9933 non-null   object 
 1   raw_frequency  9933 non-null   float64
 2   pinyin         9931 non-null   object 
 3   definition     6224 non-null   object 
 4   stroke_count   9933 non-null   object 
 5   hanzi_index    9933 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 543.2+ KB


In [13]:
df_hanzi.iloc[400]

hanzi                        深
raw_frequency        71.059093
pinyin                    shēn
definition       deep/profound
stroke_count                11
hanzi_index            1000400
Name: 400, dtype: object

In [10]:
# cleaning up some ambivalence to convert to int type
df_hanzi[df_hanzi['stroke_count'].str.contains(' ')] = 9


df_hanzi.stroke_count.astype(np.int16)

0        8
1        1
2        9
3        4
4        2
        ..
9928    20
9929    22
9930    23
9931    14
9932    16
Name: stroke_count, Length: 9933, dtype: int16

In [11]:
df_hanzi.to_sql('hanzi_info',conn,if_exists='replace',index=df_hanzi['hanzi_index'])

DatabaseError: Execution failed on sql 'DROP TABLE "hanzi_info"': database is locked

In [None]:
test_hanzi = pd.read_sql('SELECT * FROM hanzi_info', conn, index_col='hanzi_index')
test_hanzi.sample(3)

Unnamed: 0_level_0,hanzi,raw_frequency,pinyin,definition,stroke_count
hanzi_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001817,廉,96.376082,lián,incorrupt/inexpensive,13
1006055,織,99.980609,zhī,,18
1006682,箜,99.992575,kōng,ancient string music instrument,14


In [None]:
df_hanzi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9933 entries, 0 to 9932
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   hanzi          9933 non-null   object 
 1   raw_frequency  9933 non-null   float64
 2   pinyin         9931 non-null   object 
 3   definition     6224 non-null   object 
 4   stroke_count   9933 non-null   object 
 5   hanzi_index    9933 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 543.2+ KB


# Radicals

In [None]:
df_radicals = pd.read_csv(DATA_DIR/'extracted'/'Radicals.csv',index_col=0)

In [None]:
# Filter



#### Objective 2: Meaning and definitions

In [None]:
# English Category Clean up 
# Remove whitespace formatting
df_radicals['english']  = df_radicals.english.str.replace('\xad','')

# Cleaning up the list of definitions 

# Needs to deal with the nested list, expanding it out into a table
df_radicals['Meaning'] = df_radicals['kDefinition'].str.split(';')
df_meaning = df_radicals['Meaning'].apply(pd.Series).copy()
df_meaning.head()

Unnamed: 0,0,1,2,3,4
0,one,"a, an",alone,,
1,number one,line,Kangxi radical 2,,
2,line,Kangxi radical 4,,,
3,second,2nd heavenly stem,,,
4,hook,Kangxi radical 6,,,


In [None]:
# Removing the extra information about Kangxi Radicals
for i in range(5):
    df_meaning[i] = np.where(df_meaning[i].str.contains('Kangxi'),np.NaN,df_meaning[i])
    df_meaning[i] = df_meaning[i].str.strip()

In [None]:
df_radicals.head()

Unnamed: 0,number,radical,variants,simplifiedradical,pinyin,english,strokecount,char,ucn,kDefinition,Meaning
0,1,一,,,yi1,one,1,一,U+4E00,"one; a, an; alone","[one, a, an, alone]"
1,2,丨,,,gun3,line,1,丨,U+4E28,number one; line; Kangxi radical 2,"[number one, line, Kangxi radical 2]"
2,4,丿,"乀 (fu2), 乁(yi2)",,pie3,slash,1,丿,U+4E3F,line; Kangxi radical 4,"[line, Kangxi radical 4]"
3,5,乙,"乚 (yin3), 乛",,yi4,second,1,乙,U+4E59,second; 2nd heavenly stem,"[second, 2nd heavenly stem]"
4,6,亅,,,jue2,hook,1,亅,U+4E85,hook; Kangxi radical 6,"[hook, Kangxi radical 6]"


In [None]:
# This line is to unpack the definitions even further, with the goal of
# unpacking the nested lists inside of the nested lists

# Populating an empty array
df_meaning['idx'] = np.NaN

# Recurses through each column, adding where it iis found  
for i in range(5):
    df_meaning['idx'] = np.where(df_radicals['english'] == df_meaning[i],i,df_meaning['idx'])


# Checking for redundant definitions
secondaryCheckIdx = df_meaning['idx'].isnull()



In [None]:
df_meaning['english'] = df_radicals['english']

In [None]:
# Unpacking Level 2 nested list of definitions, checking for matches
#df_meaning[df_meaning[4].str.contains(',') == True]


commaMeanings0 = df_meaning[secondaryCheckIdx][0].str.split(', | or ').apply(pd.Series)
#print(commaMeanings0.shape[1])
commaMeanings1 = df_meaning[secondaryCheckIdx][1].str.split(', | or ').apply(pd.Series)
#print(commaMeanings1.shape[0])
# Merging two nested lists together in order to check for matching words that indicate redudant information 
commaMeanings = pd.merge(commaMeanings0,commaMeanings1,how='outer',on=commaMeanings0.index).drop('key_0',axis=1)

# Makes possible to iterate through each
commaMeanings.columns = range(commaMeanings.shape[1])

commaMeanings['single_word_def_is_redundant'] = np.NaN
commaMeanings['english'] = df_meaning[secondaryCheckIdx].english.reset_index(drop=True)

for i in range(commaMeanings.shape[1] -2 ): # -2 for index column and english column
    commaMeanings['single_word_def_is_redundant'] = np.where(commaMeanings['english'] == commaMeanings[i], i, commaMeanings['single_word_def_is_redundant'])

commaMeanings['merge_idx'] =  df_meaning[secondaryCheckIdx].index

In [None]:
df_meaning = pd.merge(df_meaning,commaMeanings[['merge_idx','single_word_def_is_redundant']],how='left',left_on=df_meaning.index,right_on='merge_idx').drop('merge_idx',axis=1)
df_meaning.shape

(214, 8)

In [None]:
df_meaning['english'] = np.where(df_meaning['single_word_def_is_redundant'].isnull() & df_meaning['idx'].isnull(),df_meaning['english'],np.NaN)

In [None]:
df_meaning = df_meaning[['english',0,1,2,3,4]]

In [None]:
df_meaning

Unnamed: 0,english,0,1,2,3,4
0,,one,"a, an",alone,,
1,,number one,line,,,
2,slash,line,,,,
3,,second,2nd heavenly stem,,,
4,,hook,,,,
...,...,...,...,...,...,...
209,,"even, uniform, of equal length",,,,
210,tooth,teeth,"gears, cogs",age,,
211,,dragon,,,,
212,,turtle or tortoise,cuckold,,,


In [None]:
df_radicals['Meaning'] = df_meaning.apply(lambda x: ', '.join(x.dropna()), axis=1)
df_radicals['Meaning'] = '[' + df_radicals['Meaning'] + ']'
df_radicals.drop(['kDefinition','english'],axis=1,inplace=True)

### Extract Variants

In [None]:
radical_variants =  df_radicals['variants']#.str.split(',').dropna()
radical_variants =  radical_variants.str.replace('\([a-z1-4]*\)','',regex=True).dropna()
radical_variants.str.replace('\s?,\s?', ',',regex=True)
radical_variants_unique =  radical_variants.str.split(',').apply(pd.Series).copy()

radical_variants_unique = pd.concat([radical_variants_unique[0],radical_variants_unique[1]],axis=0).dropna()

#### merging traditional and simplified radicals

In [None]:
df_radicals

Unnamed: 0,number,radical,variants,simplifiedradical,pinyin,strokecount,char,ucn,Meaning
0,1,一,,,yi1,1,一,U+4E00,"[one, a, an, alone]"
1,2,丨,,,gun3,1,丨,U+4E28,"[number one, line]"
2,4,丿,"乀 (fu2), 乁(yi2)",,pie3,1,丿,U+4E3F,"[slash, line]"
3,5,乙,"乚 (yin3), 乛",,yi4,1,乙,U+4E59,"[second, 2nd heavenly stem]"
4,6,亅,,,jue2,1,亅,U+4E85,[hook]
...,...,...,...,...,...,...,...,...,...
209,210,齊,,齐,qi2,14,齊,U+9F4A,"[even, uniform, of equal length]"
210,211,齒,,齿,chi3,15,齒,U+9F52,"[tooth, teeth, gears, cogs, age]"
211,212,龍,,龙,long2,16,龍,U+9F8D,[dragon]
212,213,龜,,龟,gui1,16,龜,U+9F9C,"[turtle or tortoise, cuckold]"


In [None]:
df_radicals['simplifiedradical'].fillna(df_radicals['radical'],inplace=True)

In [None]:
df_radicals['simplifiedradical'].isnull().sum()

0

In [None]:
# Collecting instances where there is a traditional radical

df_radicals['traditional'] = np.where(df_radicals['simplifiedradical'] != df_radicals['radical'],df_radicals['radical'],np.NaN)

In [None]:
df_radicals.columns

Index(['number', 'radical', 'variants', 'simplifiedradical', 'pinyin',
       'strokecount', 'char', 'ucn', 'Meaning', 'traditional'],
      dtype='object')

In [None]:
df_radicals.sample(3)

Unnamed: 0,number,radical,variants,simplifiedradical,pinyin,strokecount,char,ucn,Meaning,traditional
28,29,又,,又,you4,2,又,U+53C8,"[and, also, again, in addition]",
137,139,色,,色,se4,6,色,U+8272,"[color, tint, hue, shade, form, body, beauty, ...",
68,69,斤,,斤,jin1,4,斤,U+65A4,"[axe, a catty (approximately 600 g), an axe, k...",


In [None]:
df_radicals.drop(['simplifiedradical','char'],axis=1,inplace=True)

In [None]:
df_radicals.columns =['radical_number', 'radical', 'variants', 'pinyin', 'stroke_count', 'ucn',
       'meaning', 'traditional']

In [None]:
df_radicals['radical_index'] = df_radicals.index + 100_000
df_radicals.to_sql('radicals',conn,if_exists='replace',index=df_radicals['radical_index'])

214

In [None]:
df_test_radicals =  pd.read_sql('SELECT * FROM radicals', conn, index_col='radical_index')
df_test_radicals.sample(3)


Unnamed: 0_level_0,radical_number,radical,variants,pinyin,stroke_count,ucn,meaning,traditional
radical_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100213,214,龠,,yue4,17,U+9FA0,"[flute, pipe, ancient measure]",
100034,36,夕,,xi1,3,U+5915,"[evening, night, dusk, slanted]",
100084,86,火,灬,huo3,4,U+706B,"[fire, flame, burn, anger, rage]",


## Dictionary Words

df_cedict = pd.read_

In [None]:
df_cedict = pd.read_csv(DATA_DIR/'extracted'/'ce_dict.csv')
df_cedict.sample(3)

Unnamed: 0,traditional,simplified,pinyin,english
8193,信神者,信神者,xin4 shen2 zhe3,a believer
98880,變為,变为,bian4 wei2,to change into
107156,金匯兌本位制,金汇兑本位制,jin1 hui4 dui4 ben3 wei4 zhi4,gold exchange standard (economics)


In [None]:
df_cedict['cedict_index'] = df_cedict.index + 2_000_000

In [12]:
df_cedict.to_sql('ce_dictionary',conn,if_exists='replace',index=df_cedict['cedict_index'])

NameError: name 'df_cedict' is not defined

In [186]:
df_test_cedict =  pd.read_sql('SELECT * FROM ce_dictionary', conn, index_col='cedict_index')


In [187]:
df_test_cedict.columns

Index(['traditional', 'simplified', 'pinyin', 'english'], dtype='object')

## HSK (used to add statistics and remove stopwords)

In [108]:
df_hsk = pd.read_csv(DATA_DIR/'HSK Standard Course 1-6-Table 1.csv')

In [109]:
df_hsk.sample(3)

Unnamed: 0,Id,Traditional,Simplified,English,HSK,HSK 5（二）词语搭配,Img,Txt,Pinyin,Explanation,...,Alternative,Grammar Reference,Song Lyrics,Song YouTube,Song Pinyin,Song Translation,Example Pinyin,Length,Character Phrase,Instagram Image Created
1857,1859,現實,现实,reality,5,,False,False,xiàn shí,现实，就是 reality。,...,,,,,,,tā bìxū zhèngshì shīyè zhè yī xiànshí,2,,
4902,4905,屏障,屏障,protective screen,6,,False,False,píng zhàng,屏障，就是防护屏。,...,,,,,,,xīn shuǐbà shì jiānglái dǐyù hóngshuǐ de píngz...,2,,
2194,2196,單調,单调,"monotonous, dull",5,20.0,False,False,dān diào,单调，就是单调乏味。,...,,,,,,,zhè liǎng gè jìjié zhījiān de rìzi shì fēichán...,2,,


## Example Sentences 

In [110]:
df_sentences = pd.read_csv(DATA_DIR/'sentences.tsv',sep='\t')
df_sentences.shape


(18896, 5)

In [111]:
df_sentences.columns = ['Characters', 'Pinyin', 'Meaning', 'HSK average',
       'Custom Ratio']
df_sentences['sentence_index'] = df_sentences.index + 3_000_000
df_sentences.sample(3)

Unnamed: 0,Characters,Pinyin,Meaning,HSK average,Custom Ratio,sentence_index
5810,有学生向路人散发传单.,yǒu xuésheng xiàng lùrén sànfā chuándān,There were students dishing out leaflets to pa...,4.0,0.5,3005810
10364,他当上了省长.,tā dāngshang le shěngzhǎng,He established himself as governor of the prov...,4.6,0.4,3010364
3970,他站在那里，双手插在口袋里。,tā zhàn zài nàli shuāngshǒu chā zài kǒudài lǐ,He stood there with his hands in his pockets.,3.667,0.444,3003970


In [112]:
# Dropping this row due to strange encoding behavior
df_sentences.drop(15390,axis=0,inplace=True)

In [113]:
df_sentences.to_sql('example_sentences',conn,if_exists='replace',index=df_sentences['sentence_index'])

# Uncomment for debugging which rows aren't inserted

#conn.close()
#conn = sqlite3.connect('ccrs.db', isolation_level=None)
#try:
#    df_sentences.to_sql('example_sentences',conn,if_exists='replace',index=df_sentences['sentence_index'])
#except Exception as e:
#        print(f"Error inserting row {df_links.loc[conn.total_changes]['sentence_index']} into database: {e}")

18895

# Linking Tables

## Link Dictionary to Example sentences

In [114]:
stopwords =  stopwordsiso.stopwords(['zh'])

In [115]:
# create a linking table
df_sentences['words'] = df_sentences['Characters'].apply(lambda x: [w for w in jieba.lcut(x) if w not in stopwords])

df_sentences_exploded = df_sentences.explode('words').reset_index(drop=True)

df_links = pd.merge(df_cedict, df_sentences_exploded, left_on='simplified', right_on='words')
df_links = df_links[['cedict_index', 'sentence_index']].drop_duplicates().reset_index(drop=True)
df_links.sample(3)

Unnamed: 0,cedict_index,sentence_index
66512,2089256,3013498
15165,2018601,3007427
22135,2027785,3016730


In [116]:
# created a linking table
df_links.to_sql('cedict_sentences',conn,if_exists='replace',index=False)


87699

### Link CEDICT WORDS TO Hanzi

In [117]:
df_cedict.sample(3)

Unnamed: 0,traditional,simplified,pinyin,english,cedict_index
105610,邊陲,边陲,bian1 chui2,border area,2105610
65869,漫天遍地,漫天遍地,man4 tian1 bian4 di4,lit. to fill the whole sky and cover the land;...,2065869
19978,含山縣,含山县,Han2 shan1 xian4,"Hanshan county in Chaohu 巢湖[Chao2 hu2], Anhui",2019978


In [1]:
df_hanzi.sample(3)

NameError: name 'df_hanzi' is not defined

In [161]:
def is_cjk(char):
    char = ord(char)
    cjk_ranges = [
    (0x4E00,  0x62FF),
    (0x6300,  0x77FF),
    (0x7800,  0x8CFF),
    (0x8D00,  0x9FCC),
    (0x3400,  0x4DB5),
    (0x20000, 0x215FF),
    (0x21600, 0x230FF),
    (0x23100, 0x245FF),
    (0x24600, 0x260FF),
    (0x26100, 0x275FF),
    (0x27600, 0x290FF),
    (0x29100, 0x2A6DF),
    (0x2A700, 0x2B734),
    (0x2B740, 0x2B81D),
    (0x2B820, 0x2CEAF),
    (0x2CEB0, 0x2EBEF),
    (0x2F800, 0x2FA1F), ]
    
    
    for bottom, top in cjk_ranges:
        if char >= bottom and char <= top:
            return True
    return False


df_cedict['simplified_chars'] = df_cedict['simplified'].apply(lambda x: ''.join([c for c in x if is_cjk(c)]))

In [162]:
# extract Chinese characters from the 'text' column and store them in a new column 'chinese'

# split the Chinese characters in the 'chinese' column into a list and store the list in a new column 'chinese_list'
df_cedict['simplified_chars'] = df_cedict['simplified_chars'].apply(list)

# create a new DataFrame with unique Chinese characters and their indices
#unique_chars = sorted(set(''.join(df_cedf_hanzidict['simplified_chars'].sum())))
df = df_cedict[['cedict_index','simplified_chars']].explode('simplified_chars')



In [163]:
df.dropna(inplace=True)

In [164]:
df_linking = df.merge(df_hanzi[['hanzi','hanzi_index']], left_on= 'simplified_chars', right_on='hanzi', how='left').copy()

In [165]:
df_linking.dropna(how='any',inplace=True)

In [166]:
df_linking['hanzi_index'] = df_linking.hanzi_index.astype('int')

In [168]:
df_linking = df_linking[['cedict_index','hanzi_index']]

In [170]:
df_linking.to_sql('hanzi_cedict',conn,if_exists='replace')

311635

In [150]:
df.hanzi_index

0         1001712.0
1         1000623.0
2         1000426.0
3         1000946.0
4         1000426.0
            ...    
315885          NaN
315886          NaN
315887          NaN
315888          NaN
315889    1000073.0
Name: hanzi_index, Length: 315890, dtype: float64

In [128]:
df_linking.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_linking.dropna(inplace=True)


### Hanzi to Radicals

In [None]:
#EDA

df_radicals.sample(3)

Unnamed: 0,radical_number,radical,variants,pinyin,stroke_count,ucn,meaning,traditional,radical_index
42,44,尸,,shi1,3,U+5C38,"[corpse, to impersonate the dead, to preside]",,100042
89,91,片,,pian4,4,U+7247,"[slice, splinter, strip, rad. 91]",,100089
82,84,气,,qi4,4,U+6C14,"[steam, vapor]",,100082


In [None]:
# Create a set of all radicals, varients and traditional 
unique_base_radicals =  pd.concat([df_radicals['radical'],df_radicals['traditional'],radical_variants_unique]).dropna().drop_duplicates(keep='first')

#unique_radicals = pd.concat([df_radicals['radical'],df_radicals['variants'],df_radicals['rad']],axis=1)

In [None]:
df_decomp = pd.read_csv(DATA_DIR/'extracted'/'FlattenedDecompositionTable.csv',index_col=0,encoding='utf-8')

In [None]:
import cchardet as chardet

with open(DATA_DIR/'extracted'/'FlattenedDecompositionTable.csv','rb') as f :
    result = chardet.detect(f.read())
print(result['encoding'])

UTF-8


In [122]:
# EDA
# Checking to see if the components will eventually break down into characers
main_component = df_decomp['Component']
right_component =  df_decomp['RightComponent']
left_component = df_decomp['LeftComponent']



In [123]:
(main_component.str.len() > 1).sum()
right_component[right_component.str.len() > 1 ].str.replace(' ','')
left_component[left_component.str.len() > 1 ].str.replace(' ','')

# Filter rows with multiple radicals

breakdown_right_components = right_component[right_component.str.len() > 1]
breakdown_left_components = left_component[left_component.str.len() > 1]



In [124]:
breakdown_left_components.unique().shape[0] / breakdown_left_components.shape[0]

0.7450980392156863

In [125]:
unique_base_radicals[unique_base_radicals == '爫']

NameError: name 'unique_base_radicals' is not defined

In [None]:
df_decomp.iloc[169]

Component             伶
Strokes               7
CompositionType       吅
LeftComponent         亻
LeftStrokes           2
RightComponent        令
RightStrokes          5
Signature          OOII
Notes                 /
Section               人
Name: 310, dtype: object

In [None]:
unique_components =  pd.concat([main_component,right_component,left_component],axis=0).drop_duplicates(keep='first').dropna()

In [None]:
print('Percentage of the set of unique radicals that are in the set of unique components')
unique_base_radicals.isin(unique_components).sum() / unique_base_radicals.shape[0]

Percentage of the set of unique radicals that are in the set of unique components


0.9549180327868853

In [59]:
print('Radicals that are not in the components_list')
unique_base_radicals[~unique_base_radicals.isin(unique_components)]

Radicals that are not in the components_list


22       匸
33       夊
2       乀 
3       乚 
41       尣
118      ⺮
162     阝 
182      飠
2        乁
3        乛
45      巜 
dtype: object

In [60]:
unique_components[~unique_components.isin(unique_base_radicals)]

1           丁
3           七
7           万
8           丈
9           三
         ... 
19760       𠚍
19762     木缶木
20521       𠤏
20750    口口田一
20835       歯
Length: 10448, dtype: object

### Further Decomposition

Layer 1, layer 2, Layer 3

To get the tree structure ready, it would be good to 
set it up in a few layers of decomposition

In [61]:
df_decomp[df_decomp['Component'] == '丁']

Unnamed: 0,Component,Strokes,CompositionType,LeftComponent,LeftStrokes,RightComponent,RightStrokes,Signature,Notes,Section
1,丁,2,吕,一,1,亅,1,MN,/,一


In [62]:
#df_ids_decomp = pd.read_csv(DATA_DIR/'extracted'/'idsDecomposition.csv',index_col=0)

In [63]:
from queue import Queue
queue = Queue()

class HanziNode:
    def __init__(self,val):
        self.leftChild = None
        self.rightChild = None
        self.data = val


    # Position Dict
    # Order of traversal Inorder

    # Print tree
    def print_tree(self):
        ret = []
        ret.append(self.data)
        if self.leftChild is not None:
            queue.put(self.leftChild)
        if self.rightChild is not None:
            queue.put(self.rightChild)

        #print (len(stack))
        while queue.empty() is False:
            ret = ret + queue.get().printTree() 
        return ret
    
    def preorder_traversal(self, root):
        ret = []
        if root:
            ret.append(root.data)
            ret = ret + self.preorderTraversal(root.leftChild)
            ret = ret + self.preorderTraversal(root.rightChild)
        return ret

    def is_leaf(self):
        return self.leftChild is None and self.rightChild is None

    def is_radical(self,radicals_col):
        # Checks if the node is a radical
        return radicals_col.str.contains(self.data).sum() != False
    
    def get_sub_components(self,df):
        return df[df['Component'] == self.data]

    def populate_tree(self, df_components):
        components_df_row = self.get_sub_components(df_components)

        if  components_df_row is not None or not self.is_radical(self.data):
            leftComponent =  components_df_row['LeftComponent'].iloc[0]
            rightComponent = components_df_row['RightComponent'].iloc[0]
            #print(leftComponent)
            #print(rightComponent)

            if leftComponent is not None:
                self.leftChild = HanziNode(leftComponent)
            else:
                self.leftChild = None
                
            if rightComponent is not None:
                self.rightChild = HanziNode(rightComponent)
            else:
                self.rightChild = None
                #self.rightChild.populate_tree(components_df_row) 

    def get_all_leaves(self):
        if self.leftChild is None and self.rightChild is None:
            return [self]
        else:
            leaves = []
            if self.leftChild is not None:
                leaves += self.leftChild.get_all_leaves()
            if self.rightChild is not None:
                leaves += self.rightChild.get_all_leaves()
            return leaves

In [64]:
hanziDecompTreeDict = {}

I

### Initialize Tree

## Layer 0

In [65]:
radicalList = list(unique_base_radicals)
for index,row in df_hanzi.iterrows():
    hanzi = row['hanzi']
    HanziRoot = HanziNode(hanzi)
    hanziDecompTreeDict[index] = HanziRoot


### Layer 1

In [66]:
noDataOnCharacterList  = []

for index, node in hanziDecompTreeDict.items():

    try:
        node.populate_tree(df_decomp)
    except:
        noDataOnCharacterList.append(index)


for i in noDataOnCharacterList:
    hanziDecompTreeDict[i].rightChild = None
    hanziDecompTreeDict[i].leftChild = None 

### Layer 1

In [67]:
noSecondLayerList = []

for index, node in hanziDecompTreeDict.items():
    try:
        node.rightChild.populate_tree(df_decomp)
        node.leftChild.populate_tree(df_decomp)
    except:
        noSecondLayerList.append(index)

### Layer 2

for index, node in 

In [68]:
noThirdLayerList = []

for index, node in hanziDecompTreeDict.items():     
    try:
        node.rightChild.leftChild.populate_tree(df_decomp)
        node.leftChild.leftChild.populate_tree(df_decomp)
        node.rightChild.rightChild.populate_tree(df_decomp)
        node.rightChild.leftChild.populate_tree(df_decomp)
    except:
        noThirdLayerList.append(index)

In [69]:
# Convert input to Pandas Series object
#def check_for_rare_characters(col):

char_series = df_decomp['LeftComponent']
char_series = char_series.dropna().str[0]
# Convert each character to its hexadecimal representation
char_series = char_series.apply( lambda x: ord(x))

# Define Unicode sets
common_set = set(range(0x4E00, 0xA000))  # 
extension_a_set = set(range(0x3400, 0x4E00))  #  Extension A
extension_b_set = set(range(0x20000, 0x2A6E0))  #  Extension B
extension_c_set = set(range(0x2A700, 0x2B740))  #  Extension C
extension_d_set = set(range(0x2B740, 0x2B820))  #  Extension D
extension_e_set = set(range(0x2B820, 0x2CEB0))  #  Extension E
extension_f_set = set(range(0x2CEB0, 0x2EC00))  #  Extension F
extension_g_set = set(range(0x30000, 0x31350))  #  Extension G
extension_h_set = set(range(0x31350, 0x32400))  #  Extension H

# Convert Unicode code points to strings and create pandas Series

extension_a_chars = pd.Series([chr(cp) for cp in extension_a_set])
extension_b_chars = pd.Series([chr(cp) for cp in extension_b_set])
extension_c_chars = pd.Series([chr(cp) for cp in extension_c_set])
extension_d_chars = pd.Series([chr(cp) for cp in extension_d_set])
extension_e_chars = pd.Series([chr(cp) for cp in extension_e_set])
extension_f_chars = pd.Series([chr(cp) for cp in extension_f_set])
extension_g_chars = pd.Series([chr(cp) for cp in extension_g_set])
extension_h_chars = pd.Series([chr(cp) for cp in extension_h_set])


# Define vectorized functions for each Unicode set
common_mask = char_series.isin(common_set)
extension_a_mask = char_series.isin(extension_a_chars)
extension_b_mask = char_series.isin(extension_b_chars)
extension_c_mask = char_series.isin(extension_c_chars)
extension_d_mask = char_series.isin(extension_d_chars)
extension_e_mask = char_series.isin(extension_e_chars)
extension_f_mask = char_series.isin(extension_f_chars)
extension_g_mask = char_series.isin(extension_g_chars)
extension_h_mask = char_series.isin(extension_h_chars)

# Create new column that specifies which Unicode set each character belongs to
# Create Series of character sets
char_set_series = pd.Series('', index=char_series.index)
char_set_series[common_mask] = ''
char_set_series[extension_a_mask] = 'A'
char_set_series[extension_b_mask] = 'B'
char_set_series[extension_c_mask] = 'C'
char_set_series[extension_d_mask] = 'D'
char_set_series[extension_e_mask] = 'E'
char_set_series[extension_f_mask] = 'F'
char_set_series[extension_g_mask] = 'G'
char_set_series[extension_h_mask] = 'H'

#df['char_set'] = char_set_series

#return char_series
    


## Tree Parsing


In [70]:
def traverse_tree(node):
    """
    Recursively traverses the tree and returns a dictionary of nodes.
    """
    if node is None:
        return {}

    left_descendant = traverse_tree(node.leftChild)
    right_descendant = traverse_tree(node.rightChild)

    node_dict = {
        'data': node.data,
        'left_child': left_descendant,
        'right_child': right_descendant
    }

    return node_dict

hanzi_idx = 1001
# Example usage:
tree_dict = traverse_tree(hanziDecompTreeDict[hanzi_idx])



In [71]:
def flatten_tree(tree, hanzi_idx):
    result = []
    counter = 0

    def flatten_node(node, hanzi_idx):
        nonlocal counter
        counter += 1
        
        if counter > 1:
            result.append((hanzi_idx, node['data'], counter - 1))

        if node['left_child']:
            flatten_node(node['left_child'], hanzi_idx )
        if node['right_child']:
            flatten_node(node['right_child'], hanzi_idx)

    flatten_node(tree, hanzi_idx)

    return result



In [72]:
flat_component_list = []

for hanzi_idx, RootNode in hanziDecompTreeDict.items():
    tree_dict = traverse_tree(RootNode)
    flat_rows = flatten_tree(tree_dict,hanzi_idx)
    flat_component_list = flat_component_list + flat_rows


In [73]:
df_hanzi_components = pd.DataFrame(flat_component_list, columns=['hanzi_index','component','position'])

In [74]:
# Filter out * inside of the dataset
df_hanzi_components  = df_hanzi_components[df_hanzi_components['component'] != '*']

In [75]:
df_hanzi_components.to_sql('hanzi_components',conn,if_exists='replace')

49103

## Chengyu (Idioms)

In [76]:
df_chengyu = pd.read_json(DATA_DIR/'chengyu_data.json')

In [77]:
df_chengyu.Frequency.value_counts()

0     12610
1       206
2        67
3        29
4        22
6         8
5         5
7         4
8         3
10        2
9         2
34        1
20        1
19        1
18        1
12        1
Name: Frequency, dtype: int64

In [78]:
df_chengyu.sample()

Unnamed: 0,ID,Abbr,Chinese,ChineseExplanation,EnglishLiteral,EnglishFigurative,Pinyin,Example,ExampleTranslation,Origin,OriginTranslation,Frequency
9400,9316,tsqt,泰山其颓,旧时用于哀悼大家敬仰的人。,,,tài shān qí tuí,,,《礼记·檀弓上》：“泰山其颓乎。梁木其坏乎。哲人其萎乎。”,,0


In [79]:
import re
def to_snake_case(s):
    return re.sub(r'([A-Z])(?<!^)', r'_\1', s).lower()[1:]

In [80]:
chengyu_cols =  [to_snake_case(s) for s in df_chengyu.columns]
chengyu_cols[0] = 'ID'

df_chengyu.columns = chengyu_cols

In [81]:
# Linking table to characters

# tokenize, remove stopwords
df_chengyu['chinese_tokens'] = df_chengyu['chinese'].apply(lambda x: [w for w in jieba.lcut(x) if w not in stopwords])
df_chengyu['chinese_explanation_tokens'] = df_chengyu['chinese_explanation'].apply(lambda x: [w for w in jieba.lcut(x) if w not in stopwords])

df_chengyu["chinese_tokens"] = df_chengyu["chinese_tokens"].apply(set)
df_chengyu["chinese_explanation_tokens"] = df_chengyu["chinese_explanation_tokens"].apply(set)


In [82]:
df_cedict['simplified_tokens'] = df_cedict['simplified'].apply(lambda x: [w for w in jieba.lcut(x) if w not in stopwords])

df_cedict["simplified_tokens"] = df_cedict["simplified_tokens"].apply(lambda x: set(x))


In [83]:
df_chengyu.head(1)

Unnamed: 0,ID,abbr,chinese,chinese_explanation,english_literal,english_figurative,pinyin,example,example_translation,origin,origin_translation,frequency,chinese_tokens,chinese_explanation_tokens
0,8910,swty,世外桃源,原指与现实社会隔绝、生活安乐的理想境界。后也指环境幽静生活安逸的地方。借指一种空想的脱离现实...,world outside peach place,"originally meant an imaginary, ideal place sep...",shì wài táo yuán,在这儿，在这～的仙境中，有了人世喧嚣的声音。（杨沫《青春之歌》第一部第三章）,,晋·陶潜《桃花园记》描述的一个与世隔绝，没有遭到祸乱的美好地方。,,34,{世外桃源},"{现实, 原指, 隔绝, 环境, 安乐, 社会, 借指, 斗争, 幽静, 理想境界, 空想,..."


In [84]:
from scipy.sparse import dok_matrix


In [95]:
import multiprocessing
import numpy as np
from itertools import product

def tokens_subset_of_characters(tokens, characters):
    """
    Check if all tokens are a subset of the characters.
    """
    token_set = set(tokens)
    char_set = set(characters)
    return int(token_set.issubset(char_set))

#def fill_sparse_matrix(group, df_chengyu, result_matrix, i, n_cols):
#    for j, chinese_explanation_tokens in enumerate(df_chengyu['chinese_explanation_tokens'].values):
#        result_matrix[group.index, i * n_cols + j] = tokens_subset_of_characters(chinese_explanation_tokens, group['simplified'])

def fill_sparse_matrix(group, df_chengyu, result_matrix, i, n_cols):
    for j, chinese_explanation_tokens in enumerate(df_chengyu['chinese_explanation_tokens'].values):
        index = group.index.values.astype(np.int32)
        values = tokens_subset_of_characters(chinese_explanation_tokens, group.values)
        result_matrix[index, i * n_cols + j] = values


def applyParallel(df, df_chengyu, func, result_matrix, counter, max_memory):
    # compute the number of rows and columns in the result matrix

    def increment_counter(result, counter):
        counter.get_lock().acquire()
        counter.value += 1
        counter.get_lock().release()

    n_rows, n_cols = len(df), len(df_chengyu)

    # create a pool of worker processes
    pool = multiprocessing.Pool(processes=4)

    # determine the chunk size and group the input dataframe into chunks
    chunk_size = max(1, int(max_memory * 1e6 / n_cols / 4))
    groups = [df.iloc[i:i+chunk_size] for i in range(0, n_rows, chunk_size)]

    # fill in the entries of the result matrix using multiple processes
    for i, group in enumerate(groups):
        pool.apply_async(
            fill_sparse_matrix,
            args=(group, df_chengyu, result_matrix, i, n_cols),
            callback=increment_counter,
            error_callback=print
        )

    # wait for all processes to finish
    pool.close()
    pool.join()

    # convert the result matrix to a sparse matrix
    result_sparse = result_matrix.tocsr()

    return result_sparse

max_memory = 32  # maximum memory usage in GB
n_chunks = 24
grouped = df_cedict.groupby(df_cedict.index // n_chunks)
num_rows = len(df_cedict)
num_cols = len(df_chengyu)

# create a sparse matrix to store the results
result_matrix = dok_matrix((num_rows, num_rows * num_cols), dtype=np.int32)

# create a shared memory counter
counter = multiprocessing.Value('i', 0)

# apply the function in parallel and return a sparse matrix of the results
result_sparse = applyParallel(df_cedict['simplified_tokens'], df_chengyu['chinese_explanation_tokens'], tokens_subset_of_characters, result_matrix, counter, max_memory)


# convert the sparse matrix to a boolean array and assign it to a new column of the input dataframe
#df_cedict['token_set'] = result_sparse.all(axis=1).toarray().ravel()
#df_cedict['token_set'] = result_sparse.any(axis=1).toarray().ravel()


'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_explanation_tokens'
'chinese_expla

In [102]:
result_sparse[0]

<1x1564413729 sparse matrix of type '<class 'numpy.int32'>'
	with 0 stored elements in Compressed Sparse Row format>

In [103]:
chunk_size = 1000  # number of rows to process at a time
n_rows = len(df_cedict)
n_batches = (n_rows + chunk_size - 1) // chunk_size  # number of batches
row_lengths = np.diff(result_sparse.indptr)

matching_indexes = []  # initialize empty list to store matching indexes

for i in range(n_batches):
    start_idx = i * chunk_size
    end_idx = min(start_idx + chunk_size, n_rows)
    row_counts = result_sparse[start_idx:end_idx, :].sum(axis=1)
    matching_rows, matching_cols = np.nonzero(row_counts == row_lengths[start_idx:end_idx])
    matching_rows += start_idx  # adjust row indexes to match the original dataframe
    matching_indexes.appendtoken_set((matching_rows, matching_cols))

# flatten the list of matching indexes and create a boolean mask to assign True to matching rows and False to non-matching rows
matching_rows = np.concatenate([x[0] for x in matching_indexes])
matching_cols = np.concatenate([x[1] for x in matching_indexes])
df_cedict['token_set'] = False
df_cedict.iloc[matching_rows, df_cedict.columns.get_loc('token_set')] = True



In [104]:
df_cedict

Unnamed: 0,traditional,simplified,pinyin,english,cedict_index,simplified_tokens,token_set
0,2019冠狀病毒病,2019冠状病毒病,er4 ling2 yi1 jiu3 guan1 zhuang4 bing4 du2 bing4,"COVID-19, the coronavirus disease identified i...",2000000,"{病, 冠状病毒, 2019}",True
1,21三體綜合症,21三体综合症,er4 shi2 yi1 san1 ti3 zong1 he2 zheng4,trisomy,2000001,"{综合症, 21, 三体}",True
2,3C,3C,san1 C,"abbr. for computers, communications, and consu...",2000002,{3C},True
3,3P,3P,san1 P,(slang) threesome,2000003,{3P},True
4,3Q,3Q,san1 Q,(Internet slang) thank you (loanword),2000004,{3Q},True
...,...,...,...,...,...,...,...
120678,𨭆,𬭶,hei1,hassium (chemistry),2120678,{𬭶},True
120679,𨭎,𬭳,xi3,seaborgium (chemistry),2120679,{𬭳},True
120680,𩧢,𱅒,cheng3,variant of 騁|骋[cheng3],2120680,{𱅒},True
120681,𰻞,𰻝,biang2,see 𰻞𰻞麵|𰻝𰻝面[biang2 biang2 mian4],2120681,{𰻝},True


In [101]:
chunk_size = 1000  # number of rows to process at a time
n_rows = len(df_cedict)
n_batches = (n_rows + chunk_size - 1) // chunk_size  # number of batches
row_lengths = np.diff(result_sparse.indptr)

for i in range(n_batches):
    start_idx = i * chunk_size
    end_idx = min(start_idx + chunk_size, n_rows)
    row_counts = result_sparse[start_idx:end_idx, :].sum(axis=1)
    #df_cedict.iloc[start_idx:end_idx, df_cedict.columns.get_loc('simplified_tokens')] = (row_counts == row_lengths[start_idx:end_idx]).ravel()
    df_cedict.iloc[start_idx:end_idx, df_cedict.columns.get_loc('simplified_tokens')] = (row_counts == row_lengths[start_idx:end_idx]).reshape(-1, num_cols).ravel()


ValueError: cannot reshape array of size 1000000 into shape (12963)

In [98]:
row_counts = result_sparse.sum(axis=1)
row_lengths = np.diff(result_sparse.indptr)
df_cedict['token_set'] = row_counts == row_lengths


KeyboardInterrupt: 

In [97]:
df_cedict.shape

(120683, 6)

In [90]:
import multiprocessing
import numpy as np
from itertools import product

def tokens_subset_of_characters(tokens, characters, result_array, index):
    """
    Check if all tokens are a subset of the characters.
    """
    token_set = set(tokens)
    char_set = set(characters)
    result_array[index] = int(token_set.issubset(char_set))
    

class SynchronizedArray:
    def __init__(self, shape):
        self.arr = multiprocessing.Array('i', int(np.prod(shape)))
        self.shape = shape

    def __getitem__(self, idx):
        return self.arr[idx]

    def __setitem__(self, idx, value):
        self.arr[idx] = value

    def __len__(self):
        return self.shape[0]


def increment_counter(result, counter):
    counter.get_lock().acquire()
    counter.value += 1
    counter.get_lock().release()

manager = multiprocessing.Manager()

max_memory = 32  # maximum memory usage in GB
n_chunks = 12
grouped = df_cedict.groupby(df_cedict.index // n_chunks)
num_rows = len(df_cedict)
num_cols = len(df_chengyu)

# create shared memory array
#result_array = SynchronizedArray((num_rows, num_cols))
result_array = manager.Array('i', num_rows * num_cols)
# create shared memory counter
counter = multiprocessing.Value('i', 0)

def applyParallel(dfGrouped, df_chengyu, func, result_array, counter):
    with multiprocessing.Pool(12) as p:
        for name, group in dfGrouped:
            group_tokens = group['simplified'].values
            for i, chinese_explanation_tokens in enumerate(df_chengyu['chinese_explanation_tokens'].values):
                p.apply_async(
                    func, 
                    args=(group_tokens, chinese_explanation_tokens, result_array, (counter.value, i)),
                    callback=increment_counter,
                    error_callback=print
                )
        p.close()
        p.join()

applyParallel(grouped, df_chengyu, tokens_subset_of_characters, result_array, counter)

# convert shared memory array to numpy array
result_np = np.frombuffer(result_array.arr.get_obj(), dtype=np.int32).reshape((num_rows, num_cols))

df_cedict['token_set'] = result_np.all(axis=1)


RemoteError: 
---------------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/jentlejames/anaconda3/envs/ChineseAutomation/lib/python3.10/multiprocessing/managers.py", line 209, in _handle_request
    result = func(c, *args, **kwds)
  File "/home/jentlejames/anaconda3/envs/ChineseAutomation/lib/python3.10/multiprocessing/managers.py", line 387, in create
    obj = callable(*args, **kwds)
  File "/home/jentlejames/anaconda3/envs/ChineseAutomation/lib/python3.10/multiprocessing/managers.py", line 1024, in Array
    return array.array(typecode, sequence)
TypeError: 'int' object is not iterable
---------------------------------------------------------------------------

## News 

### Weibo