# Extracting the Data for this app

In [3]:
#!conda install stopwordsiso

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import sqlite3

# NLP
import jieba
import stopwordsiso #

In [2]:
DATA_DIR = Path('/home/jentlejames/Projects/Data/Chinese Automation/data')

In [3]:
DATA_DIR

PosixPath('/home/jentlejames/Projects/Data/Chinese Automation/data')

In [436]:
conn = sqlite3.connect('../db/ccrs.db')

# Characters

pd.read_csv(DATA_DIR/)

In [437]:
df_hanzi = pd.read_csv(DATA_DIR/'extracted'/'uniqueCharacters.csv',index_col=0)
df_hanzi['hanzi_index'] = df_hanzi.index + 1_000_000

In [438]:
# Filter by column
hanziColumns = ['char','cumulativeRawFrequency','kMandarin','English','kTotalStrokes','hanzi_index']
df_hanzi = df_hanzi[hanziColumns].copy()

# Rename for sql column standard
df_hanzi.columns = ['hanzi','raw_frequency','pinyin','definition','stroke_count','hanzi_index']
df_hanzi.sample(3)

Unnamed: 0,hanzi,raw_frequency,pinyin,definition,stroke_count,hanzi_index
1512,豪,94.646005,háo,grand/heroic,14,1001512
1342,穷,93.289896,qióng,exhausted/poor,7,1001342
6928,餬,99.994711,hú,,17,1006928


In [439]:
df_hanzi.to_sql('hanzi_info',conn,if_exists='replace',index=df_hanzi['hanzi_index'])

9933

In [440]:
test_hanzi = pd.read_sql('SELECT * FROM hanzi_info', conn, index_col='hanzi_index')
test_hanzi.sample(3)

Unnamed: 0_level_0,hanzi,raw_frequency,pinyin,definition,stroke_count
hanzi_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000408,价,71.490523,jià,"price/value/valence (on an atom), great/good/m...",6
1002056,俘,97.328262,fú,prisoner of war,9
1001274,播,92.644179,bō,sow/scatter/spread/broadcast,15


# Radicals

In [441]:
df_radicals = pd.read_csv(DATA_DIR/'extracted'/'Radicals.csv',index_col=0)

In [442]:
# Filter



#### Objective 2: Meaning and definitions

In [443]:
# English Category Clean up 
# Remove whitespace formatting
df_radicals['english']  = df_radicals.english.str.replace('\xad','')

# Cleaning up the list of definitions 

# Needs to deal with the nested list, expanding it out into a table
df_radicals['Meaning'] = df_radicals['kDefinition'].str.split(';')
df_meaning = df_radicals['Meaning'].apply(pd.Series).copy()
df_meaning.head()

Unnamed: 0,0,1,2,3,4
0,one,"a, an",alone,,
1,number one,line,Kangxi radical 2,,
2,line,Kangxi radical 4,,,
3,second,2nd heavenly stem,,,
4,hook,Kangxi radical 6,,,


In [444]:
# Removing the extra information about Kangxi Radicals
for i in range(5):
    df_meaning[i] = np.where(df_meaning[i].str.contains('Kangxi'),np.NaN,df_meaning[i])
    df_meaning[i] = df_meaning[i].str.strip()

In [445]:
df_radicals.head()

Unnamed: 0,number,radical,variants,simplifiedradical,pinyin,english,strokecount,char,ucn,kDefinition,Meaning
0,1,一,,,yi1,one,1,一,U+4E00,"one; a, an; alone","[one, a, an, alone]"
1,2,丨,,,gun3,line,1,丨,U+4E28,number one; line; Kangxi radical 2,"[number one, line, Kangxi radical 2]"
2,4,丿,"乀 (fu2), 乁(yi2)",,pie3,slash,1,丿,U+4E3F,line; Kangxi radical 4,"[line, Kangxi radical 4]"
3,5,乙,"乚 (yin3), 乛",,yi4,second,1,乙,U+4E59,second; 2nd heavenly stem,"[second, 2nd heavenly stem]"
4,6,亅,,,jue2,hook,1,亅,U+4E85,hook; Kangxi radical 6,"[hook, Kangxi radical 6]"


In [446]:
# This line is to unpack the definitions even further, with the goal of
# unpacking the nested lists inside of the nested lists

# Populating an empty array
df_meaning['idx'] = np.NaN

# Recurses through each column, adding where it iis found  
for i in range(5):
    df_meaning['idx'] = np.where(df_radicals['english'] == df_meaning[i],i,df_meaning['idx'])


# Checking for redundant definitions
secondaryCheckIdx = df_meaning['idx'].isnull()



In [447]:
df_meaning['english'] = df_radicals['english']

In [448]:
# Unpacking Level 2 nested list of definitions, checking for matches
#df_meaning[df_meaning[4].str.contains(',') == True]


commaMeanings0 = df_meaning[secondaryCheckIdx][0].str.split(', | or ').apply(pd.Series)
#print(commaMeanings0.shape[1])
commaMeanings1 = df_meaning[secondaryCheckIdx][1].str.split(', | or ').apply(pd.Series)
#print(commaMeanings1.shape[0])
# Merging two nested lists together in order to check for matching words that indicate redudant information 
commaMeanings = pd.merge(commaMeanings0,commaMeanings1,how='outer',on=commaMeanings0.index).drop('key_0',axis=1)

# Makes possible to iterate through each
commaMeanings.columns = range(commaMeanings.shape[1])

commaMeanings['single_word_def_is_redundant'] = np.NaN
commaMeanings['english'] = df_meaning[secondaryCheckIdx].english.reset_index(drop=True)

for i in range(commaMeanings.shape[1] -2 ): # -2 for index column and english column
    commaMeanings['single_word_def_is_redundant'] = np.where(commaMeanings['english'] == commaMeanings[i], i, commaMeanings['single_word_def_is_redundant'])

commaMeanings['merge_idx'] =  df_meaning[secondaryCheckIdx].index

In [449]:
df_meaning = pd.merge(df_meaning,commaMeanings[['merge_idx','single_word_def_is_redundant']],how='left',left_on=df_meaning.index,right_on='merge_idx').drop('merge_idx',axis=1)
df_meaning.shape

(214, 8)

In [450]:
df_meaning['english'] = np.where(df_meaning['single_word_def_is_redundant'].isnull() & df_meaning['idx'].isnull(),df_meaning['english'],np.NaN)

In [451]:
df_meaning = df_meaning[['english',0,1,2,3,4]]

In [452]:
df_meaning

Unnamed: 0,english,0,1,2,3,4
0,,one,"a, an",alone,,
1,,number one,line,,,
2,slash,line,,,,
3,,second,2nd heavenly stem,,,
4,,hook,,,,
...,...,...,...,...,...,...
209,,"even, uniform, of equal length",,,,
210,tooth,teeth,"gears, cogs",age,,
211,,dragon,,,,
212,,turtle or tortoise,cuckold,,,


In [453]:
df_radicals['Meaning'] = df_meaning.apply(lambda x: ', '.join(x.dropna()), axis=1)
df_radicals['Meaning'] = '[' + df_radicals['Meaning'] + ']'
df_radicals.drop(['kDefinition','english'],axis=1,inplace=True)

### Extract Variants

In [454]:
radical_variants =  df_radicals['variants']#.str.split(',').dropna()
radical_variants =  radical_variants.str.replace('\([a-z1-4]*\)','',regex=True).dropna()
radical_variants.str.replace('\s?,\s?', ',',regex=True)
radical_variants_unique =  radical_variants.str.split(',').apply(pd.Series).copy()

radical_variants_unique = pd.concat([radical_variants_unique[0],radical_variants_unique[1]],axis=0).dropna()

#### merging traditional and simplified radicals

In [455]:
df_radicals

Unnamed: 0,number,radical,variants,simplifiedradical,pinyin,strokecount,char,ucn,Meaning
0,1,一,,,yi1,1,一,U+4E00,"[one, a, an, alone]"
1,2,丨,,,gun3,1,丨,U+4E28,"[number one, line]"
2,4,丿,"乀 (fu2), 乁(yi2)",,pie3,1,丿,U+4E3F,"[slash, line]"
3,5,乙,"乚 (yin3), 乛",,yi4,1,乙,U+4E59,"[second, 2nd heavenly stem]"
4,6,亅,,,jue2,1,亅,U+4E85,[hook]
...,...,...,...,...,...,...,...,...,...
209,210,齊,,齐,qi2,14,齊,U+9F4A,"[even, uniform, of equal length]"
210,211,齒,,齿,chi3,15,齒,U+9F52,"[tooth, teeth, gears, cogs, age]"
211,212,龍,,龙,long2,16,龍,U+9F8D,[dragon]
212,213,龜,,龟,gui1,16,龜,U+9F9C,"[turtle or tortoise, cuckold]"


In [456]:
df_radicals['simplifiedradical'].fillna(df_radicals['radical'],inplace=True)

In [457]:
df_radicals['simplifiedradical'].isnull().sum()

0

In [458]:
# Collecting instances where there is a traditional radical

df_radicals['traditional'] = np.where(df_radicals['simplifiedradical'] != df_radicals['radical'],df_radicals['radical'],np.NaN)

In [459]:
df_radicals.columns

Index(['number', 'radical', 'variants', 'simplifiedradical', 'pinyin',
       'strokecount', 'char', 'ucn', 'Meaning', 'traditional'],
      dtype='object')

In [460]:
df_radicals.sample(3)

Unnamed: 0,number,radical,variants,simplifiedradical,pinyin,strokecount,char,ucn,Meaning,traditional
142,144,行,,行,xing2,6,行,U+884C,"[walk enclosure, go, walk, move, travel, circu...",
25,26,卩,,卩,jie2,2,卩,U+5369,[seal],
161,162,辵,辶,辵,chuo4,7,辵,U+8FB5,"[walk, walking]",


In [461]:
df_radicals.drop(['simplifiedradical','char'],axis=1,inplace=True)

In [462]:
df_radicals.columns =['radical_number', 'radical', 'variants', 'pinyin', 'stroke_count', 'ucn',
       'meaning', 'traditional']

In [463]:
df_radicals.sample(1)

Unnamed: 0,radical_number,radical,variants,pinyin,stroke_count,ucn,meaning,traditional
25,26,卩,,jie2,2,U+5369,[seal],


In [464]:
df_radicals['radical_index'] = df_radicals.index + 100_000
df_radicals.to_sql('radicals',conn,if_exists='replace',index=df_radicals['radical_index'])

214

In [465]:
df_test_radicals =  pd.read_sql('SELECT * FROM radicals_info', conn, index_col='radical_index')
df_test_radicals.sample(3)


DatabaseError: Execution failed on sql 'SELECT * FROM radicals_info': no such table: radicals_info

## Dictionary Words

df_cedict = pd.read_

In [None]:
df_cedict = pd.read_csv(DATA_DIR/'extracted'/'ce_dict.csv')
df_cedict.sample(3)

Unnamed: 0,traditional,simplified,pinyin,english
72029,生煎包,生煎包,sheng1 jian1 bao1,pan-fried dumpling
77405,磁重聯,磁重联,ci2 chong2 lian2,(physics) magnetic reconnection
42352,戈爾,戈尔,Ge1 er3,Gore (name)


In [None]:
df_cedict['cedict_index'] = df_cedict.index + 2_000_000

In [None]:
df_cedict.to_sql('ce_dictionary',conn,if_exists='replace',index=df_cedict['cedict_index'])

120683

In [None]:
df_test_cedict =  pd.read_sql('SELECT * FROM ce_dictionary', conn, index_col='cedict_index')


In [None]:
df_test_cedict.sample(3)

Unnamed: 0_level_0,traditional,simplified,pinyin,english
cedict_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2079479,穿堂風,穿堂风,chuan1 tang2 feng1,draft
2115669,養,养,yang3,to raise (animals)
2113853,面色,面色,mian4 se4,complexion


## HSK (used to add statistics and remove stopwords)

In [None]:
df_hsk = pd.read_csv(DATA_DIR/'HSK Standard Course 1-6-Table 1.csv')

In [None]:
df_hsk.sample(3)

Unnamed: 0,Id,Traditional,Simplified,English,HSK,HSK 5（二）词语搭配,Img,Txt,Pinyin,Explanation,...,Alternative,Grammar Reference,Song Lyrics,Song YouTube,Song Pinyin,Song Translation,Example Pinyin,Length,Character Phrase,Instagram Image Created
2157,2159,油炸,油炸,to deep-fry,5,,False,False,yóu zhá,油炸，就是 to deep-fry。,...,,,,,,,yào jiādiǎn tǔdòuní háishi yóuzhá tǔdòu tiáo,2,,
4622,4625,姿態,姿态,"attitude, posture, stance",6,,False,False,zī tài,姿态，就是态度，姿势。,...,,,,,,,tā yǐ guànyòng de qiángyìng yáncí zuòchū wéikà...,2,,
3329,3332,精確,精确,"accurate, precise",6,,False,False,jīng què,精确，就是准确。,...,,,,,,,tā jīngquè dì gù chū le zhòngliàng,2,,


## Example Sentences 

In [None]:
df_sentences = pd.read_csv(DATA_DIR/'sentences.tsv',sep='\t')
df_sentences.shape


(18896, 5)

In [None]:
df_sentences.columns = ['Characters', 'Pinyin', 'Meaning', 'HSK average',
       'Custom Ratio']
df_sentences['sentence_index'] = df_sentences.index + 3_000_000
df_sentences.sample(3)

Unnamed: 0,Characters,Pinyin,Meaning,HSK average,Custom Ratio,sentence_index
8696,我已经决定买一部脚踏车，不论贵不贵。,wǒ yǐjīng juédìng mǎi yī bù jiǎotàchē bùlùn gu...,"I have decided to buy a bicycle, whether it is...",4.375,0.375,3008696
15931,流星在空中画出了一道长长的弧线。,liúxīng zài kōngzhōng huà chū le yīdào cháng c...,The falling star described a long curve in the...,5.5,0.25,3015931
2226,我妹妹每天都吃天然食品，但是我不吃。,wǒ mèimei měitiān dōu chī tiānrán shípǐn dànsh...,"My sister eats natural foods every day, but I ...",3.3,0.6,3002226


In [None]:
# Dropping this row due to strange encoding behavior
df_sentences.drop(15390,axis=0,inplace=True)

In [None]:
df_sentences.to_sql('example_sentences',conn,if_exists='replace',index=df_sentences['sentence_index'])

# Uncomment for debugging which rows aren't inserted

#conn.close()
#conn = sqlite3.connect('ccrs.db', isolation_level=None)
#try:
#    df_sentences.to_sql('example_sentences',conn,if_exists='replace',index=df_sentences['sentence_index'])
#except Exception as e:
#        print(f"Error inserting row {df_links.loc[conn.total_changes]['sentence_index']} into database: {e}")

18895

# Linking Tables

## Link Dictionary to Example sentences

In [None]:
stopwords =  stopwordsiso.stopwords(['zh'])

In [None]:
# create a linking table
df_sentences['words'] = df_sentences['Characters'].apply(lambda x: [w for w in jieba.lcut(x) if w not in stopwords])

df_sentences_exploded = df_sentences.explode('words').reset_index(drop=True)

df_links = pd.merge(df_cedict, df_sentences_exploded, left_on='simplified', right_on='words')
df_links = df_links[['cedict_index', 'sentence_index']].drop_duplicates().reset_index(drop=True)
df_links.sample(3)

Unnamed: 0,cedict_index,sentence_index
67931,2091841,3005293
71834,2096471,3010622
71650,2096236,3002880


In [None]:
# created a linking table
df_links.to_sql('cedict_sentences',conn,if_exists='replace',index=False)


87699

### Hanzi to Radicals

In [None]:
#EDA

df_radicals.sample(3)

Unnamed: 0,radical_number,radical,variants,pinyin,stroke_count,ucn,meaning,traditional,radical_index
187,188,骨,,gu3,10,U+9AA8,"[bone, skeleton, frame, framework]",,100187
149,150,谷,,gu3,7,U+8C37,"[valley, gorge, ravine]",,100149
120,122,网,罒,wang3,6,U+7F51,"[net, network]",,100120


In [None]:
# Create a set of all radicals, varients and traditional 
unique_base_radicals =  pd.concat([df_radicals['radical'],df_radicals['traditional'],radical_variants_unique]).dropna().drop_duplicates(keep='first')

#unique_radicals = pd.concat([df_radicals['radical'],df_radicals['variants'],df_radicals['rad']],axis=1)

In [None]:
df_decomp = pd.read_csv(DATA_DIR/'extracted'/'FlattenedDecompositionTable.csv',index_col=0,encoding='utf-8')

In [None]:
import cchardet as chardet

with open(DATA_DIR/'extracted'/'FlattenedDecompositionTable.csv','rb') as f :
    result = chardet.detect(f.read())
print(result['encoding'])

UTF-8


In [None]:
# EDA
# Checking to see if the components will eventually break down into characers
main_component = df_decomp['Component']
right_component =  df_decomp['RightComponent']
left_component = df_decomp['LeftComponent']



In [None]:
(main_component.str.len() > 1).sum()
right_component[right_component.str.len() > 1 ].str.replace(' ','')
left_component[left_component.str.len() > 1 ].str.replace(' ','')

# Filter rows with multiple radicals

breakdown_right_components = right_component[right_component.str.len() > 1]
breakdown_left_components = left_component[left_component.str.len() > 1]



In [None]:
breakdown_left_components.unique().shape[0] / breakdown_left_components.shape[0]

0.7450980392156863

In [None]:
unique_base_radicals[unique_base_radicals == '爫']

85    爫
dtype: object

In [None]:
df_decomp.iloc[169]

Component             伶
Strokes               7
CompositionType       吅
LeftComponent         亻
LeftStrokes           2
RightComponent        令
RightStrokes          5
Signature          OOII
Notes                 /
Section               人
Name: 310, dtype: object

In [None]:
unique_components =  pd.concat([main_component,right_component,left_component],axis=0).drop_duplicates(keep='first').dropna()

In [None]:
print('Percentage of the set of unique radicals that are in the set of unique components')
unique_base_radicals.isin(unique_components).sum() / unique_base_radicals.shape[0]

Percentage of the set of unique radicals that are in the set of unique components


0.9549180327868853

In [None]:
print('Radicals that are not in the components_list')
unique_base_radicals[~unique_base_radicals.isin(unique_components)]

Radicals that are not in the components_list


22       匸
33       夊
2       乀 
3       乚 
41       尣
118      ⺮
162     阝 
182      飠
2        乁
3        乛
45      巜 
dtype: object

In [None]:
unique_components[~unique_components.isin(unique_base_radicals)]

1           丁
3           七
7           万
8           丈
9           三
         ... 
19760       𠚍
19762     木缶木
20521       𠤏
20750    口口田一
20835       歯
Length: 10448, dtype: object

### Further Decomposition

Layer 1, layer 2, Layer 3

To get the tree structure ready, it would be good to 
set it up in a few layers of decomposition

In [None]:
df_decomp[df_decomp['Component'] == '丁']

Unnamed: 0,Component,Strokes,CompositionType,LeftComponent,LeftStrokes,RightComponent,RightStrokes,Signature,Notes,Section
1,丁,2,吕,一,1,亅,1,MN,/,一


In [None]:
#df_ids_decomp = pd.read_csv(DATA_DIR/'extracted'/'idsDecomposition.csv',index_col=0)

In [None]:
from queue import Queue
queue = Queue()

class HanziNode:
    def __init__(self,val):
        self.leftChild = None
        self.rightChild = None
        self.data = val


    # Position Dict
    # Order of traversal Inorder

    # Print tree
    def print_tree(self):
        ret = []
        ret.append(self.data)
        if self.leftChild is not None:
            queue.put(self.leftChild)
        if self.rightChild is not None:
            queue.put(self.rightChild)

        #print (len(stack))
        while queue.empty() is False:
            ret = ret + queue.get().printTree() 
        return ret
    
    def preorder_traversal(self, root):
        ret = []
        if root:
            ret.append(root.data)
            ret = ret + self.preorderTraversal(root.leftChild)
            ret = ret + self.preorderTraversal(root.rightChild)
        return ret

    def is_leaf(self):
        return self.leftChild is None and self.rightChild is None

    def is_radical(self,radicals_col):
        # Checks if the node is a radical
        return radicals_col.str.contains(self.data).sum() != False
    
    def get_sub_components(self,df):
        return df[df['Component'] == self.data]

    def populate_tree(self, df_components):
        components_df_row = self.get_sub_components(df_components)

        if  components_df_row is not None or not self.is_radical(self.data):
            leftComponent =  components_df_row['LeftComponent'].iloc[0]
            rightComponent = components_df_row['RightComponent'].iloc[0]
            #print(leftComponent)
            #print(rightComponent)

            if leftComponent is not None:
                self.leftChild = HanziNode(leftComponent)
            else:
                self.leftChild = None
                
            if rightComponent is not None:
                self.rightChild = HanziNode(rightComponent)
            else:
                self.rightChild = None
                #self.rightChild.populate_tree(components_df_row) 

    def get_all_leaves(self):
        if self.leftChild is None and self.rightChild is None:
            return [self]
        else:
            leaves = []
            if self.leftChild is not None:
                leaves += self.leftChild.get_all_leaves()
            if self.rightChild is not None:
                leaves += self.rightChild.get_all_leaves()
            return leaves

In [None]:
hanziDecompTreeDict = {}

I

### Initialize Tree

## Layer 0

In [None]:
radicalList = list(unique_base_radicals)
for index,row in df_hanzi.iterrows():
    hanzi = row['hanzi']
    HanziRoot = HanziNode(hanzi)
    hanziDecompTreeDict[index] = HanziRoot


### Layer 1

In [None]:
noDataOnCharacterList  = []

for index, node in hanziDecompTreeDict.items():

    try:
        node.populate_tree(df_decomp)
    except:
        noDataOnCharacterList.append(index)


for i in noDataOnCharacterList:
    hanziDecompTreeDict[i].rightChild = None
    hanziDecompTreeDict[i].leftChild = None 

### Layer 1

In [None]:
noSecondLayerList = []

for index, node in hanziDecompTreeDict.items():
    try:
        node.rightChild.populate_tree(df_decomp)
        node.leftChild.populate_tree(df_decomp)
    except:
        noSecondLayerList.append(index)

### Layer 2

for index, node in 

In [None]:
noThirdLayerList = []

for index, node in hanziDecompTreeDict.items():     
    try:
        node.rightChild.leftChild.populate_tree(df_decomp)
        node.leftChild.leftChild.populate_tree(df_decomp)
        node.rightChild.rightChild.populate_tree(df_decomp)
        node.rightChild.leftChild.populate_tree(df_decomp)
    except:
        noThirdLayerList.append(index)

In [None]:
# Convert input to Pandas Series object
#def check_for_rare_characters(col):

char_series = df_decomp['LeftComponent']
char_series = char_series.dropna().str[0]
# Convert each character to its hexadecimal representation
char_series = char_series.apply( lambda x: ord(x))

# Define Unicode sets
common_set = set(range(0x4E00, 0xA000))  # 
extension_a_set = set(range(0x3400, 0x4E00))  #  Extension A
extension_b_set = set(range(0x20000, 0x2A6E0))  #  Extension B
extension_c_set = set(range(0x2A700, 0x2B740))  #  Extension C
extension_d_set = set(range(0x2B740, 0x2B820))  #  Extension D
extension_e_set = set(range(0x2B820, 0x2CEB0))  #  Extension E
extension_f_set = set(range(0x2CEB0, 0x2EC00))  #  Extension F
extension_g_set = set(range(0x30000, 0x31350))  #  Extension G
extension_h_set = set(range(0x31350, 0x32400))  #  Extension H

# Convert Unicode code points to strings and create pandas Series

extension_a_chars = pd.Series([chr(cp) for cp in extension_a_set])
extension_b_chars = pd.Series([chr(cp) for cp in extension_b_set])
extension_c_chars = pd.Series([chr(cp) for cp in extension_c_set])
extension_d_chars = pd.Series([chr(cp) for cp in extension_d_set])
extension_e_chars = pd.Series([chr(cp) for cp in extension_e_set])
extension_f_chars = pd.Series([chr(cp) for cp in extension_f_set])
extension_g_chars = pd.Series([chr(cp) for cp in extension_g_set])
extension_h_chars = pd.Series([chr(cp) for cp in extension_h_set])


# Define vectorized functions for each Unicode set
common_mask = char_series.isin(common_set)
extension_a_mask = char_series.isin(extension_a_chars)
extension_b_mask = char_series.isin(extension_b_chars)
extension_c_mask = char_series.isin(extension_c_chars)
extension_d_mask = char_series.isin(extension_d_chars)
extension_e_mask = char_series.isin(extension_e_chars)
extension_f_mask = char_series.isin(extension_f_chars)
extension_g_mask = char_series.isin(extension_g_chars)
extension_h_mask = char_series.isin(extension_h_chars)

# Create new column that specifies which Unicode set each character belongs to
# Create Series of character sets
char_set_series = pd.Series('', index=char_series.index)
char_set_series[common_mask] = ''
char_set_series[extension_a_mask] = 'A'
char_set_series[extension_b_mask] = 'B'
char_set_series[extension_c_mask] = 'C'
char_set_series[extension_d_mask] = 'D'
char_set_series[extension_e_mask] = 'E'
char_set_series[extension_f_mask] = 'F'
char_set_series[extension_g_mask] = 'G'
char_set_series[extension_h_mask] = 'H'

#df['char_set'] = char_set_series

#return char_series
    


## Tree Parsing


In [None]:
# level 1

level_one_right_child_dict = {}
level_two_right_left_descendant = {}
level_two_right_right_descendant = {}

level_one_left_child_dict = {}
level_two_left_left_descendant = {}
level_two_left_right_descendant = {}

for idx, tree in hanziDecompTreeDict.items():
    if type(idx) != int:
        print(idx,tree)
    if tree.rightChild is not None:
        level_one_right_child_dict[idx] = tree.rightChild.data

        if tree.rightChild.leftChild is not None:
            level_two_right_left_descendant[idx] = tree.rightChild.leftChild.data 
        if tree.rightChild.rightChild is not None:
            level_two_right_right_descendant[idx] = tree.rightChild.rightChild.data
        else:
            continue


    else:
        continue
    if tree.leftChild is not None:
        level_one_left_child_dict[idx] = tree.leftChild.data
        if tree.leftChild.leftChild is not None:
            level_two_left_left_descendant[idx] = tree.leftChild.leftChild.data 
        if tree.leftChild.rightChild is not None:
            level_two_left_right_descendant[idx] = tree.leftChild.rightChild.data
        else:
            continue
    

In [None]:
def traverse_tree(node):
    """
    Recursively traverses the tree and returns a dictionary of nodes.
    """
    if node is None:
        return {}

    left_descendant = traverse_tree(node.leftChild)
    right_descendant = traverse_tree(node.rightChild)

    node_dict = {
        'data': node.data,
        'left_child': left_descendant,
        'right_child': right_descendant
    }

    return node_dict

hanzi_idx = 1001
# Example usage:
tree_dict = traverse_tree(hanziDecompTreeDict[hanzi_idx])



In [None]:
def flatten_tree(tree, hanzi_idx):
    result = []
    counter = 0

    def flatten_node(node, hanzi_idx):
        nonlocal counter
        counter += 1
        
        if counter > 1:
            result.append((hanzi_idx, node['data'], counter - 1))

        if node['left_child']:
            flatten_node(node['left_child'], hanzi_idx )
        if node['right_child']:
            flatten_node(node['right_child'], hanzi_idx)

    flatten_node(tree, hanzi_idx)

    return result



In [None]:
flat_component_list = []

for hanzi_idx, RootNode in hanziDecompTreeDict.items():
    tree_dict = traverse_tree(RootNode)
    flat_rows = flatten_tree(tree_dict,hanzi_idx)
    flat_component_list = flat_component_list + flat_rows


In [None]:
df_hanzi_components = pd.DataFrame(flat_component_list, columns=['hanzi_index','component','position'])

In [None]:
# Filter out * inside of the dataset
df_hanzi_components  = df_hanzi_components[df_hanzi_components['component'] != '*']

In [None]:
df_hanzi_components.to_sql('hanzi_components',conn,if_exists='replace')

39072

## News 

### Weibo

## Chengyu (Idioms)