# Chinese Data Analysis

### Extracting Data from Tutor Session Notes


Installation instructions:

conda create -n ChineseAutomation
conda update -n base -c defaults conda
conda install ipykernel

# Library installs
!pip install -qq pinyin pandas numpy

In [16]:
#!conda install -n ChineseAutomation ipykernel --update-deps --force-reinstall
!pip install -qq pinyin pandas numpy

In [17]:

import pandas as pd
import numpy as np


from pathlib import Path
import os
from xml.etree import ElementTree
import datetime, time


import re

# Collecting XML File of Tutoring Notes

In [18]:


def find_most_recent_ompl(directory=None):
    """
    Searches a directory for .opml files and picks the newest one.

    "Newest" is decided by the YYMMDD-HHMMSS timestamp embedded in the
    file name, not by the file system's modification time. Files whose
    name carries no parseable timestamp are skipped instead of aborting
    the whole search.

    directory: folder to search; defaults to the user's Downloads folder

    returns Path Object

    raises ValueError when no .opml file with a valid timestamp is found
    """
    # Finds the downloads directory unless the caller supplied a folder
    if directory is None:
        pathDir = Path(os.path.join(os.path.expanduser('~'), 'Downloads'))
    else:
        pathDir = Path(directory)

    timestampPattern = re.compile(r'[0-9]{6}\-[0-9]{6}')
    stringToDatetimePattern = "%y%m%d-%H%M%S"

    # (timestamp, file name) pairs for every usable .opml file
    datedFiles = []

    for file in os.listdir(pathDir):
        if not file.endswith('.opml'):
            continue

        # Uses Regex to parse timestamp pattern
        found = timestampPattern.search(file)
        if found is None:
            # e.g. a manually renamed export; nothing to compare on
            continue

        try:
            stamp = datetime.datetime.strptime(found[0], stringToDatetimePattern)
        except ValueError:
            # twelve digits that do not form a real date/time
            continue

        datedFiles.append((stamp, file))

    if not datedFiles:
        raise ValueError(
            'No .opml file with a YYMMDD-HHMMSS timestamp found in {}'.format(pathDir)
        )

    # datetime objects compare chronologically, so no epoch conversion
    # (and no dependency on the `time` module name) is needed
    mostRecentFile = max(datedFiles, key=lambda pair: pair[0])[1]

    return pathDir / mostRecentFile

In [19]:
# Locate the newest export and parse it into an element tree
mostRecentNotesPath = find_most_recent_ompl()

# Binary mode lets the XML parser decode the bytes itself, using the
# encoding named in the XML declaration. Text mode would decode with the
# platform's default encoding first, which can corrupt the Chinese text.
with open(mostRecentNotesPath, 'rb') as f:
    tree = ElementTree.parse(f)

notesRoot = tree.getroot()

# <body> holds the outline nodes that the following cells walk through
notesBody = notesRoot.find('body')


In [20]:
# Finds the XML Element that contains
# My Chinese Tutoring Notes
#
# The notes live three levels down:
#   'Focus Points' -> 'Chinese' -> '好好学习，天天上上'
# Every level is reset to None first so that a missing node is reported
# explicitly instead of raising a NameError (or silently reusing a value
# left over from an earlier run of this cell).
outerElem1 = None
outerElem2 = None
chineseBaseElem = None

# Finds Base Element
for elem in notesBody:
    if elem.get('text', '') == 'Focus Points':
        outerElem1 = elem

if outerElem1 is None:
    raise LookupError("Could not find the 'Focus Points' node in the notes export")

# Drills deeper
for elem in outerElem1:
    if elem.get('text', '').strip() == 'Chinese':
        outerElem2 = elem
        print(outerElem2.get('text', ''))

if outerElem2 is None:
    raise LookupError("Could not find the 'Chinese' node under 'Focus Points'")

# Finds the part with the tutoring notes
for elem in outerElem2:
    print(elem.get('text', ''))
    if elem.get('text', '').strip() == '好好学习，天天上上':
        chineseBaseElem = elem

if chineseBaseElem is None:
    raise LookupError("Could not find the '好好学习，天天上上' node under 'Chinese'")

# This avoids parsing through the rest of the data
# that is exported by my workflowy app

Chinese
好好学习，天天上上
SOP for different times, lesson plans


In [21]:
# Sanity check: evaluate the 'text' attribute of the node selected as
# the root of the Chinese tutoring notes
chineseBaseElem.attrib['text']

'好好学习，天天上上'

In [22]:
# Two children of the Chinese notes root are needed later on:
#   'Categories'          - the vocabulary categories assigned by hand
#   'Tutor Session Notes' - the raw notes, grouped by the date
#                           they were recorded
# Every child's title is printed so the structure can be eyeballed.

vocabCategoriesBaseElem = None
tutorSessionNotesBaseElem = None

for elem in chineseBaseElem:
    nodeTitle = elem.attrib['text']
    print(nodeTitle)

    if nodeTitle == 'Categories':
        vocabCategoriesBaseElem = elem
        continue

    if nodeTitle == 'Tutor Session Notes':
        tutorSessionNotesBaseElem = elem

Study Plans
videos 
Categories
Inbox
Tutor Session Notes
社交平台 she4 jiao1 ping2 tai2


# Part 2: Extracting XML Data into a Dataframe

In [23]:
# Goal here is to extract the notes from each
# Tutoring Session by date with a list of
# Each of the elements that contain them


# Looks for element text which matches internal
# workflowy pattern in opml
timePattern = re.compile('time startYear')

# Parallel lists: elementsByDate[i] was recorded on elementsDateAdded[i]
elementsByDate = []
elementsDateAdded = []

# Checks for elements which contain dates
# Which will contain sub elements with vocab
# Appends them to a list, along with the dates
# From before
for outline in tutorSessionNotesBaseElem.iter():

    text = outline.attrib['text']

    # Only nodes whose text carries the date markup start a new session
    hasTime = timePattern.search(text)

    if hasTime:
        # Extracting datetime info
        # The whitespace-separated tokens 1..3 each hold one quoted value;
        # they are read as year, month and day in that order.
        # NOTE: this list must not be called `time` - that would rebind
        # the imported `time` module for every other cell in the notebook.
        timeTokens = text.split()
        year = timeTokens[1].split('\"')[1]
        month = timeTokens[2].split('\"')[1]
        day = timeTokens[3].split('\"')[1]
        dateAdded = year + ' ' + month + ' ' + day

        elementsByDate.append(outline)
        elementsDateAdded.append(dateAdded)

In [24]:
# Take the first element of elementsByDate
# (raises IndexError when the list is empty)
tutorSession = elementsByDate[0]

In [25]:
# Function to find if a character is Chinese
def is_cjk(char):
    """
    Tells whether a single character is a CJK ideograph.

    char: a string of length one (ord() rejects anything else)

    returns True when the character's code point falls inside one of the
    inclusive ranges listed below, False otherwise
    """
    codePoint = ord(char)

    # Inclusive (first, last) code point pairs; adjacent ranges are
    # written as one span
    ideographRanges = (
        (0x3400, 0x4DB5),
        (0x4E00, 0x9FCC),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B734),
        (0x2B740, 0x2B81D),
        (0x2B820, 0x2EBEF),
        (0x2F800, 0x2FA1F),
    )

    return any(first <= codePoint <= last for first, last in ideographRanges)


# Pinyin syllable matcher, split into an `initial` and a `final` group.
#
# The pattern is assembled from the table below instead of being written
# out as one multi-kilobyte literal: a literal that long is easily broken
# by a stray line break, which is a syntax error in a single-quoted
# string and would also corrupt the alternatives around the break.
#
# Each entry is (initial, finals allowed after that initial). The empty
# initial covers syllables that consist of a final only (e.g. "er").
# NOTE: j, q and x are listed with ü-spelled finals, so "jü" matches
# while "ju" does not.
_PINYIN_FINALS_BY_INITIAL = (
    ('ch', 'uang ang eng ong uai uan ai an ao en ou ua ui un uo a e i u'),
    ('zh', 'uang ang eng ong uai uan ai an ao ei en ou ua ui un uo a e i u'),
    ('sh', 'uang ang eng uai uan ai an ao ei en ou ua ui un uo a e i u'),
    ('c', 'ang eng ong uan ai an ao en ou ui un uo a e i u'),
    ('b', 'ang eng ian iao ing ai an ao ei en ie in a i o u'),
    ('d', 'ang eng ian iao ing ong uan ai an ao ei en ia ie iu ou ui un uo a e i u'),
    ('g', 'uang ang eng ong uai uan ai an ao ei en ou ua ui un uo a e u'),
    ('f', 'ang eng iao an ei en ou a o u'),
    ('h', 'uang ang eng ong uai uan ai an ao ei en ou ua ui un uo a e u'),
    ('k', 'uang ang eng ong uai uan ai an ao en ou ua ui un uo a e u'),
    ('j', 'iang iong ian iao ing üan ia ie in iu üe ün i ü'),
    ('m', 'ang eng ian iao ing ai an ao ei en ie in iu ou a e i o u'),
    ('l', 'iang ang eng ian iao ing ong uan ai an ao ei ia ie in iu ou un uo üe a e i o u ü'),
    ('n', 'iang ang eng ian iao ing ong uan ai an ao ei en ie in iu ou un uo üe a e i u ü'),
    ('q', 'iang iong ian iao ing üan ia ie in iu üe ün i ü'),
    ('p', 'ang eng ian iao ing ai an ao ei en ie in ou a i o u'),
    ('s', 'ang eng ong uan ai an ao en ou ui un uo a e i u'),
    ('r', 'ang eng ong uan an ao en ou ua ui un uo e i u'),
    ('t', 'ang eng ian iao ing ong uan ai an ao ei ie ou ui un uo a e i u'),
    ('w', 'ang eng ai an ei en a o u'),
    ('y', 'ang ing ong uan ai an ao in ou ue un a e i o u'),
    ('x', 'iang iong ian iao ing üan ia ie in iu üe ün i ü'),
    ('z', 'ang eng ong uan ai an ao ei en ou ui un uo a e i u'),
    ('', 'a ai an ang ao e ei en eng er o ou'),
)

# Alternation for the `initial` group; two-letter initials come first so
# "zh" is not consumed as "z", and the trailing empty branch allows
# syllables without an initial.
_PINYIN_INITIALS = 'ch|zh|sh|r|c|b|d|g|f|h|k|j|m|l|n|q|p|s|t|w|y|x|z|'


def _pinyin_final_guard(initial):
    """Return the lookbehind that ties a final to the given initial."""
    if initial == '':
        # a bare final must not directly follow any single-letter initial
        return '(?<!r|c|b|d|g|f|h|k|j|m|l|n|q|p|s|t|w|y|x|z)'
    if initial == 'h':
        # plain "h" only - not the tail of "sh", "ch" or "zh"
        return '(?<!sh|ch|zh)(?<=h)'
    return '(?<={})'.format(initial)


def _pinyin_final_group(initial, finals):
    """Return the non-capturing alternation of finals for one initial."""
    guard = _pinyin_final_guard(initial)
    return '(?:' + '|'.join(guard + final for final in finals.split()) + ')'


pinyinRegex = re.compile(
    '^(?P<initial>' + _PINYIN_INITIALS + ')'
    + '(?P<final>'
    + '|'.join(_pinyin_final_group(initial, finals)
               for initial, finals in _PINYIN_FINALS_BY_INITIAL)
    + ')$'
)

#pinyinRegex = r'(miu|[pm]ou|[bpm](o|e(i|ng?)?|a(ng?|i|o)?|i(e|ng?|a[no])?|u))|(f(ou?|[ae](ng?|i)?|u))|(d(e(i|ng?)|i(a[on]?|u))|[dt](a(i|ng?|o)?|e(i|ng)?|i(a[on]?|e|ng|u)?|o(ng?|u)|u(o|i|an?|n)?))|(neng?|[ln](a(i|ng?|o)?|e(i|ng)?|i(ang|a[on]?|e|ng?|u)?|o(ng?|u)|u(o|i|an?|n)?|ve?))|([ghk](a(i|ng?|o)?|e(i|ng?)?|o(u|ng)|u(a(i|ng?)?|i|n|o)?))|(z[h]?ei|[cz]hua(i|ng?)?|[cz][h]?(a(i|ng?|o)?|en?g?|o(u|ng)?|u(a?n|o|i)?|(e|i)))|(song|shua(i|ng?)?|shei|s[h]?(a(i|ng?|o)?|en?g?|ou|u(a?n|o|i)?|i))|(r([ae]ng?|i|e|ao|ou|ong|u[oin]|ua?n?))|([jqx](i(a(o|ng?)?|[eu]|ong|ng?)?|u(e|a?n)?))|(wu|w?(a(i|o|ng?)?|ou?|e(i|ng?)?))|y(a(o|ng?)?|e|in?g?|o(u|ng)?|u(e|a?n)?[1234]?)'

In [54]:

def extract_cn_chars_from_indv_session(tutorSession):
    """
    Collects the Chinese text of every note in one tutoring session.

    tutorSession: element whose own 'text' attribute and those of all
                  elements yielded by its .iter() are scanned

    returns a list with one string per note, each holding only the
    characters accepted by is_cjk, in their original order; notes
    without any such character are left out
    """
    # Keep only the CJK characters of each note's raw text
    cnStringsPerNote = (
        ''.join(filter(is_cjk, tutorsNote.attrib['text']))
        for tutorsNote in tutorSession.iter()
    )

    # TODO: pinyin is not extracted yet - still needs a way to judge
    # how likely a piece of the note is pinyin or not

    # Empty strings come from notes that held no Chinese characters
    return [cnString for cnString in cnStringsPerNote if cnString]



In [52]:
# Scratch check: `n * [x]` repeats a list and `+` concatenates lists
['3'] + 6 * ['2']

['3', '2', '2', '2', '2', '2', '2']

In [63]:
# Flatten all sessions into two aligned columns:
# one vocab string per row, paired with the date of its session
ChineseVocabColumn = []
dateColumn = []

for sessionDate, sessionElem in zip(elementsDateAdded, elementsByDate):
    cnVocabFromSession = extract_cn_chars_from_indv_session(sessionElem)

    # repeat the session's date once for every vocab string it produced
    dateColumn.extend([sessionDate] * len(cnVocabFromSession))
    ChineseVocabColumn.extend(cnVocabFromSession)

print(len(ChineseVocabColumn))

561


In [64]:
# One row per extracted vocab string, with the date of its session
cnVocabularyDF = pd.DataFrame(
    dict(Vocab=ChineseVocabColumn, Date=dateColumn)
)

In [66]:
# to_csv does not create missing folders, so make sure ./data exists
# before writing the extract
Path('./data').mkdir(parents=True, exist_ok=True)
cnVocabularyDF.to_csv('./data/initialExtractVocabAndDateAdded.csv')