In [156]:
import pandas as pd
from pathlib import Path

import pinyin
import translators
from xml.etree import ElementTree

import re
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans


In [157]:

PathForNotes = Path('/home/jentlejames/Downloads') 


# Target for automation
fileName = 'WF - Notes - 220722-110837.opml'


main_vocab_df = pd.read_csv('./vocab_master.csv',index_col=0)

# Chinese Japanese Character Ranges
cjk_ranges = [
    ( 0x4E00,  0x62FF),
    ( 0x6300,  0x77FF),
    ( 0x7800,  0x8CFF),
    ( 0x8D00,  0x9FCC),
    ( 0x3400,  0x4DB5),
    (0x20000, 0x215FF),
    (0x21600, 0x230FF),
    (0x23100, 0x245FF),
    (0x24600, 0x260FF),
    (0x26100, 0x275FF),
    (0x27600, 0x290FF),
    (0x29100, 0x2A6DF),
    (0x2A700, 0x2B734),
    (0x2B740, 0x2B81D),
    (0x2B820, 0x2CEAF),
    (0x2CEB0, 0x2EBEF),
    (0x2F800, 0x2FA1F),]

In [158]:
# Functions

def is_cjk(char):
    char = ord(char)
    for bottom, top in cjk_ranges:
        if char >= bottom and char <= top:
            return True
    return False

In [159]:
#import sys

file_opml = PathForNotes/fileName         #sys.argv[1]


with open(file_opml, 'rt') as f:
    tree = ElementTree.parse(f)


In [160]:

# Drills down the the relevant element on the tree for Chinese notes
# In future can change to find automatically 

notesRoot = tree.getroot()
notesBody = notesRoot.find('body')
outlineRoot = notesBody.find('outline')


In [161]:

# Looks for element text which matches internal 
# workflowy pattern in opml 
timePattern = re.compile('time startYear')

# Creating list of elements
# Which contain lesson notes 
elementsByDate = []
elementsDateAdded = []


# Checks for elements which contain dates
# Which will contain sub elements with vocab
# Appends them to a list, along with the dates
# From before
for outline in outlineRoot.iter():

    text = outline.attrib['text']
    
    #Extracts the Date 
    # finds new nodes
    
    hasTime = re.search(timePattern,text)
    
    if hasTime:
        #Extracting datetime info
        time = text.split()
        year = time[1].split('\"')[1]
        month = time[2].split('\"')[1]
        day = time[3].split('\"')[1]
        dateAdded = year+' '+month +' '+day
        #print(dateAdded)
        
        elementsByDate.append(outline)
        elementsDateAdded.append(dateAdded)
    else:
        continue 
        
    
        

In [162]:
#print(elementsByDate)

# Creates a dataframe of all the vocabulary accummulated
# In lessons

vocab_df = pd.DataFrame({},columns=['Characters','Date'])

for i, lessonNotes in enumerate(elementsByDate):
    
    vocabFromLesson = []
    
    #print(elementsDateAdded[i])
    for note in lessonNotes:
        noteText = note.attrib['text']
        
        chineseCharacters = ''
    
        for char in noteText:
            if is_cjk(char):
                chineseCharacters += char
        
        vocabFromLesson.append(chineseCharacters)
    
    #Generates a column of same shape as vocab
    dateColumn = [elementsDateAdded[i]] * len(vocabFromLesson)
    lessonDictionary = {'Characters': vocabFromLesson,
                        'Date': dateColumn}
    
    #dict(zip(vocabFromLesson,dateColumn))
    
    df_dictionary = pd.DataFrame(lessonDictionary,columns=['Characters','Date'])
    
    #print(df_dictionary.head())
    vocab_df = pd.concat([vocab_df,df_dictionary],ignore_index=True)
    
        
        
    
# Check if date is already contained



vocab_df.Date = pd.to_datetime(vocab_df.Date)


# Checks for new values by comparing the date column 
# Creates a temp df to add vocab

new_words_index = main_vocab_df['Characters'][~main_vocab_df['Characters'].isin(vocab_df)].index
new_vocab_temp_df = vocab_df.iloc[new_words_index]



In [163]:
# Check if date is already contained


vocab_df.Date = pd.to_datetime(vocab_df.Date)


In [164]:

# Checks for new values by comparing the date column 
# Creates a temp df to add vocab
new_words_index = main_vocab_df['Characters'][~main_vocab_df['Characters'].isin(vocab_df)].index

In [165]:
new_vocab_temp_df = vocab_df.iloc[new_words_index]

In [166]:
# Adds translation and pinyin to new entries 

if new_vocab_temp_df.shape[0] > 0:

    new_vocab_temp_df['Pinyin'] = new_vocab_temp_df['Characters'].apply(lambda letters: pinyin.get(letters))
# Calls Google Translate API to translate new vocab

    definitions = []
    for term in new_vocab_temp_df['Characters'].iteritems():
        # Slow calls 
        definition = translators.google(term[1],from_language='zh-CN',to_language='en')
        definitions.append(definition)

    new_vocab_temp_df['Meaning'] = definitions
    
    
    # main__vocab_df = pd.concat([main_vocab_df,new_vocab_temp_df],ignore_index=True).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_vocab_temp_df['Pinyin'] = new_vocab_temp_df['Characters'].apply(lambda letters: pinyin.get(letters))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_vocab_temp_df['Meaning'] = definitions


In [169]:
pd.concat([main_vocab_df,new_vocab_temp_df],ignore_index=True).reset_index(drop=True)
#main_vocab_df = new_vocab_temp_df

### NLP Clustering

In [170]:
model = SentenceTransformer("ckiplab/albert-base-chinese",device='cpu')

No sentence-transformers model found with name /home/jentlejames/.cache/torch/sentence_transformers/ckiplab_albert-base-chinese. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/jentlejames/.cache/torch/sentence_transformers/ckiplab_albert-base-chinese were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.bias', 'predictions.decoder.weight', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

In [171]:
# Creating Embeddings for vocab characters
#main_vocab_df.dropna(inplace=True)
#main_vocab_df.reset_index()

corpus = list(main_vocab_df['Characters'])

corpus_embeddings = model.encode(corpus)

In [172]:
# Perform kmean clustering
num_clusters = 8
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])


In [173]:
clusterKeys = {}

for i, cluster in enumerate(clustered_sentences):
    #print("Cluster ", i+1)
    keys = dict(zip(cluster, [i] * len(cluster)))
    clusterKeys.update(keys)

In [174]:
main_vocab_df['cluster'] = main_vocab_df['Characters'].map(clusterKeys)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_vocab_df['cluster'] = main_vocab_df['Characters'].map(clusterKeys)


In [175]:
main_vocab_df.sort_values('cluster',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_vocab_df.sort_values('cluster',inplace=True)


In [178]:
main_vocab_df.reset_index(drop=True)

Unnamed: 0,Characters,Date,Pinyin,Meaning,cluster
0,洗涤灵,2022-07-22,xǐdílíng,Washing spirit,0
1,锅边素,2022-07-22,guōbiānsù,Pista,0
2,绿皮火车,2022-07-21,lv̀píhuǒchē,Green leather train,0
3,电饭锅,2022-07-14,diànfànguō,Rice cooker,0
4,手眼协调,2022-06-17,shǒuyǎnxiédiào,Hand -eye coordination,0
...,...,...,...,...,...
223,滋生细菌,2022-07-22,zīshēngxìjūn,Breeding bacteria,7
224,非技术性的,2022-06-10,fēijìzhúxìngde,Non -technical,7
225,保健品,2022-07-22,bǎojiànpǐn,Health products,7
226,职位,2022-07-21,zhíwèi,Position,7


In [179]:

main_vocab_df.to_csv('vocab_master.csv')

In [None]:
# Creating the flashcards 

