In [None]:
# Install necesseray libraries

!pip install ankipandas
!pip install genanki

In [None]:
# Imports

from ankipandas import Collection
import pandas as pd
import re
import matplotlib.pyplot as plt
import random
import genanki

In [None]:
# Fill in the location of your Anki collection; the collection is opened right after.
# NOTE(review): the empty string is a placeholder to be replaced with your own path —
# how ankipandas treats an empty path is not visible here, so confirm before relying on it.

collection_location = ''
col = Collection(collection_location)

In [None]:
# Name of the deck with the Heisig lessons. Change the name if it has a different one in your collection

kanji_deck = 'Remembering the Kanji 1, 6th edition (2200 kanji)'

# Notes that belong to the kanji deck: keep the cards whose deck name equals
# kanji_deck and read the ids of the notes they were generated from.

kanji_notes = col.cards[col.cards.cdeck == kanji_deck].nid.values

In [None]:
# The deck that holds the sample sentences. Replace the name with the deck you want to draw sentences from.

vocab_deck = 'Core 2k/6k Optimized Japanese Vocabulary with Sound Part 01'

# Ids of the notes whose cards sit in the sentence deck

in_vocab_deck = col.cards.cdeck == vocab_deck
vocab_notes = col.cards.loc[in_vocab_deck, 'nid'].values

In [None]:
# Pull the field lists ('nflds') of the notes that belong to the sentence deck

is_vocab_note = col.notes.index.isin(vocab_notes)
vocab_fields = col.notes.loc[is_vocab_note, 'nflds']

In [None]:
# This line shows the fields of the first note. Append '[0]' to the line below to see the first field, '[1]' for the
# second, and so on, to find out what a specific field contains and which number you need to extract it. (Python is
# zero-indexed, meaning the first field has number 0.) Remember the numbers of the fields you want; you'll need them in
# the cell below.

vocab_fields.values[0]

In [None]:
# Here we make a list of lists with all the fields that we'll need. Three fields are extracted, which correspond to
# the sentence with furigana marking, the translation and a reference to an audio file. The numbers 8, 10 and 12 below
# are the positions of those fields within a note; you can type 'vocab_fields.values[0][8]' as in the cell above to see
# what it is you're extracting, and change the numbers to match your own deck.
# The <b> and </b> html-markings are removed from the two text fields, though this is done very basically; the audio
# reference is kept as it is. Edit the code to add more fields to your liking, or drop the helper if you don't need it.
# You can use 'vocab_data[0]' to get a sense of the data you have compiled.

SENTENCE_FIELD = 8
TRANSLATION_FIELD = 10
AUDIO_FIELD = 12


def strip_bold(text):
    """Return `text` with every literal '<b>' and '</b>' marker removed."""
    return text.replace('<b>', '').replace('</b>', '')


vocab_data = [
    [strip_bold(note[SENTENCE_FIELD]), strip_bold(note[TRANSLATION_FIELD]), note[AUDIO_FIELD]]
    for note in vocab_fields
]

In [None]:
# Turn the extracted data into a pandas dataframe. There is one column name per extracted field, so the number of
# names has to match the number of fields taken in the previous cell, and the names have to be listed in the same
# order as those fields.

vocab_columns = ['Sentence', 'Translation', 'Audio']
vocab_data = pd.DataFrame(data=vocab_data, columns=vocab_columns)

In [None]:
# With this we print a preview of our dataframe to check that the results are to our liking

vocab_data.head()

In [None]:
# This is a function to extract all the kanji characters from a sentence. I use a regular expression to do this. The first line
# references the column 'Sentence' which I named in the cell above; if you have given your Japanese sentence field a different
# name, you should also change it here.

def extract_kanji(row):
    """Return the set of distinct kanji characters found in `row.Sentence`.

    A character counts as kanji when it falls inside one of the ranges of the
    regular expression below. The result is an empty set when none match.
    """
    kanji_pattern = r'[㐀-䶵一-鿋豈-頻]'
    return {match.group() for match in re.finditer(kanji_pattern, row.Sentence)}

In [None]:
# Run extract_kanji on every row of the dataframe and store the resulting sets in a new column, 'Kanji'.

vocab_data['Kanji'] = vocab_data.apply(extract_kanji, axis='columns')

In [None]:
# Preview our dataframe again to see it makes sense

vocab_data.head()

In [None]:
# Same approach as for the sentences: take the fields of the notes in the kanji deck, then build a dataframe holding
# the kanji (field 0) and its Heisig lesson number (field 5, converted to an integer). Feel free to separate these
# steps into separate cells if that is clearer or if you need to adjust them to your deck.

is_kanji_note = col.notes.index.isin(kanji_notes)
kanji_fields = col.notes.loc[is_kanji_note, 'nflds'].values
kanji_data = pd.DataFrame({
    'Kanji': [fields[0] for fields in kanji_fields],
    'Lesson': [int(fields[5]) for fields in kanji_fields],
})

In [None]:
# Preview our kanji data

kanji_data.head()

In [None]:
# A function to add the starting lesson to our sentence dataframe.

def starting_lesson(row):
    """Return the Heisig lesson from which the sentence in `row` can be studied.

    Reads the kanji of the sentence from `row.Kanji` (rename here if your column
    is called differently) and looks them up in the `kanji_data` dataframe.

    Returns:
        0 when the sentence contains no kanji at all;
        the highest lesson number among the sentence's kanji found in
        `kanji_data` (kanji missing from `kanji_data` are ignored as long as at
        least one kanji is found);
        9999 when none of the sentence's kanji appear in `kanji_data`.
    """
    sentence_kanji = row.Kanji

    # Nothing to look up: the sentence can be learned without any kanji.
    if not len(sentence_kanji):
        return 0

    # Lesson numbers of the kanji from this sentence that are known to kanji_data.
    known_lessons = kanji_data.loc[kanji_data.Kanji.isin(sentence_kanji), 'Lesson']

    return known_lessons.max() if len(known_lessons) else 9999

In [None]:
# Compute the starting lesson for every sentence and store it in a new column, 'Lesson_number'.

vocab_data['Lesson_number'] = vocab_data.apply(starting_lesson, axis='columns')

In [None]:
# This is just a graph so you can see how many sentences you can learn already.
# Sentences marked with lesson 9999 (the value starting_lesson returns when none of their kanji are in the kanji deck)
# are left out: a single point at x=9999 would stretch the axis and flatten the part of the curve you care about.

uncovered_lesson = 9999
plottable = vocab_data[vocab_data.Lesson_number != uncovered_lesson]

# Number of distinct sentences per starting lesson, accumulated over the lessons.
grouping = plottable.groupby('Lesson_number').Sentence.nunique().to_frame().cumsum()

ax = grouping.plot(figsize=(15, 12), legend=False)
ax.set(
    title='Sentences available per Heisig lesson',
    xlabel='Heisig lesson',
    ylabel='Cumulative number of sentences',
)
plt.show()

In [None]:
# Preview our data

vocab_data.head()

In [None]:
# Title and name of the new deck, and the name of its note model. All three share one label.

anki_deck_title = anki_model_name = anki_deck_name = 'Selection 2k'

In [None]:
# Model id. A new random id is drawn every time this cell runs.
# NOTE(review): genanki's documentation suggests generating such a number once and hard-coding it, so that
# re-created decks keep the same model — consider doing that once you are happy with the layout.

model_id = random.randrange(1 << 30, 1 << 31)

# Style of our cards; feel free to change to your liking

style = """
.card {
 font-family: arial;
 font-size: 20px;
 text-align: center;
 color: black;
 background-color: white;
}

"""

# Layout of our cards, again feel free to change. qfmt is the front of the card, afmt the back.
# The note has four fields; the templates below display translation (front) and sentence plus audio (back).
# The back template no longer ends with a closing </p> that had no matching opening tag.

anki_model = genanki.Model(
    model_id,
    anki_model_name,
    fields=[{"name": "sentence"}, {"name": "translation"}, {"name": "heisig_lesson"}, {"name": "audio"}],
    templates=[
        {
            "name": "Card 1",
            "qfmt": '<span style="font-size: 20px; ">{{translation}}</span>',
            "afmt": '<span style="font-family: ＭＳ ゴシック; ">{{furigana:sentence}}</span><br><br>{{audio}}',
        }
    ],
    css=style,
)

In [None]:
# Build one anki note per row of the sentence dataframe. The attribute names used on `row` must match the column
# names you chose when creating the dataframe. The field order follows the model: sentence, translation,
# lesson number (as text) and audio.

anki_notes = [
    genanki.Note(
        model=anki_model,
        fields=[row.Sentence, row.Translation, str(row.Lesson_number), row.Audio],
    )
    for _, row in vocab_data.iterrows()
]

In [None]:
# Create the deck with an id of its own instead of reusing the model id.
deck_id = random.randrange(1 << 30, 1 << 31)
anki_deck = genanki.Deck(deck_id, anki_deck_title)
anki_package = genanki.Package(anki_deck)

# Add flashcards to the deck
for anki_note in anki_notes:
    anki_deck.add_note(anki_note)

# Save the deck to a file. Fill in the path of the file to create before running this cell.
file_location = ''
if not file_location:
    # Fail with a clear message instead of an obscure error from writing to an empty path.
    raise ValueError("file_location is empty: fill in the path of the deck file to create")
anki_package.write_to_file(file_location)

print("Created deck with {} flashcards".format(len(anki_deck.notes)))