French Genanki Script
====
This Python notebook automates looking up French words and creating an Anki deck of definitions.
It does the following:
1. Reads csv file of french words to look up
1. Searches the Collins dictionary for the definition, IPA and audio
1. Gets a language specific image from Bing
1. Resizes the image
1. Builds and exports an anki deck

How to use:
---
1. get an API key for bing image search
1. edit bing_settings.yaml
1. place search terms in anki_search.csv
1. run all cells
1. Import your deck
1. Check that everything is okay in Anki, then change the notes/cards to your preferred deck
1. Delete all the media files when you're done.

Troubleshooting
---
* If the CSV file can't be read, check its encoding (see `csv_file_encoding` in the settings cell)

In [44]:
from bs4 import BeautifulSoup as bs
import requests as req
import re, csv, yaml, genanki, random
from pathlib import Path
from PIL import Image
from resizeimage import resizeimage
from textwrap import dedent

In [45]:
### Main settings here ###

# Input: CSV file holding the search terms, and the text encoding used to read it.
search_csv_filename = "anki_search.csv"
csv_file_encoding = 'mac_roman'

# YAML files: persisted genanki deck/model IDs, and the Bing image search settings.
genanki_id_yaml = "genanki_ids.yaml"
bing_settings_yaml_filename = 'bing_settings.yaml'

# Target size handed to the image resize step as [x, y].
resize_image_x = 400
resize_image_y = 300

# note to self: don't upload your api keys to github!
#bing_settings_yaml_filename = 'bing_settings_personal.yaml'

In [46]:
# create two random numbers for deck and model IDs
# if no file exists, create a genanki_ids.yaml file

def create_id():
    """Return a random integer in [2**30, 2**31) to use as a genanki ID."""
    return random.randrange(1 << 30, 1 << 31)

# The IDs are written only when the file is missing, so later runs reuse
# the same deck/model IDs instead of generating new ones.
if not Path(genanki_id_yaml).is_file():
    with open(genanki_id_yaml, 'wt') as f:
        f.write('deck_id: ' + str(create_id()) + '\n')
        f.write('model_id: ' + str(create_id()) + '\n')

# load IDs
ids = {}
with open(genanki_id_yaml, 'r') as stream:
    try:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # current PyYAML releases and is unsafe on older ones.
        # An empty file parses to None, so fall back to an empty dict.
        ids = yaml.safe_load(stream) or {}
    except yaml.YAMLError as exc:
        print(exc)

In [47]:
# Index into the "value" list of the Bing response: which result should we take?
image_idx = 0
# Collins French-English lookup URL; "{}" is filled with the search term via str.format.
collins = "https://www.collinsdictionary.com/dictionary/french-english/{}"

In [48]:
# set up BING
# load BING image search API key and query options from the settings YAML
settings = {}
with open(bing_settings_yaml_filename, 'r') as stream:
    try:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # current PyYAML releases and is unsafe on older ones.
        settings = yaml.safe_load(stream) or {}
        print('Loaded BING API settings')
    except yaml.YAMLError as exc:
        print(exc)

# Request header carrying the API key, sent with every image search.
headers = {"Ocp-Apim-Subscription-Key": settings['subscription_key']}

# Query parameters shared by every image search; the per-word 'q' entry is
# added later, when the search is issued.
# The 'freshness' and 'license' options seemed to hinder the results, so
# they are deliberately left out.
params = {
    'setLang': settings['setLang'],
    'mkt': settings['mkt'],
    'imageType': settings['imageType'],
    'count': settings['count'],
}

Loaded BING API settings


In [49]:
# load words to look up from csv file
# Assumes no header row and a single column holding the search terms.
search_list = []
# newline='' is what the csv module expects for files handed to csv.reader.
with open(search_csv_filename, encoding=csv_file_encoding, newline='') as f:
    for row in csv.reader(f):
        # Blank lines come back as empty rows; skip them instead of failing
        # on row[0], and ignore cells that contain only whitespace.
        if not row:
            continue
        term = row[0].strip()
        if term:
            search_list.append(term)

search_list

['ourson',
 'tanière',
 'habile',
 'troupeau',
 'alentours',
 'craintive',
 'meute',
 'bouquetin',
 'rusé',
 'ailes']

In [50]:
def download(url, filename, timeout=30):
    """Download ``url`` and save the response body to ``filename``.

    The request is made and checked before the file is opened, so a failed
    download does not leave an empty file behind that a later run would
    mistake for an already-downloaded one.

    Args:
        url: Address of the resource to fetch.
        filename: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.HTTPError: The server answered with an error status.
        requests.exceptions.Timeout: The server did not answer within ``timeout``.
    """
    print('Downloading: '+ filename)
    response = req.get(url, timeout=timeout)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)

In [51]:
# get data for all search terms
#
# For every search term this cell:
#   1. scrapes the Collins page for definition, part of speech, IPA and audio,
#   2. downloads the pronunciation audio,
#   3. asks Bing for an image and downloads it,
#   4. writes a resized copy of the image,
# and collects everything in a dict appended to `results`.

# Loop-invariant values, built once instead of on every iteration.
regex = re.compile(r"(masculine|feminine)\s(noun)")
sound_attrs = {'class': " ".join(['hwd_sound',
                                  'sound',
                                  'audio_play_button',
                                  'icon-volume-up',
                                  'ptr'])}
bing_advanced_query = " language:{} loc:{}" \
    .format(settings['language'], settings['loc'])

results = []
for search in search_list:
    print('Processing: '+ search)
    result = {}
    result['word'] = search

    ###
    # process collins search
    collins_query = collins.format(search)
    result['collins_query'] = collins_query
    collinsReq  = req.get(collins_query)
    collinsData = collinsReq.text
    collinsSoup = bs(collinsData, "html.parser")

    # get definition
    result['definition'] = \
        collinsSoup.find('span',
                         attrs={'class': 'cit type-translation'}).text.strip()

    # get part of speech
    result['pos'] = collinsSoup.find('span',
                                     attrs={'class': 'pos'}).text.strip()

    # create article for nouns (only set when the part of speech is a
    # masculine or feminine noun; other words get no 'article' key)
    match = regex.match(result['pos'])
    if match is not None:
        if match.group(1) == "feminine":
            result['article'] = 'une'
        else:
            result['article'] = 'un'

    # get IPA
    result['ipa'] = \
        collinsSoup.find('span', attrs={'class': 'pron type-'}).text.strip()

    # get audio file url
    soundElement = collinsSoup.find('a', attrs=sound_attrs)
    result['sound_url'] = soundElement.get('data-src-mp3')

    # download files
    result['audio_file'] = search + '_pronounce.mp3'

    # if file doesn't exist locally then save it
    if not Path(result['audio_file']).is_file():
        download(result['sound_url'], result['audio_file'])
    else:
        print("File exists, skipping: " + result['audio_file'])

    ###
    # process bing image
    params['q'] = search + bing_advanced_query
    response = req.get(settings['image_api_url'], headers=headers, params=params)
    response.raise_for_status()
    search_results = response.json()

    result['bing_results_json'] = search_results
    result['image_url'] = search_results["value"][image_idx]["contentUrl"]
    result['image_page_url'] = search_results["value"][image_idx]["hostPageUrl"]

    # get end of url after last /, ignoring any ?query or #fragment so they
    # do not end up inside the local file name/extension
    image_url_path = re.split(r'[?#]', result['image_url'], maxsplit=1)[0]
    original_image_filename = image_url_path.rsplit('/', 1)[-1]
    original_image_ext = original_image_filename.rsplit('.', 1)[-1]

    result['image_file_original'] = original_image_filename
    result['image_file'] = search + "." + original_image_ext

    # download image file
    if not Path(result['image_file']).is_file():
        try:
            download(result['image_url'], result['image_file'])
        except req.exceptions.HTTPError:
            # The bare name HTTPError is not defined in this notebook; the
            # exception raised by raise_for_status() lives in requests.
            print("Couldn't download image, skipping")
            result['image_file'] = None
    else:
        print("File exists, skipping: " + result['image_file'])

    ###
    # resize image file
    # Stays None when there is no image, so Path() is never called on None.
    result['image_file_resized'] = None
    if result['image_file'] is not None:
        resized_filename = search + "_resized." + original_image_ext

        # Only resize when the source exists and no resized copy is present yet.
        if Path(result['image_file']).is_file() and not Path(resized_filename).is_file():
            with open(result['image_file'], 'rb') as f:
                with Image.open(f) as image:
                    print("Resizing image: " + result['image_file'])
                    cover = resizeimage.resize_cover(image, [resize_image_x, resize_image_y])
                    cover.save(resized_filename, image.format)
        result['image_file_resized'] = resized_filename

    results.append(result)
print("Completed Queries!")

Processing: ourson
Downloading: ourson_pronounce.mp3
Downloading: ourson.jpg
Resizing image: ourson.jpg
Processing: tanière
Downloading: tanière_pronounce.mp3
Downloading: tanière.jpg
Resizing image: tanière.jpg
Processing: habile
Downloading: habile_pronounce.mp3
Downloading: habile.jpg
Resizing image: habile.jpg
Processing: troupeau
Downloading: troupeau_pronounce.mp3
Downloading: troupeau.jpg
Resizing image: troupeau.jpg
Processing: alentours
Downloading: alentours_pronounce.mp3
Downloading: alentours.jpg
Resizing image: alentours.jpg
Processing: craintive
Downloading: craintive_pronounce.mp3
Downloading: craintive.jpg
Resizing image: craintive.jpg
Processing: meute
Downloading: meute_pronounce.mp3
Downloading: meute.jpg
Resizing image: meute.jpg
Processing: bouquetin
Downloading: bouquetin_pronounce.mp3
Downloading: bouquetin.jpg
Resizing image: bouquetin.jpg
Processing: rusé
Downloading: rusé_pronounce.mp3
Downloading: rusé.jpg
Resizing image: rusé.jpg
Processing: ailes
Downloadin

In [52]:
# Create Anki model
# The model declares nine fields and five card templates; the notes created
# later supply their field values in the same order as the `fields` list here.
# Every template string is passed through dedent() so the HTML can be indented
# in this source without that indentation ending up in the card.
my_model = genanki.Model(
  int(ids['model_id']),
  'French - 5k (Genaki)',
  fields=[
    {'name': 'Word or Phrase'},
    {'name': 'Article'},
    {'name': 'Part of Speech'},
    {'name': 'Definition'},
    {'name': 'Picture'},
    {'name': 'Audio'},
    {'name': 'IPA'},
    {'name': 'Mnemonic'},
    {'name': 'Source'}
  ],
  templates=[
    # Front: the picture. Back: audio, article + word, part of speech,
    # mnemonic and (if present) the source links.
    {
      'name': 'Picture2Word',
      'qfmt': "<div style='font-family: Arial; font-size: 20px;'>{{Picture}}</div>",
      'afmt': dedent("""\
                <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{#Article}}{{Article}}{{/Article}}&nbsp;{{Word or Phrase}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Part of Speech}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Mnemonic}}</div>
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}
                """),
    },
    # Front: audio, article + word and part of speech. Back: picture,
    # definition and source, each wrapped in a conditional on its own field.
    {
      'name': 'Word2Picture',
      'qfmt': dedent("""\
              <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>
              <div style='font-family: Arial; font-size: 20px;'>{{#Article}}{{Article}}{{/Article}}&nbsp;{{Word or Phrase}}</div>
              <div style='font-family: Arial; font-size: 20px;'>{{Part of Speech}}</div>
                  """),
      'afmt': dedent("""\
                {{#Picture}}<div style='font-family: Arial; font-size: 20px;'>{{Picture}}</div>{{/Picture}}
                {{#Definition}}<div style='font-family: Arial; font-size: 20px;'>{{Definition}}</div>{{/Definition}}
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}
                """)
    },
    # Front: a spelling prompt with audio and picture. Back: article + word.
    {
      'name': 'Spelling',
      'qfmt': dedent("""\
                Peut tu l'épeler?
                <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Picture}}</div>
                """),
      'afmt': dedent("""\
                <div style='font-family: Arial; font-size: 20px;'>{{#Article}}{{Article}}{{/Article}}&nbsp;{{Word or Phrase}}</div>
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}
                """)
    },
    # Front: "[...] word", asking for the article. The whole front sits inside
    # {{#Article}}...{{/Article}}, so it has content only when Article is set.
    {
      'name': 'Article',
      'qfmt': dedent("""\
                {{#Article}}
                <div style='font-family: Arial; font-size: 20px;'>[...] {{Word or Phrase}}</div>
                {{/Article}}
                """),
      'afmt': dedent("""\
                <div style='font-family: Arial; font-size: 20px;'>{{Article}}&nbsp;{{Word or Phrase}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Mnemonic}}</div>
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}
                """)
    },
    # Front: the written word with a pronunciation prompt, wrapped in
    # conditionals on both Audio and IPA. Back: IPA, audio and source.
    {
      'name': 'Pronuncation',
      'qfmt': dedent("""\
                {{#Audio}}{{#IPA}}<div style='font-family: Arial; font-size: 20px;'>
                Comment prononcez-vous ce mot?
                <BR>
                <BR>
                {{Word or Phrase}}
                </div>{{/IPA}}{{/Audio}}
                """),
      'afmt': dedent("""\
                <div style='font-family: Arial; font-size: 20px;'>{{IPA}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>Sources:&nbsp;{{Source}}</div>{{/Source}}
                """)
    }
  ],
# Shared card styling applied to every template above.
css=""".card {
 font-family: arial;
 font-size: 20px;
 text-align: center;
 color: black;
 background-color: white;
}"""
)

In [53]:
# recall model defn -- the values below are listed in this field order:
#     {'name': 'Word or Phrase'},
#     {'name': 'Article'},
#     {'name': 'Part of Speech'},
#     {'name': 'Definition'},
#     {'name': 'Picture'},
#     {'name': 'Audio'},
#     {'name': 'IPA'},
#     {'name': 'Mnemonic'}
#     {'name': 'Source'}

# create notes for each result
my_notes = []
for item in results:
    # Leave the Picture/Audio fields empty when no file is available, rather
    # than emitting a broken <img src="None"> or an empty [sound:] tag.
    resized_image = item.get('image_file_resized')
    audio_file = item.get('audio_file')
    picture_field = '<img src="{}">'.format(resized_image) if resized_image else ''
    # [sound:sound.mp3] format for anki decks; the file name is inserted as-is
    # (it is not itself treated as a format string).
    audio_field = "[sound:{}]".format(audio_file) if audio_file else ''

    my_note = genanki.Note(
      model=my_model,
      fields=[
          item['word'],
          item.get('article', ''),
          item.get('pos', ''), # part of speech
          item.get('definition', ''),
          picture_field,
          audio_field,
          item.get('ipa', ''),
          "", # no predefined mnemonic ;)
          '<a href="{}">collins</a><br><a href="{}">image-page</a>'.format(item['collins_query'], item['image_page_url'])
      ])
    my_notes.append(my_note)


In [54]:
# Build the deck under the persisted deck ID, then attach every prepared note.
my_deck = genanki.Deck(deck_id=int(ids['deck_id']), name='French-genanki')

for note in my_notes:
    my_deck.add_note(note)

In [55]:
# create media database
# List of files to bundle with the package: resized images first, then audio.
# An image is listed only when both the downloaded file and its resized copy
# are recorded, so the filter matches the value that is actually collected.
media = [x['image_file_resized']
         for x in results
         if x.get('image_file') is not None
         and x.get('image_file_resized') is not None]
media.extend(x['audio_file']
             for x in results
             if x.get('audio_file') is not None)
media

['ourson_resized.jpg',
 'tanière_resized.jpg',
 'habile_resized.jpg',
 'troupeau_resized.jpg',
 'alentours_resized.jpg',
 'craintive_resized.jpg',
 'meute_resized.jpg',
 'bouquetin_resized.jpg',
 'rusé_resized.jpg',
 'ailes_resized.jpg',
 'ourson_pronounce.mp3',
 'tanière_pronounce.mp3',
 'habile_pronounce.mp3',
 'troupeau_pronounce.mp3',
 'alentours_pronounce.mp3',
 'craintive_pronounce.mp3',
 'meute_pronounce.mp3',
 'bouquetin_pronounce.mp3',
 'rusé_pronounce.mp3',
 'ailes_pronounce.mp3']

In [56]:
# create package
# Wrap the deck in a package and attach the image/audio file names collected in `media`.
my_package = genanki.Package(my_deck)
my_package.media_files = media

In [57]:
# save data
# Write the package to 'output.apkg' in the current working directory.
my_package.write_to_file('output.apkg')