French Genanki Script
====
This Python notebook automates looking up French words and creating an Anki deck of their definitions.
It does the following:
1. Reads csv file of french words to look up
1. Searches the Collins dictionary for the definition, IPA and audio
1. Gets a language specific image from Bing
1. Resizes the image
1. Builds and exports an anki deck

How to use:
---
1. set up your `model_id` and `deck_id`
1. get an API key for bing image search
1. edit bing_settings.yaml
1. place search terms in anki_search.csv
1. run all cells
1. Import your deck
1. Check that everything is okay in Anki, then move the notes/cards to your preferred deck
1. Delete all the media files when you're done.

Troubleshooting
---
* If the CSV file can't be read, check its encoding (`csv_file_encoding` in the main settings cell)

In [162]:
from bs4 import BeautifulSoup as bs
import requests as req
import re, csv, yaml, genanki, random
from pathlib import Path
from PIL import Image
from resizeimage import resizeimage
from textwrap import dedent

In [163]:
# Print two random candidates that can be copied into the deck and model ID settings.
id_lower, id_upper = 1 << 30, 1 << 31
first_candidate = random.randrange(id_lower, id_upper)
second_candidate = random.randrange(id_lower, id_upper)
print(first_candidate, second_candidate)

1824137637 1354543401


In [164]:
### Main settings here ###

# CSV file listing the words to look up, and the encoding used to read it
search_csv_filename = 'anki_search.csv'
csv_file_encoding = "mac_roman"

# YAML file holding the Bing image search settings
bing_settings_yaml_filename = "bing_settings.yaml"

# IDs passed to the genanki model and deck; both start unset and must be filled in
model_id, deck_id = None, None

In [165]:
# Stop early, with an actionable message, when the IDs were left unset.
# Explicit raises are used instead of bare asserts so the check cannot be
# stripped by `python -O` and the failure explains what to do.
if model_id is None:
    raise ValueError("model_id is not set: put an integer ID in the main settings cell")
if deck_id is None:
    raise ValueError("deck_id is not set: put an integer ID in the main settings cell")

In [166]:
# Collins French-English entry URL; "{}" is replaced by the search word.
# The page is the source for the definition, IPA and audio.
collins = "https://www.collinsdictionary.com/dictionary/french-english/{}"

# Index, within the Bing image results, of the image to use
image_idx = 0

In [167]:
# set up BING
# Load the BING image search API key and query settings from the YAML file.
# NOTE: yaml.safe_load is used instead of yaml.load: it only builds plain
# Python types (so the settings file cannot construct arbitrary objects) and
# it does not need the explicit Loader argument that yaml.load requires in
# current PyYAML releases.
with open(bing_settings_yaml_filename, 'r') as stream:
    try:
        # An empty file parses to None; fall back to an empty dict so the
        # lookups below fail with a KeyError naming the missing setting.
        settings = yaml.safe_load(stream) or {}
    except yaml.YAMLError as exc:
        # Nothing below can work without the settings: report and stop here
        # rather than failing later with an unrelated KeyError.
        print(exc)
        raise
print('Loaded BING API settings')

# Request header carrying the subscription key
headers = {"Ocp-Apim-Subscription-Key" : settings['subscription_key']}

# Query parameters sent with every image search; the commented-out entries
# are optional filters that are currently disabled.
params  = {
#           "license":   settings["license"],
            'setLang':   settings['setLang'],
            'mkt':       settings['mkt'],
#           'freshness': settings['freshness'],
            'imageType': settings['imageType'],
            'count':     settings['count']
            }

Loaded BING API settings


In [136]:
# load words from csv
# Only the first column of each row is used. Blank rows (and rows whose first
# cell is empty) are skipped so a stray empty line does not raise IndexError,
# and surrounding whitespace is stripped because the word is later used to
# build URLs and file names.
# newline='' is what the csv module documentation asks for when opening files.
with open(search_csv_filename, encoding=csv_file_encoding, newline='') as csv_file:
    search_list = [
        row[0].strip()
        for row in csv.reader(csv_file)
        if row and row[0].strip()
    ]

search_list

['ourson',
 'tanière',
 'habile',
 'troupeau',
 'alentours',
 'craintive',
 'meute',
 'bouquetin',
 'rusé',
 'ailes']

In [137]:
def download_image(url, filename, timeout=30):
    """Download ``url`` and save the response body to ``filename``.

    The request is made before the output file is opened, so a failed
    download never leaves an empty file behind (which a later
    "does the file exist?" check would mistake for a finished download).

    Args:
        url: address of the image to fetch.
        filename: path the downloaded bytes are written to.
        timeout: seconds to wait for the server before giving up, so an
            unresponsive host cannot hang the notebook forever.

    Raises:
        requests.exceptions.RequestException: when the request fails, times
            out, or the server answers with an error status.
    """
    print('Downloading: '+ filename)
    img_data = req.get(url, timeout=timeout)
    img_data.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(img_data.content)

In [149]:
# get data for all search terms
# For every word this:
#   1. scrapes the Collins page for definition, part of speech, IPA and audio URL,
#   2. downloads the pronunciation mp3 (unless it is already on disk),
#   3. asks Bing for an image, downloads it and writes a 400x300 resized copy.
# Each word produces one dict appended to `results`. When the image cannot be
# downloaded, 'image_file' and 'image_file_resized' are both set to None.

# The pattern is the same for every word, so compile it once outside the loop
regex = re.compile(r"(masculine|feminine)\s(noun)")

results = []
for search in search_list:
    print('Processing: '+ search)
    result = {}
    result['word'] = search

    ###
    # process collins search
    collins_query = collins.format(search)
    result['collins_query'] = collins_query
    collinsReq  = req.get(collins_query)
    collinsData = collinsReq.text
    collinsSoup = bs(collinsData, "html.parser")

    # get definition
    result['definition'] = collinsSoup.find('span', attrs={'class': 'cit type-translation'}).text.strip()

    # get part of speech
    result['pos'] = collinsSoup.find('span', attrs={'class': 'pos'}).text.strip()

    # create article for nouns ('article' is left unset for anything else)
    match = regex.match(result['pos'])
    if match is not None:
        if match.group(1) == "feminine":
            result['article'] = 'une'
        else:
            result['article'] = 'un'

    # get IPA
    result['ipa'] = collinsSoup.find('span', attrs={'class': 'pron type-'}).text.strip()

    # get audio file url
    soundElement = collinsSoup.find('a', attrs={'class': 'hwd_sound sound audio_play_button icon-volume-up ptr'})
    result['sound_url'] = soundElement.get('data-src-mp3')

    # download files
    result['audio_file'] = search + '_pronounce.mp3'

    # if file doesn't exist locally then save it
    if not Path(result['audio_file']).is_file():
        print("Downolading: " + result['audio_file'] )
        mp3_data = req.get(result['sound_url'])
        mp3_data.raise_for_status()
        # Open the file only after a successful download: opening it first
        # would leave an empty mp3 behind on failure, and the next run would
        # then skip the download because the file "exists".
        with open(result['audio_file'], 'wb') as f:
            f.write(mp3_data.content)
    else:
        print("File exists, skipping: " + result['audio_file'])

    #process bing image
    params['q'] = search + " language:{} loc:{}".format(settings['language'], settings['loc'])
    response = req.get(settings['image_api_url'], headers=headers, params=params)
    response.raise_for_status()
    search_results = response.json()
    result['bing_results_json'] = search_results
    result['image_url'] = search_results["value"][image_idx]["contentUrl"]
    result['image_page_url'] = search_results["value"][image_idx]["hostPageUrl"]

    original_image_filename = result['image_url'].rsplit('/', 1)[-1] # get end of url after last /
    original_image_ext = original_image_filename.rsplit('.',1)[-1]

    result['image_file_original'] = original_image_filename
    image_filename = search + "." + original_image_ext
    result['image_file'] = image_filename

    if not Path(image_filename).is_file():
        try:
            download_image(result['image_url'], image_filename)
        except req.exceptions.RequestException:
            # RequestException is the base class of requests' HTTPError, so
            # bad status codes as well as connection problems end up here.
            print("Couldn't download image, skipping")
            result['image_file'] = None
            # Remove any partial file the failed download left behind so it
            # is not mistaken for a real image on the next run.
            if Path(image_filename).is_file():
                Path(image_filename).unlink()
    else:
        print("File exists, skipping: " + result['image_file'])

    resized_filename = search + "_resized." + original_image_ext

    if result['image_file'] is None:
        # No source image: there is nothing to resize
        result['image_file_resized'] = None
    else:
        if Path(result['image_file']).is_file() and not Path(resized_filename).is_file():
            with open(result['image_file'], 'rb') as f:
                with Image.open(f) as image:
                    print("Resizing image: " + result['image_file'])
                    cover = resizeimage.resize_cover(image, [400, 300])
                    cover.save(resized_filename, image.format)
        result['image_file_resized'] = resized_filename

    results.append(result)
print("Completed Queries!")

Processing: ourson
File exists, skipping: ourson_pronounce.mp3
Downloading: ourson.jpg
Resizing image: ourson.jpg
Processing: tanière
File exists, skipping: tanière_pronounce.mp3
File exists, skipping: tanière.jpg
Processing: habile
File exists, skipping: habile_pronounce.mp3
File exists, skipping: habile.jpg
Processing: troupeau
File exists, skipping: troupeau_pronounce.mp3
File exists, skipping: troupeau.jpg
Processing: alentours
File exists, skipping: alentours_pronounce.mp3
File exists, skipping: alentours.jpg
Processing: craintive
File exists, skipping: craintive_pronounce.mp3
File exists, skipping: craintive.jpg
Processing: meute
File exists, skipping: meute_pronounce.mp3
File exists, skipping: meute.jpg
Processing: bouquetin
File exists, skipping: bouquetin_pronounce.mp3
File exists, skipping: bouquetin.jpg
Processing: rusé
File exists, skipping: rusé_pronounce.mp3
File exists, skipping: rusé.jpg
Processing: ailes
File exists, skipping: ailes_pronounce.mp3
File exists, skipping:

In [99]:
# Display the first entry of `results` to inspect what was collected
results[0]

{'article': 'un',
 'audio_file': 'ourson_pronounce.mp3',
 'bing_results_json': {'_type': 'Images',
  'instrumentation': {'_type': 'ResponseInstrumentation'},
  'nextOffset': 3,
  'readLink': 'images/search?q=ourson language:fr loc:fr',
  'totalEstimatedMatches': 100,
  'value': [{'accentColor': '348097',
    'contentSize': '2110799 B',
    'contentUrl': 'https://media.koreus.com/201609/ourson-dos-maman.jpg',
    'datePublished': '2018-07-17T20:51:00.0000000Z',
    'encodingFormat': 'jpeg',
    'height': 1365,
    'hostPageDisplayUrl': 'www.koreus.com/image/ourson-dos-maman.html',
    'hostPageUrl': 'http://www.koreus.com/image/ourson-dos-maman.html',
    'imageId': '5C64087B550B330F7B1A7DD8D3B929F04E085D9A',
    'imageInsightsToken': 'ccid_aCA5MrnH*mid_5C64087B550B330F7B1A7DD8D3B929F04E085D9A*simid_608007126454174341*thid_OIP.aCA5MrnHn9BJxiVmLzMdgwHaE7',
    'insightsMetadata': {'availableSizesCount': 61,
     'pagesIncludingCount': 128},
    'name': 'Un ourson sur le dos de sa maman',

In [140]:
# Note type used for every generated note: nine fields and five card templates.
# The order of `fields` is the order in which values must be supplied when a
# note is created from this model.
my_model = genanki.Model(
  model_id,
  'French - 5k (Genaki)',
  fields=[
    {'name': 'Word or Phrase'},
    {'name': 'Article'},
    {'name': 'Part of Speech'},
    {'name': 'Definition'},
    {'name': 'Picture'},
    {'name': 'Audio'},
    {'name': 'IPA'},
    {'name': 'Mnemonic'},
    {'name': 'Source'}
  ],
  # Each template has a front ('qfmt') and a back ('afmt'); {{Field}} inserts a
  # field and {{#Field}}...{{/Field}} wraps a section that depends on that field.
  templates=[
    # Front: the picture. Back: audio, article + word, part of speech, mnemonic, source.
    {
      'name': 'Picture2Word',
      'qfmt': "<div style='font-family: Arial; font-size: 20px;'>{{Picture}}</div>",
      'afmt': dedent("""\
                <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{#Article}}{{Article}}{{/Article}}&nbsp;{{Word or Phrase}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Part of Speech}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Mnemonic}}</div>
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}
                """),
    },
    # Front: audio, article + word, part of speech. Back: picture, definition, source.
    {
      'name': 'Word2Picture',
      'qfmt': dedent("""\
              <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>
              <div style='font-family: Arial; font-size: 20px;'>{{#Article}}{{Article}}{{/Article}}&nbsp;{{Word or Phrase}}</div>
              <div style='font-family: Arial; font-size: 20px;'>{{Part of Speech}}</div>
                  """),
      'afmt': dedent("""\
                {{#Picture}}<div style='font-family: Arial; font-size: 20px;'>{{Picture}}</div>{{/Picture}}
                {{#Definition}}<div style='font-family: Arial; font-size: 20px;'>{{Definition}}</div>{{/Definition}}
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}
                """)
    },
    # Front: a spelling prompt with audio and picture. Back: article + word, source.
    {
      'name': 'Spelling',
      'qfmt': dedent("""\
                Peut tu l'épeler?
                <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Picture}}</div>
                """),
      'afmt': dedent("""\
                <div style='font-family: Arial; font-size: 20px;'>{{#Article}}{{Article}}{{/Article}}&nbsp;{{Word or Phrase}}</div>
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}
                """)
    },
    # Front: "[...] word" with the article blanked out. Back: article + word, mnemonic, source.
    # The whole front sits inside the {{#Article}} section, presumably so the
    # card only exists for words that have an article - confirm in Anki.
    {
      'name': 'Article',
      'qfmt': dedent("""\
                {{#Article}}
                <div style='font-family: Arial; font-size: 20px;'>[...] {{Word or Phrase}}</div>
                {{/Article}}
                """),
      'afmt': dedent("""\
                <div style='font-family: Arial; font-size: 20px;'>{{Article}}&nbsp;{{Word or Phrase}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Mnemonic}}</div>
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}
                """)
    },
    # Front: a pronunciation prompt showing the word, nested inside both the
    # {{#Audio}} and {{#IPA}} sections. Back: IPA, audio, source.
    {
      'name': 'Pronuncation',
      'qfmt': dedent("""\
                {{#Audio}}{{#IPA}}<div style='font-family: Arial; font-size: 20px;'>
                Comment prononcez-vous ce mot?
                <BR>
                <BR>
                {{Word or Phrase}}
                </div>{{/IPA}}{{/Audio}}
                """),
      'afmt': dedent("""\
                <div style='font-family: Arial; font-size: 20px;'>{{IPA}}</div>
                <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>
                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>Sources:&nbsp;{{Source}}</div>{{/Source}}
                """)
    }
  ],
# Stylesheet passed to the model alongside the templates
css=""".card {
 font-family: arial;
 font-size: 20px;
 text-align: center;
 color: black;
 background-color: white;
}"""
)

In [141]:
# recall model defn - values must be supplied in this order:
#     {'name': 'Word or Phrase'},
#     {'name': 'Article'},
#     {'name': 'Part of Speech'},
#     {'name': 'Definition'},
#     {'name': 'Picture'},
#     {'name': 'Audio'},
#     {'name': 'IPA'},
#     {'name': 'Mnemonic'}
#     {'name': 'Source'}

# create notes for each result
my_notes = []
for item in results:
    # A missing image or audio file yields an empty field rather than a tag
    # pointing at "None" or at nothing.
    image_file = item.get('image_file_resized')
    audio_file = item.get('audio_file')
    my_note = genanki.Note(
      model=my_model,
      fields=[
          item['word'],
          item.get('article', ''),
          item.get('pos', ''), # part of speech
          item.get('definition', ''),
          '<img src="{}">'.format(image_file) if image_file else '',
          # [sound:sound.mp3] format for anki decks. The file name is inserted
          # as-is: running str.format on the name itself would break on any
          # name containing '{' or '}'.
          "[sound:{}]".format(audio_file) if audio_file else '',
          item.get('ipa', ''),
          "", # no predefined mnemonic ;)
          '<a href="{}">collins</a><br><a href="{}">image-page</a>'.format(item['collins_query'], item['image_page_url'])
      ])
    my_notes.append(my_note)


In [142]:
# Build the deck, then put every generated note into it
my_deck = genanki.Deck(deck_id=deck_id, name='French-genanki')

for generated_note in my_notes:
    my_deck.add_note(generated_note)

In [143]:
# create media database
# List every media file the package should bundle: resized images first, then
# audio. Each list is filtered on the same value it collects, so a result
# without an image (or audio file) can never contribute None to the list.
media = [x['image_file_resized']
             for x in results
             if x.get('image_file_resized')]
media.extend(x['audio_file']
                 for x in results
                 if x.get('audio_file'))
media

['ourson_resized.jpg',
 'tanière_resized.jpg',
 'habile_resized.jpg',
 'troupeau_resized.jpg',
 'alentours_resized.jpg',
 'craintive_resized.jpg',
 'meute_resized.jpg',
 'bouquetin_resized.jpg',
 'rusé_resized.jpg',
 'ailes_resized.jpg',
 'ourson_pronounce.mp3',
 'tanière_pronounce.mp3',
 'habile_pronounce.mp3',
 'troupeau_pronounce.mp3',
 'alentours_pronounce.mp3',
 'craintive_pronounce.mp3',
 'meute_pronounce.mp3',
 'bouquetin_pronounce.mp3',
 'rusé_pronounce.mp3',
 'ailes_pronounce.mp3']

In [144]:
# Wrap the deck in a package together with the media files it references
my_package = genanki.Package(my_deck, media_files=media)

In [145]:
# Write the finished package to disk
output_filename = 'output.apkg'
my_package.write_to_file(output_filename)