FrenchGenanki.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "French Genanki Script\n",
    "====\n",
    "This Python notebook automates looking up French words and creating an Anki deck of their definitions.\n",
    "It does the following:\n",
    "1. Reads csv file of french words to look up\n",
    "1. Searches the Collins dictionary for the definition, IPA and audio\n",
    "1. Gets a language specific image from Bing\n",
    "1. Resizes the image\n",
    "1. Builds and exports an anki deck\n",
    "\n",
    "How to use:\n",
    "---\n",
    "1. get an API key for bing image search\n",
    "1. edit bing_settings.yaml\n",
    "1. place search terms in anki_search.csv\n",
    "1. run all cells\n",
    "1. Import your deck\n",
    "1. Check that everything is okay in Anki, then move the notes/cards to your preferred deck\n",
    "1. Delete all the media files when you're done.\n",
    "\n",
    "Troubleshooting\n",
    "---\n",
    "* If the csv file can't be read, check its encoding (`csv_file_encoding` in the settings cell)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup as bs\n",
    "import requests as req\n",
    "import re, csv, yaml, genanki, random\n",
    "from pathlib import Path\n",
    "from PIL import Image\n",
    "from resizeimage import resizeimage\n",
    "from textwrap import dedent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Main settings here ###\n",
    "\n",
    "# input: csv file holding the search terms, and the encoding it was saved in\n",
    "search_csv_filename = 'anki_search.csv'\n",
    "csv_file_encoding = 'mac_roman'\n",
    "\n",
    "# yaml files: stored deck/model IDs and the bing API settings\n",
    "genanki_id_yaml = 'genanki_ids.yaml'\n",
    "bing_settings_yaml_filename = 'bing_settings.yaml'\n",
    "# note to self: don't upload your api keys to github!\n",
    "#bing_settings_yaml_filename = 'bing_settings_personal.yaml'\n",
    "\n",
    "# size passed to the image resize step\n",
    "resize_image_x = 400\n",
    "resize_image_y = 300"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create two random numbers for deck and model IDs\n",
    "# if no file exists, create a genanki_ids.yaml file\n",
    "\n",
    "def create_id():\n",
    "    \"\"\"Return a random integer ID drawn from [2**30, 2**31).\"\"\"\n",
    "    lowest, limit = 2 ** 30, 2 ** 31\n",
    "    return random.randrange(lowest, limit)\n",
    "\n",
    "if not Path(genanki_id_yaml).is_file():\n",
    "    # first run: write two fresh IDs so that later runs reuse the same ones\n",
    "    with open(genanki_id_yaml, 'wt') as f:\n",
    "        f.write('deck_id: ' + str(create_id()) + '\\n')\n",
    "        f.write('model_id: ' + str(create_id()) + '\\n')\n",
    "# load IDs\n",
    "ids = {}\n",
    "with open(genanki_id_yaml, 'r') as stream:\n",
    "    try:\n",
    "        # safe_load only builds plain Python types; yaml.load(stream) without a\n",
    "        # Loader is deprecated since PyYAML 5.1 and a TypeError in PyYAML 6\n",
    "        ids = yaml.safe_load(stream)\n",
    "    except yaml.YAMLError as exc:\n",
    "        print(exc)\n",
    "        # the deck and model cells index ids['deck_id'] / ids['model_id'], so\n",
    "        # stop here rather than failing later with a KeyError on an empty dict\n",
    "        raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collins french-english entry url; {} is replaced by the search term\n",
    "collins = \"https://www.collinsdictionary.com/dictionary/french-english/{}\"\n",
    "\n",
    "# which bing result should we take?\n",
    "image_idx = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded BING API settings\n"
     ]
    }
   ],
   "source": [
    "# set up BING\n",
    "# load BING image search API key\n",
    "settings = {}\n",
    "with open(bing_settings_yaml_filename, 'r') as stream:\n",
    "    try:\n",
    "        # safe_load only builds plain Python types; yaml.load(stream) without a\n",
    "        # Loader is deprecated since PyYAML 5.1 and a TypeError in PyYAML 6\n",
    "        settings = yaml.safe_load(stream)\n",
    "        print('Loaded BING API settings')\n",
    "    except yaml.YAMLError as exc:\n",
    "        print(exc)\n",
    "        # the lines below index into settings, so do not carry on without it\n",
    "        raise\n",
    "\n",
    "# request header and query parameters used for the image search requests\n",
    "headers = {\"Ocp-Apim-Subscription-Key\" : settings['subscription_key']}\n",
    "params  = {\n",
    "            'setLang':   settings['setLang'],\n",
    "            'mkt':       settings['mkt'],\n",
    "            'imageType': settings['imageType'],\n",
    "            'count':     settings['count']\n",
    "# these other options seems to hinder the results, so I disabled them\n",
    "#           'freshness': settings['freshness'],\n",
    "#           'license':   settings['license'],\n",
    "            }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ourson',\n",
       " 'tanière',\n",
       " 'habile',\n",
       " 'troupeau',\n",
       " 'alentours',\n",
       " 'craintive',\n",
       " 'meute',\n",
       " 'bouquetin',\n",
       " 'rusé',\n",
       " 'ailes']"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load words to look up from csv file\n",
    "# assumes no header row and only one column with search terms\n",
    "search_list = []\n",
    "# newline='' is how the csv module expects its input files to be opened\n",
    "with open(search_csv_filename, encoding=csv_file_encoding, newline='') as f:\n",
    "    for row in csv.reader(f):\n",
    "        # blank lines are read as empty rows; row[0] would raise IndexError\n",
    "        term = row[0].strip() if row else ''\n",
    "        if term:\n",
    "            search_list.append(term)\n",
    "\n",
    "search_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download(url, filename, timeout=30):\n",
    "    \"\"\"Fetch `url` and write the response body to `filename`.\n",
    "\n",
    "    The request is made and checked before the file is opened, so a failed\n",
    "    download does not leave an empty file behind that a later is_file()\n",
    "    check would mistake for a finished download.\n",
    "\n",
    "    url      -- address to fetch\n",
    "    filename -- local path the bytes are written to\n",
    "    timeout  -- seconds to wait for the server before giving up\n",
    "\n",
    "    Raises requests.exceptions.HTTPError for a 4xx/5xx response; other\n",
    "    requests exceptions (connection errors, timeouts) propagate as well.\n",
    "    \"\"\"\n",
    "    print('Downloading: '+ filename)\n",
    "    response = req.get(url, timeout=timeout)\n",
    "    response.raise_for_status()\n",
    "    with open(filename, 'wb') as f:\n",
    "        f.write(response.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing: ourson\n",
      "Downloading: ourson_pronounce.mp3\n",
      "Downloading: ourson.jpg\n",
      "Resizing image: ourson.jpg\n",
      "Processing: tanière\n",
      "Downloading: tanière_pronounce.mp3\n",
      "Downloading: tanière.jpg\n",
      "Resizing image: tanière.jpg\n",
      "Processing: habile\n",
      "Downloading: habile_pronounce.mp3\n",
      "Downloading: habile.jpg\n",
      "Resizing image: habile.jpg\n",
      "Processing: troupeau\n",
      "Downloading: troupeau_pronounce.mp3\n",
      "Downloading: troupeau.jpg\n",
      "Resizing image: troupeau.jpg\n",
      "Processing: alentours\n",
      "Downloading: alentours_pronounce.mp3\n",
      "Downloading: alentours.jpg\n",
      "Resizing image: alentours.jpg\n",
      "Processing: craintive\n",
      "Downloading: craintive_pronounce.mp3\n",
      "Downloading: craintive.jpg\n",
      "Resizing image: craintive.jpg\n",
      "Processing: meute\n",
      "Downloading: meute_pronounce.mp3\n",
      "Downloading: meute.jpg\n",
      "Resizing image: meute.jpg\n",
      "Processing: bouquetin\n",
      "Downloading: bouquetin_pronounce.mp3\n",
      "Downloading: bouquetin.jpg\n",
      "Resizing image: bouquetin.jpg\n",
      "Processing: rusé\n",
      "Downloading: rusé_pronounce.mp3\n",
      "Downloading: rusé.jpg\n",
      "Resizing image: rusé.jpg\n",
      "Processing: ailes\n",
      "Downloading: ailes_pronounce.mp3\n",
      "Downloading: ailes.jpg\n",
      "Resizing image: ailes.jpg\n",
      "Completed Queries!\n"
     ]
    }
   ],
   "source": [
    "# get data for all search terms\n",
    "#\n",
    "# For each word: scrape the collins entry (definition, part of speech, IPA,\n",
    "# audio url), download the audio, ask bing for an image, download and resize\n",
    "# it, then append one dict per word to `results`.\n",
    "# Files that already exist in the working directory are not fetched again.\n",
    "# Words whose collins entry or audio cannot be fetched are skipped; a word\n",
    "# without a usable image is kept, with its image file keys set to None.\n",
    "\n",
    "# loop-invariant values, built once\n",
    "article_regex = re.compile(r\"(masculine|feminine)\\s(noun)\")\n",
    "sound_attrs = {'class': \" \".join(['hwd_sound',\n",
    "                                  'sound',\n",
    "                                  'audio_play_button',\n",
    "                                  'icon-volume-up',\n",
    "                                  'ptr'])}\n",
    "bing_advanced_query = \" language:{} loc:{}\" \\\n",
    "    .format(settings['language'], settings['loc'])\n",
    "\n",
    "results = []\n",
    "for search in search_list:\n",
    "    print('Processing: '+ search)\n",
    "    result = {}\n",
    "    result['word'] = search\n",
    "\n",
    "    ###\n",
    "    # process collins search\n",
    "    collins_query = collins.format(search)\n",
    "    result['collins_query'] = collins_query\n",
    "    collinsReq  = req.get(collins_query)\n",
    "    collinsSoup = bs(collinsReq.text, \"html.parser\")\n",
    "\n",
    "    definitionElement = collinsSoup.find('span',\n",
    "                                         attrs={'class': 'cit type-translation'})\n",
    "    posElement = collinsSoup.find('span', attrs={'class': 'pos'})\n",
    "    ipaElement = collinsSoup.find('span', attrs={'class': 'pron type-'})\n",
    "    soundElement = collinsSoup.find('a', attrs=sound_attrs)\n",
    "\n",
    "    # find() returns None for a missing element (e.g. the word has no entry);\n",
    "    # skip the word instead of failing on None.text\n",
    "    collinsElements = (definitionElement, posElement, ipaElement, soundElement)\n",
    "    if any(element is None for element in collinsElements):\n",
    "        print(\"Collins entry incomplete, skipping: \" + search)\n",
    "        continue\n",
    "\n",
    "    result['definition'] = definitionElement.text.strip()\n",
    "    result['pos'] = posElement.text.strip()   # part of speech\n",
    "    result['ipa'] = ipaElement.text.strip()\n",
    "\n",
    "    # create article for nouns\n",
    "    match = article_regex.match(result['pos'])\n",
    "    if match is not None:\n",
    "        if match.group(1) == \"feminine\":\n",
    "            result['article'] = 'une'\n",
    "        else:\n",
    "            result['article'] = 'un'\n",
    "\n",
    "    # get audio file url\n",
    "    result['sound_url'] = soundElement.get('data-src-mp3')\n",
    "    if not result['sound_url']:\n",
    "        print(\"No audio url found, skipping: \" + search)\n",
    "        continue\n",
    "\n",
    "    # download audio, unless the file already exists locally\n",
    "    result['audio_file'] = search + '_pronounce.mp3'\n",
    "    if not Path(result['audio_file']).is_file():\n",
    "        try:\n",
    "            download(result['sound_url'], result['audio_file'])\n",
    "        except req.exceptions.RequestException:\n",
    "            print(\"Couldn't download audio, skipping: \" + search)\n",
    "            # remove a partial file so the next run retries the download\n",
    "            if Path(result['audio_file']).is_file():\n",
    "                Path(result['audio_file']).unlink()\n",
    "            continue\n",
    "    else:\n",
    "        print(\"File exists, skipping: \" + result['audio_file'])\n",
    "\n",
    "    ###\n",
    "    # process bing image\n",
    "    # work on a copy so the shared params dict is not changed by the loop\n",
    "    query_params = dict(params, q=search + bing_advanced_query)\n",
    "    response = req.get(settings['image_api_url'], headers=headers, params=query_params)\n",
    "    response.raise_for_status()\n",
    "    search_results = response.json()\n",
    "    result['bing_results_json'] = search_results\n",
    "\n",
    "    # defaults describe a word without an image; image_page_url stays a\n",
    "    # string because it is formatted into the Source field later on\n",
    "    result['image_url'] = None\n",
    "    result['image_page_url'] = ''\n",
    "    result['image_file_original'] = None\n",
    "    result['image_file'] = None\n",
    "    result['image_file_resized'] = None\n",
    "\n",
    "    image_hits = search_results.get(\"value\", [])\n",
    "    if len(image_hits) > image_idx:\n",
    "        result['image_url'] = image_hits[image_idx][\"contentUrl\"]\n",
    "        result['image_page_url'] = image_hits[image_idx][\"hostPageUrl\"]\n",
    "\n",
    "        # get end of url after last /\n",
    "        original_image_filename = result['image_url'].rsplit('/', 1)[-1]\n",
    "        # drop any ?query or #fragment before reading the extension;\n",
    "        # fall back to .jpg when the url carries no extension at all\n",
    "        bare_image_filename = original_image_filename.split('?', 1)[0].split('#', 1)[0]\n",
    "        original_image_ext = Path(bare_image_filename).suffix or '.jpg'\n",
    "\n",
    "        result['image_file_original'] = original_image_filename\n",
    "        result['image_file'] = search + original_image_ext\n",
    "        resized_filename = search + \"_resized\" + original_image_ext\n",
    "\n",
    "        # download image file\n",
    "        if not Path(result['image_file']).is_file():\n",
    "            try:\n",
    "                download(result['image_url'], result['image_file'])\n",
    "            except req.exceptions.RequestException:\n",
    "                # RequestException also covers the HTTPError raised by\n",
    "                # raise_for_status() inside download()\n",
    "                print(\"Couldn't download image, skipping\")\n",
    "                # remove a partial file so the next run retries the download\n",
    "                if Path(result['image_file']).is_file():\n",
    "                    Path(result['image_file']).unlink()\n",
    "                result['image_file'] = None\n",
    "        else:\n",
    "            print(\"File exists, skipping: \" + result['image_file'])\n",
    "\n",
    "        ###\n",
    "        # resize image file (only when there is an image to resize)\n",
    "        if result['image_file'] is not None:\n",
    "            if not Path(resized_filename).is_file():\n",
    "                with open(result['image_file'], 'r+b') as f:\n",
    "                    with Image.open(f) as image:\n",
    "                        print(\"Resizing image: \" + result['image_file'])\n",
    "                        cover = resizeimage.resize_cover(image, [resize_image_x, resize_image_y])\n",
    "                        cover.save(resized_filename, image.format)\n",
    "            result['image_file_resized'] = resized_filename\n",
    "    else:\n",
    "        print(\"No image found for: \" + search)\n",
    "\n",
    "    results.append(result)\n",
    "print(\"Completed Queries!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create Anki model\n",
    "# One genanki.Model with nine fields and five card templates\n",
    "# (Picture2Word, Word2Picture, Spelling, Article, Pronuncation).\n",
    "# The model ID comes from the `ids` mapping loaded from the yaml file.\n",
    "# NOTE(review): the order of `fields` presumably has to match the order of\n",
    "# the values passed to genanki.Note - confirm before reordering.\n",
    "my_model = genanki.Model(\n",
    "  int(ids['model_id']),\n",
    "  'French - 5k (Genaki)',\n",
    "  fields=[\n",
    "    {'name': 'Word or Phrase'},\n",
    "    {'name': 'Article'},\n",
    "    {'name': 'Part of Speech'},\n",
    "    {'name': 'Definition'},\n",
    "    {'name': 'Picture'},\n",
    "    {'name': 'Audio'},\n",
    "    {'name': 'IPA'},\n",
    "    {'name': 'Mnemonic'},\n",
    "    {'name': 'Source'}\n",
    "  ],\n",
    "  templates=[\n",
    "    # Picture2Word: picture on the front; audio, article + word, part of\n",
    "    # speech, mnemonic and (if set) source on the back\n",
    "    {\n",
    "      'name': 'Picture2Word',\n",
    "      'qfmt': \"<div style='font-family: Arial; font-size: 20px;'>{{Picture}}</div>\",\n",
    "      'afmt': dedent(\"\"\"\\\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{#Article}}{{Article}}{{/Article}}&nbsp;{{Word or Phrase}}</div>\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{Part of Speech}}</div>\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{Mnemonic}}</div>\n",
    "                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}\n",
    "                \"\"\"),\n",
    "    },\n",
    "    # Word2Picture: audio, article + word and part of speech on the front;\n",
    "    # picture, definition and source on the back, each inside its own\n",
    "    # {{#Field}}...{{/Field}} section\n",
    "    {\n",
    "      'name': 'Word2Picture',\n",
    "      'qfmt': dedent(\"\"\"\\\n",
    "              <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>\n",
    "              <div style='font-family: Arial; font-size: 20px;'>{{#Article}}{{Article}}{{/Article}}&nbsp;{{Word or Phrase}}</div>\n",
    "              <div style='font-family: Arial; font-size: 20px;'>{{Part of Speech}}</div>\n",
    "                  \"\"\"),\n",
    "      'afmt': dedent(\"\"\"\\\n",
    "                {{#Picture}}<div style='font-family: Arial; font-size: 20px;'>{{Picture}}</div>{{/Picture}}\n",
    "                {{#Definition}}<div style='font-family: Arial; font-size: 20px;'>{{Definition}}</div>{{/Definition}}\n",
    "                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}\n",
    "                \"\"\")\n",
    "    },\n",
    "    # Spelling: prompt plus audio and picture on the front; article + word\n",
    "    # and source on the back\n",
    "    {\n",
    "      'name': 'Spelling',\n",
    "      'qfmt': dedent(\"\"\"\\\n",
    "                Peut tu l'épeler?\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{Picture}}</div>\n",
    "                \"\"\"),\n",
    "      'afmt': dedent(\"\"\"\\\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{#Article}}{{Article}}{{/Article}}&nbsp;{{Word or Phrase}}</div>\n",
    "                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}\n",
    "                \"\"\")\n",
    "    },\n",
    "    # Article: the whole front sits inside an {{#Article}} section and shows\n",
    "    # the word with a [...] placeholder; article + word, mnemonic and\n",
    "    # source on the back\n",
    "    {\n",
    "      'name': 'Article',\n",
    "      'qfmt': dedent(\"\"\"\\\n",
    "                {{#Article}}\n",
    "                <div style='font-family: Arial; font-size: 20px;'>[...] {{Word or Phrase}}</div>\n",
    "                {{/Article}}\n",
    "                \"\"\"),\n",
    "      'afmt': dedent(\"\"\"\\\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{Article}}&nbsp;{{Word or Phrase}}</div>\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{Mnemonic}}</div>\n",
    "                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>{{Source}}</div>{{/Source}}\n",
    "                \"\"\")\n",
    "    },\n",
    "    # Pronuncation: the front sits inside nested {{#Audio}}{{#IPA}} sections\n",
    "    # and shows the word; IPA, audio and source on the back\n",
    "    {\n",
    "      'name': 'Pronuncation',\n",
    "      'qfmt': dedent(\"\"\"\\\n",
    "                {{#Audio}}{{#IPA}}<div style='font-family: Arial; font-size: 20px;'>\n",
    "                Comment prononcez-vous ce mot?\n",
    "                <BR>\n",
    "                <BR>\n",
    "                {{Word or Phrase}}\n",
    "                </div>{{/IPA}}{{/Audio}}\n",
    "                \"\"\"),\n",
    "      'afmt': dedent(\"\"\"\\\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{IPA}}</div>\n",
    "                <div style='font-family: Arial; font-size: 20px;'>{{Audio}}</div>\n",
    "                {{#Source}}<div style='font-family: Arial; font-size: 20px;'>Sources:&nbsp;{{Source}}</div>{{/Source}}\n",
    "                \"\"\")\n",
    "    }\n",
    "  ],\n",
    "# stylesheet for the .card class\n",
    "css=\"\"\".card {\n",
    " font-family: arial;\n",
    " font-size: 20px;\n",
    " text-align: center;\n",
    " color: black;\n",
    " background-color: white;\n",
    "}\"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# recall model defn (values below are listed in this field order)\n",
    "#     {'name': 'Word or Phrase'},\n",
    "#     {'name': 'Article'},\n",
    "#     {'name': 'Part of Speech'},\n",
    "#     {'name': 'Definition'},\n",
    "#     {'name': 'Picture'},\n",
    "#     {'name': 'Audio'},\n",
    "#     {'name': 'IPA'},\n",
    "#     {'name': 'Mnemonic'}\n",
    "#     {'name': 'Source'}\n",
    "\n",
    "# create notes for each result\n",
    "my_notes = []\n",
    "for item in results:\n",
    "    # leave Picture / Audio empty when there is no file, so the field does not\n",
    "    # end up holding a dangling reference such as <img src=\"None\">\n",
    "    image_name = item.get('image_file_resized')\n",
    "    audio_name = item.get('audio_file')\n",
    "    my_note = genanki.Note(\n",
    "      model=my_model,\n",
    "      fields=[\n",
    "          item['word'],\n",
    "          item.get('article', ''),\n",
    "          item.get('pos', ''), # part of speech\n",
    "          item.get('definition', ''),\n",
    "          '<img src=\"{}\">'.format(image_name) if image_name else '',\n",
    "          # the file name is inserted as-is; running str.format on the name\n",
    "          # itself would fail for names that contain braces\n",
    "          \"[sound:{}]\".format(audio_name) if audio_name else '', #[sound:sound.mp3] format for anki decks\n",
    "          item.get('ipa', ''),\n",
    "          \"\", # no predefined mnemonic ;)\n",
    "          '<a href=\"{}\">collins</a><br><a href=\"{}\">image-page</a>'.format(item['collins_query'], item['image_page_url'])\n",
    "      ])\n",
    "    my_notes.append(my_note)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create deck and add notes\n",
    "# the deck ID is the one stored in the yaml ID file\n",
    "my_deck = genanki.Deck(int(ids['deck_id']), 'French-genanki')\n",
    "\n",
    "for note in my_notes:\n",
    "    my_deck.add_note(note)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ourson_resized.jpg',\n",
       " 'tanière_resized.jpg',\n",
       " 'habile_resized.jpg',\n",
       " 'troupeau_resized.jpg',\n",
       " 'alentours_resized.jpg',\n",
       " 'craintive_resized.jpg',\n",
       " 'meute_resized.jpg',\n",
       " 'bouquetin_resized.jpg',\n",
       " 'rusé_resized.jpg',\n",
       " 'ailes_resized.jpg',\n",
       " 'ourson_pronounce.mp3',\n",
       " 'tanière_pronounce.mp3',\n",
       " 'habile_pronounce.mp3',\n",
       " 'troupeau_pronounce.mp3',\n",
       " 'alentours_pronounce.mp3',\n",
       " 'craintive_pronounce.mp3',\n",
       " 'meute_pronounce.mp3',\n",
       " 'bouquetin_pronounce.mp3',\n",
       " 'rusé_pronounce.mp3',\n",
       " 'ailes_pronounce.mp3']"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create media database\n",
    "# resized images first (entries without an image file are left out),\n",
    "# followed by the audio files\n",
    "media = (\n",
    "    [entry.get('image_file_resized') for entry in results if entry['image_file'] is not None]\n",
    "    + [entry['audio_file'] for entry in results if entry['audio_file'] is not None]\n",
    ")\n",
    "media"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create package\n",
    "# wrap the deck in a genanki package and attach the `media` file list to it\n",
    "my_package = genanki.Package(my_deck)\n",
    "my_package.media_files = media"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save data\n",
    "# writes the package to output.apkg; the path is relative, so the file\n",
    "# lands in the current working directory\n",
    "my_package.write_to_file('output.apkg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}