Add audio

Sourced from http://mtc.ntnu.edu.tw/chinese-resource.htm.
jiru · Mar 16, 2023 · 9c0bb02 · 9c0bb02
1 parent d863492
commit 9c0bb02
Show file tree

Hide file tree

Showing 7 changed files with 292 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -8,13 +8,14 @@ This is an Anki deck to review vocabulary from the textbook 當代中文課程 a
 ## Features
 
 * Contains all the vocabulary from book 1 to book 6 lesson 1
+* Every card has audio [as pronounced by the authors of the book](http://mtc.ntnu.edu.tw/chinese-resource.htm)
 * Proper Taiwanese-style character rendering (using html lang tags)
 * Colored Hanzi according to tones (generated at run time, so it works for any new card too)
   * You can change the colors by editing the card CSS and replacing the [color codes](https://en.wikipedia.org/wiki/Web_colors)
 * Includes part of speech (noun, verb, measure word…)
 * Two types of cards:
-  * hanzi+english → pinyin
-  * pinyin+english → hanzi
+  * hanzi+english → pinyin+audio
+  * pinyin+english+audio → hanzi
 * People and place names tagged
 * Lesson number included to easily search cards from a specific lesson or book:
   * Search for `B1L5-I` to get the dialog from lesson 5, book 1
@@ -27,9 +28,14 @@ This is an Anki deck to review vocabulary from the textbook 當代中文課程 a
 
 First install [genanki](https://github.com/kerrickstaley/genanki), and then run:
 ```
+./build.sh output.apkg
+```
+It will download the audio, process it and generate the deck into a file `output.apkg`.
+
+If you don't want the audio, just run:
+```
 ./generate_deck.py output.apkg
 ```
-It will generate the deck into a file `output.apkg`.
 
 ## How to use
 

diff --git a/build.sh b/build.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -e
+
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 <output.apkg>"
+  exit 1
+fi
+
+./download_audio.sh
+
+./extract_audio.sh 1 '當代中文課程 第一冊 課本/'
+./extract_audio.sh 2 '當代中文課程 第二冊 課本/'
+./extract_audio.sh 3 '當代中文課程 第三冊 課本/'
+./extract_audio.sh 4 '當代中文課程 第四冊 課本/'
+./extract_audio.sh 5 '當代中文課程 第五冊 課本/'
+./extract_audio.sh 6 '當代中文課程 第六冊 課本/'
+
+./generate_deck.py \
+  --audio-1='當代中文課程 第一冊 課本/' \
+  --audio-2='當代中文課程 第二冊 課本/' \
+  --audio-3='當代中文課程 第三冊 課本/' \
+  --audio-4='當代中文課程 第四冊 課本/' \
+  --audio-5='當代中文課程 第五冊 課本/' \
+  --audio-6='當代中文課程 第六冊 課本/' \
+  "$1"
diff --git a/download_audio.sh b/download_audio.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+set -e
+
+archives=(
+  http://mtc.ntnu.edu.tw/upload_files/book/A%20Course%20in%20Contemporary%20Chinese%201%20-%20Textbook%20Audio%20Files.rar
+  http://mtc.ntnu.edu.tw/upload_files/book/A%20Course%20in%20Contemporary%20Chinese%202%20-%20Textbook%20Audio%20Files.rar
+  http://mtc.ntnu.edu.tw/upload_files/book/A%20Course%20in%20Contemporary%20Chinese%203%20-%20Textbook%20Audio%20Files.rar
+  http://mtc.ntnu.edu.tw/upload_files/book/A%20Course%20in%20Contemporary%20Chinese%204%20-%20Textbook%20Audio%20Files.rar
+  http://mtc.ntnu.edu.tw/upload_files/book/A%20Course%20in%20Contemporary%20Chinese%205%20-%20Textbook%20Audio%20Files.rar
+  http://mtc.ntnu.edu.tw/upload_files/book/A%20Course%20in%20Contemporary%20Chinese%206%20-%20Textbook%20Audio%20Files.rar
+)
+
+sha1sums=(
+  a4bed58618f0425eafd5ab031909e031e7a6bbf5
+  dae3a8c8c3fb7e78f1ede3a671940237da8f5a9a
+  9423150fba32c1a3ae5d65c83d6e1ce953ac616f
+  04c97d1693696e5df41fdcd275df96fae4cc7e18
+  29254c66e122c538a8f494e99409d7265abe041b
+  0e4a4607ac8d59c0e4ee3578f148ca225cf22327
+)
+
+for book in $(seq 1 "${#archives[@]}"); do
+  let i=book-1 || true
+  url="${archives[$i]}"
+  filename="A Course in Contemporary Chinese $book - Textbook Audio Files.rar"
+  wget --continue "$url" -O "$filename"
+  sha1sum="${sha1sums[$i]}"
+  sha1sum -c <(echo "$sha1sum  $filename")
+done
+
+for book in $(seq 1 "${#archives[@]}"); do
+  filename="A Course in Contemporary Chinese $book - Textbook Audio Files.rar"
+  unrar x -y "$filename"
+done
diff --git a/extract_audio.sh b/extract_audio.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+print_silences() {
+  ffmpeg -nostdin -i "$1" -filter_complex "[0:a]silencedetect=n=-70dB:d=1.3[outa]" -map [outa] -f s16le -y /dev/null 2>&1
+}
+
+split_by_silence() {
+  local ext output_file book_number="$1" input_file="$2" output_dir="$3"
+
+  case "$book_number" in
+    5) split_start=2 ;;
+    *) split_start=1 ;;
+  esac
+
+  ext=${input_file/*\./}
+  output_file=$output_dir/B$book_number.${input_file/*\//}
+  output_file=${output_file%.*}
+  print_silences "$input_file" \
+    | perl -ne '
+      INIT { printf "set -e\n"; $ss=0; $se=0; $margin=0.25; }
+      if (/silence_start: (\S+)/) {
+        $ss=$1;
+        if ($ctr >= '"$split_start"') {
+          printf "ffmpeg -nostdin -i \"'"$input_file"'\" -ss %f -t %f -c copy -vn -sn -dn -y \"'"$output_file"'.%03d.'$ext'\"\n", $se-$margin, ($ss-$se)+$margin*2, $ctr;
+        }
+        $ctr+=1;
+      }
+      if (/silence_end: (\S+)/) {
+        $se=$1;
+      }' \
+    | bash -x
+    if [ $? -ne 0 ] || ! find "$output_dir" -type f | grep -q .; then
+      echo "Unable to split file '$input_file' by silence"
+      exit 1
+    fi
+}
+
+get_spectrum_entropy() {
+  ffmpeg -nostdin -loglevel quiet "$@" -af aspectralstats,ametadata=print:file=- -f null - 2>&1 | grep -o 'entropy=.*' | cut -d= -f2
+}
+
+is_bell_sound() {
+  local numbers sum n avg1000
+
+  numbers=$(get_spectrum_entropy "$@")
+  sum=$(bc <<<"`echo $numbers | sed 's,\S\+,(\0),g;s,e-,*10^-,g' | tr ' ' +`")
+  n=$(echo $numbers | tr ' ' '\n' | wc -l)
+  avg1000=$(bc <<<"$sum*1000/$n")
+  [ "$avg1000" -le 28 ] && return 0 || return 1
+}
+
+get_vocab_start_pos() {
+  local margin="0.25" input_file="$1" skip="$2"
+
+  start=$(ffmpeg -nostdin -i "$input_file" -af silencedetect=d=0.15:noise=-35dB -f null - 2>&1 \
+    | grep -om $skip 'silence_end: [^ ]\+' \
+    | tail -n 1 \
+    | grep -o '[0-9.]\+'
+  )
+  start=$(awk "BEGIN {print $start-$margin}")
+  echo "$start"
+}
+
+remove_prefix() {
+  local book_number="$1" input_file="$2"
+  local skip=2 duration start ext output_file
+
+  start=$(get_vocab_start_pos "$input_file" "$skip")
+  if is_bell_sound -to "$start" -i "$input_file"; then
+    if [ "$book_number" -eq 5 ]; then
+      # skip the whole file as it just contains "課文一" phrase
+      rm "$input_file"
+      return
+    fi
+    let skip++ # skip bell sound
+    start=$(get_vocab_start_pos "$input_file" "$skip")
+  fi
+  ext=${input_file/*\./}
+  output_file="$input_file".tmp.$ext
+  ffmpeg -nostdin -ss "$start" -i "$input_file" -c copy -y "$output_file"
+
+  # Check if result looks okay
+  duration=$(ffprobe -i "$output_file" -show_entries format=duration -v quiet -of csv="p=0")
+  if awk "BEGIN { if ($duration < 0.5) { print \"fail\" }}" | grep -q fail; then
+    echo "Failed to remove prefix from '$input_file': truncated file way too short"
+    exit 1
+  else
+    mv "$output_file" "$input_file"
+  fi
+}
+
+count_silences() {
+  print_silences "$1" | grep -c silence_duration:
+}
+
+find_vocab_chaps() {
+  local book_number="$1" output_basedir="$2"
+  local vocab_files chapfile chapfiles lesson lessons
+
+  case "$book_number" in
+    5|6) vocab_files=2367 ;;
+    *)   vocab_files=24 ;;
+  esac
+
+  lessons=$(ls -1 "$output_basedir"/*-0[$vocab_files].??? | sed 's,.*/,,;s/-.*//' | uniq)
+  for lesson in $lessons; do
+    chapfiles=("$output_basedir"/$lesson-0[$vocab_files].???)
+    if [ "${#chapfiles[@]}" -gt 2 ]; then
+      for chapfile in "${chapfiles[@]}"; do
+        echo $(count_silences "$chapfile") $chapfile
+      done \
+        | sort -n \
+        | tail -n 2 \
+        | cut -d' ' -f 2-
+    else
+      for chapfile in "${chapfiles[@]}"; do
+        echo "$chapfile"
+      done
+    fi
+  done
+}
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <book-number> <chapter-audio-directory>"
+  echo "Example: $0 1 '當代中文課程 第一冊 課本/'"
+  exit 1
+fi
+
+book_number="$1"
+output_basedir="$2"
+while [ "${output_basedir}" != "${output_basedir%/}" ]; do
+  output_basedir=${output_basedir%/};
+done
+
+while read -r chapter; do
+  output_dir=${chapter/.*\//}.split
+  rm -rf "$output_dir"
+  mkdir -p "$output_dir"
+  split_by_silence "$book_number" "$chapter" "$output_dir"
+
+  for file in "$output_dir"/*; do
+    remove_prefix "$book_number" "$file"
+  done
+done < <(find_vocab_chaps "$book_number" "$output_basedir")
diff --git a/generate_deck.py b/generate_deck.py
@@ -3,6 +3,9 @@
 import genanki
 import csv
 from sys import argv
+from optparse import OptionParser
+from glob import glob
+from os.path import basename
 
 # Pseudorandom IDs generated using
 # import random; random.randrange(1 << 30, 1 << 31)
@@ -21,18 +24,61 @@ def read_file(file):
   with open(file, 'r') as f:
     return f.read()
 
-def add_notes(deck, model, tsv_file):
+def sorted_glob(pattern):
+  result = glob(pattern)
+  result.sort()
+  return result
+
+def add_notes(deck, model, tsv_file, audios_all_books):
+  audios_by_lesson = {}
+  book = 1
+  for audios_one_book in audios_all_books:
+    if audios_one_book:
+      dir_number = 1
+      lesson_dirs = sorted_glob(audios_one_book + "/*/")
+      for lesson_dir in lesson_dirs:
+        lesson = int(dir_number/2+0.5)
+        sublesson = "I"*((dir_number-1)%2+1)
+        fq_lesson = f"B{book}L{lesson:02}-{sublesson}"
+        audios_by_lesson[fq_lesson] = sorted_glob(lesson_dir + "/*")
+        dir_number = dir_number + 1
+    book = book + 1
+
+  added_audios = []
   with open(tsv_file) as ccc:
     deckreader = csv.reader(ccc, delimiter='\t')
     for row in deckreader:
-      note = CCCNote(
-        model=model,
-        fields=row[0:5],
-        tags=row[5].split(' ')
-      )
+      try:
+        fields = row[0:5] + ['']
+        lesson = row[4]
+        tags = row[5].split(' ')
+      except IndexError:
+        line = deckreader.line_num
+        print(f"Error parsing '{tsv_file}' at line {line}")
+        exit(1)
+
+      try:
+        audio = audios_by_lesson[lesson].pop(0)
+        added_audios.append(audio)
+        audio = basename(audio)
+        fields[5] = f"[sound:{audio}]"
+      except IndexError:
+        print(f"Warning: missing audio for lesson {lesson}, vocab {row[0:3]}")
+      except KeyError:
+        pass
+
+      note = CCCNote(model=model, fields=fields, tags=tags)
       deck.add_note(note)
 
-def gen_model():
+  for lesson in audios_by_lesson:
+    if (len(audios_by_lesson[lesson]) > 0):
+      print(f"Warning: unassigned audios for lesson {lesson}:")
+      for audio in audios_by_lesson[lesson]:
+        print("  " + audio)
+
+  return added_audios
+
+def gen_model(audios_all_books):
   script = '\n<script>\n' \
         + read_file('colorize_hanzi.js') \
         + '\n</script>'
@@ -45,6 +91,7 @@ def gen_model():
       {"font": "Arial", "name": "English"},
       {"font": "Arial", "name": "Part of speech"},
       {"font": "Arial", "name": "Lesson"},
+      {"font": "Arial", "name": "Audio"},
     ],
     templates=[
       {
@@ -61,18 +108,28 @@ def gen_model():
     css=read_file('tmpl.css'),
   )
 
-def compile_deck(output_file):
+def compile_deck(output_file, audios_all_books):
   my_deck = genanki.Deck(deck_id, '當代中文課程')
-  my_model = gen_model()
-  add_notes(my_deck, my_model, 'ccc.tsv')
-  genanki.Package(my_deck).write_to_file(output_file)
+  my_model = gen_model(audios_all_books)
+  added_audios = add_notes(my_deck, my_model, 'ccc.tsv', audios_all_books)
+
+  my_package = genanki.Package(my_deck)
+  my_package.media_files = added_audios
+  my_package.write_to_file(output_file)
 
 def run():
-  try:
-    output_file = argv[1]
-  except IndexError:
-    print(f"Usage: {argv[0]} <output_deck.apkg>")
-  else:
-    compile_deck(output_file)
-
-run()
+  total_books = 6
+  parser = OptionParser(usage="Usage: %prog [options] <output_deck.apkg>")
+  for i in range(1, total_books+1):
+    parser.add_option(f"", f"--audio-{i}", dest=f"audio{i}", metavar="DIR",
+                      help=f"get audio files of book {i} from subdirectories of DIR")
+  (options, args) = parser.parse_args()
+
+  if len(args) != 1:
+    parser.error("incorrect number of arguments")
+
+  audios = [getattr(options, f"audio{i}") for i in range(1, total_books+1)]
+  compile_deck(args[0], audios)
+
+if __name__ == "__main__":
+  run()
diff --git a/tmpl.cmn.qfmt.html b/tmpl.cmn.qfmt.html
@@ -3,5 +3,6 @@
 {{/Part of speech}}
 
 <span lang="en" id="ddzw-pinyin">{{Pinyin}}</span>
+{{Audio}}
 <br><br>
 {{English}}
diff --git a/tmpl.eng.afmt.html b/tmpl.eng.afmt.html
@@ -2,5 +2,5 @@
 
 <hr id=answer>
 
-<span lang="en"><span id="ddzw-pinyin">{{Pinyin}}</span><br><br>{{English}}</span><br><br>
+<span lang="en"><span id="ddzw-pinyin">{{Pinyin}}</span>{{Audio}}<br><br>{{English}}</span><br><br>
 <span lang="en"><small>{{Lesson}}</small></span>