Add support for Hanping exports

The program can now be used to build a deck from a word list exported from the Hanping Android app, in addition to using the HSK list (which is still the default behavior).
kerrickstaley · Jun 22, 2015 · 09c283b · 09c283b
1 parent 24fe608
commit 09c283b
Show file tree

Hide file tree

Showing 7 changed files with 304 additions and 20 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,6 +6,7 @@ authors = ["Kerrick Staley <kerrick@kerrickstaley.com>"]
 
 [dependencies]
 
+getopts = "0.2.11"
 time = "0.1.24"
 regex = "0.1.38"
 regex_macros = "0.1.20"

diff --git a/make_apkg_hanping.sh b/make_apkg_hanping.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+# Builds deck.apkg from a Hanping word list export.
+# Stop at the first command that fails.
+set -e
+# Run the deck builder with the Hanping word list and the extra dictionary entries.
+cargo run --release -- --hanping_words=hanping_words.txt --extra_entries=hanping_extra_entries.txt
+# Write an empty JSON object as the package's media file.
+echo '{}' > /tmp/media
+cd /tmp
+# Bundle the collection and the media file into the package.
+# NOTE(review): assumes the program above wrote collection.anki2 into /tmp -- confirm.
+zip deck.apkg collection.anki2 media
diff --git a/src/cedict.rs b/src/cedict.rs
@@ -1,3 +1,4 @@
+use std::ascii::AsciiExt;
 use std::collections::HashMap;
 
 #[derive(Clone)]
@@ -18,9 +19,10 @@ pub struct Classifier<'a> {
 }
 
 pub struct DictSearchParams<'a> {
-  simp: Option<&'a str>,
-  trad: Option<&'a str>,
-  pinyin: Option<&'a str>,
+  pub simp: Option<&'a str>,
+  pub trad: Option<&'a str>,
+  // the case of .pinyin is ignored when searching
+  pub pinyin: Option<&'a str>,
 }
 
 fn parse_entry<'a>(entry_str: &'a str) -> Option<Entry<'a>> {
@@ -39,7 +41,7 @@ fn parse_entry<'a>(entry_str: &'a str) -> Option<Entry<'a>> {
           match clfr_re.captures(clfr_str) {
             Some(cap) => {
               clfrs.push(
-                  Classifier{
+                  Classifier {
                       trad: cap.at(1).unwrap_or(""),
                       simp: cap.at(2).unwrap_or(cap.at(1).unwrap_or("")),
                       pinyin: cap.at(3).unwrap_or(""),
@@ -107,7 +109,7 @@ fn entry_matches(entry: &Entry, params: &DictSearchParams) -> bool {
   }
   match params.pinyin {
     Some(pinyin) => {
-      if entry.pinyin != pinyin {
+      if entry.pinyin.to_ascii_lowercase() != pinyin.to_ascii_lowercase() {
         return false;
       }
     },
@@ -146,17 +148,23 @@ impl<'a> Dict<'a> {
     }
     rv.trad_idx = build_index(&rv.entries, |ent| ent.trad);
     rv.simp_idx = build_index(&rv.entries, |ent| ent.simp);
-    rv.pinyin_idx = build_index(&rv.entries, |ent| ent.pinyin);
+    rv.pinyin_idx = build_index(&rv.entries, |ent| &ent.pinyin.to_ascii_lowercase());
     rv
   }
 
   pub fn search(&self, params: DictSearchParams) -> Vec<Entry<'a>> {
+    // TODO: figure out if this can be expressed more succinctly
+    let pinyin_lower_string = match params.pinyin {
+      Some(p) => p.to_ascii_lowercase(),
+      None => "".to_string(),
+    };
+    let pinyin = params.pinyin.map(|_| &pinyin_lower_string as &str);
     // TODO: this is hella messy, tixif!
     let candidate_idxs = match params.trad.and_then(|x| self.trad_idx.get(x)) {
       Some(c) => c,
       None => match params.simp.and_then(|x| self.simp_idx.get(x)) {
         Some(c) => c,
-        None => match params.pinyin.and_then(|x| self.pinyin_idx.get(x)) {
+        None => match pinyin.and_then(|x| self.pinyin_idx.get(x)) {
           Some(c) => c,
           // either there were no candidates (HashMap lookup returned None) or the caller didn't
           // fill out any of params's fields

diff --git a/src/hanping.rs b/src/hanping.rs
@@ -0,0 +1,168 @@
+use cedict;
+use chinese_note;
+use std;
+
+/// Builds one `ChineseNote` for each word of a Hanping word-list export.
+///
+/// `wordlist` is the text of the export, one word per line. `extra_entries` is handed to
+/// `cedict::Dict::new_with_extra_entries` when the dictionary is constructed. Every line is
+/// parsed with `parse_line` and looked up by traditional form, simplified form and pinyin.
+/// A warning is printed whenever the lookup does not return exactly one entry; words with no
+/// entry at all produce no note. Blank lines (for example the one left by a trailing newline)
+/// are ignored.
+///
+/// # Panics
+///
+/// Panics if a non-blank line cannot be parsed by `parse_line`.
+pub fn get_chinese_notes<'a>(wordlist: &'a str, extra_entries: &'a str)
+    -> Vec<chinese_note::ChineseNote<'a>> {
+  let dict = cedict::Dict::new_with_extra_entries(extra_entries);
+  let mut rv = Vec::new();
+  // lines() does not yield a final empty element for a newline-terminated file and also strips
+  // the "\r" of Windows line endings, unlike split("\n")
+  for line in wordlist.lines() {
+    if line.trim().is_empty() {
+      continue;
+    }
+    let pl = match parse_line(line) {
+      Ok(pl) => pl,
+      Err(s) => {
+        panic!("{}", s);
+      }
+    };
+    let entries = dict.search(
+        cedict::DictSearchParams{
+            trad: Some(&pl.trad), simp: Some(&pl.simp), pinyin: Some(&pl.pinyin)});
+    if entries.len() != 1 {
+      println!("Warning: number of entries for {:?} was {}; not exactly 1.", pl, entries.len());
+    }
+    // Taking the last entry causes it to prefer entries with lowercase pinyin, e.g.
+    //   乾 干 [gan1] /dry/clean/in vain/dried food/foster/adoptive/to ignore/
+    // will be preferred over
+    //   乾 干 [Gan1] /surname Gan/
+    if let Some(entry) = entries.last() {
+      rv.push(chinese_note::ChineseNote{ce: entry.clone(), tags: Vec::new()});
+    }
+  }
+  rv
+}
+
+/// The fields extracted from one line of a Hanping word-list export.
+pub struct ParsedLine {
+  /// The word written in traditional characters.
+  pub trad: String,
+  /// The word written in simplified characters.
+  pub simp: String,
+  /// The pronunciation, as produced by `pinyin_to_ascii`.
+  pub pinyin: String,
+}
+
+// Debug is written by hand: every field is printed verbatim between double quotes.
+impl std::fmt::Debug for ParsedLine {
+  fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+    let ParsedLine { ref trad, ref simp, ref pinyin } = *self;
+    formatter.write_fmt(format_args!(
+        "ParsedLine {{ trad: \"{}\", simp: \"{}\", pinyin: \"{}\" }}",
+        trad, simp, pinyin))
+  }
+}
+
+// TODO: can we make this only pub for testing?
+/// Parses one line of a Hanping word-list export.
+///
+/// A line holds the word in traditional characters, optionally followed by the simplified form
+/// in square brackets, then the tone-marked pinyin and the definition, separated by spaces. In
+/// the bracketed form a `-` means "same character as in the traditional form". The pinyin is
+/// taken to be one space-separated syllable per character of the word and is converted with
+/// `pinyin_to_ascii`.
+///
+/// Returns `Err` with a message containing the offending line when the line does not have this
+/// shape.
+pub fn parse_line(line: &str) -> Result<ParsedLine, String> {
+  let line_re = regex!(r"^(.+?)(?: \[(.+?)\])? +(.+)$");
+  let caps = match line_re.captures(line) {
+    Some(caps) => caps,
+    None => { return Err("Could not parse Hanping word list line: ".to_string() + line) },
+  };
+  let trad = caps.at(1).unwrap().to_string();
+  let simp: String = match caps.at(2) {
+    // fill every '-' placeholder with the traditional character at the same position
+    Some(simp_dash) =>
+        simp_dash.chars().zip(trad.chars()).map(
+            |(s, t)| if s != '-' { s } else { t }).collect(),
+    None => trad.clone(),
+  };
+  let rest = caps.at(3).unwrap();
+  // The pinyin ends at the N-th space of `rest`, N being the number of characters in the word
+  // (or at the end of `rest` when it has fewer spaces). char_indices() only yields offsets that
+  // lie on character boundaries, so the slice below is always valid and needs no unsafe code.
+  let syllables = trad.chars().count();
+  let pinyin_end = rest.char_indices()
+      .filter(|&(_, ch)| ch == ' ')
+      .nth(syllables.saturating_sub(1))
+      .map(|(idx, _)| idx)
+      .unwrap_or(rest.len());
+  let pinyin = pinyin_to_ascii(&rest[..pinyin_end]);
+  Ok(ParsedLine { trad: trad, simp: simp, pinyin: pinyin })
+}
+
+// TODO: can we make this only pub for testing?
+/// Converts tone-marked pinyin into ASCII pinyin with tone numbers.
+///
+/// Syllables are separated by single spaces. In each syllable the first tone-marked vowel is
+/// replaced by its plain form and the tone number (1-4) is appended to the syllable; a syllable
+/// without a tone mark gets the neutral tone 5. `ü` is always written `u:` (and `Ü` as `U:`),
+/// with or without a tone mark. For example, `"hē diǎn lǜ chá ba"` becomes
+/// `"he1 dian3 lu:4 cha2 ba5"`.
+///
+/// Empty input yields an empty string, and repeated or trailing spaces are copied through
+/// without attaching a tone number to them.
+pub fn pinyin_to_ascii(pinyin: &str) -> String {
+  // For each vowel: its forms carrying tones 1-4, and the text that replaces any of them.
+  const TONE_MARKS: [([char; 4], &'static str); 12] = [
+      (['ā', 'á', 'ǎ', 'à'], "a"),
+      (['ē', 'é', 'ě', 'è'], "e"),
+      (['ī', 'í', 'ǐ', 'ì'], "i"),
+      (['ō', 'ó', 'ǒ', 'ò'], "o"),
+      (['ū', 'ú', 'ǔ', 'ù'], "u"),
+      (['ǖ', 'ǘ', 'ǚ', 'ǜ'], "u:"),
+      (['Ā', 'Á', 'Ǎ', 'À'], "A"),
+      (['Ē', 'É', 'Ě', 'È'], "E"),
+      (['Ī', 'Í', 'Ǐ', 'Ì'], "I"),
+      (['Ō', 'Ó', 'Ǒ', 'Ò'], "O"),
+      (['Ū', 'Ú', 'Ǔ', 'Ù'], "U"),
+      (['Ǖ', 'Ǘ', 'Ǚ', 'Ǜ'], "U:"),
+  ];
+  let mut rv = String::with_capacity(pinyin.len() + 4);
+  // tone of the syllable being read; stays 5 (neutral) until a tone mark is seen
+  let mut tone = 5;
+  // whether at least one character of the current syllable has been read
+  let mut in_syllable = false;
+
+  for ch in pinyin.chars() {
+    if ch == ' ' {
+      // a space ends the current syllable, if there is one
+      if in_syllable {
+        rv.push_str(&tone.to_string());
+        tone = 5;
+        in_syllable = false;
+      }
+      rv.push(ch);
+      continue;
+    }
+    in_syllable = true;
+    // an unmarked u-umlaut carries no tone but is still spelled with a colon
+    match ch {
+      'ü' => { rv.push_str("u:"); continue; }
+      'Ü' => { rv.push_str("U:"); continue; }
+      _ => {}
+    }
+    // only the first tone mark of a syllable is interpreted
+    if tone == 5 {
+      let hit = TONE_MARKS.iter()
+          .filter_map(|row| row.0.iter().position(|&m| m == ch).map(|col| (col + 1, row.1)))
+          .next();
+      if let Some((marked_tone, plain)) = hit {
+        tone = marked_tone;
+        rv.push_str(plain);
+        continue;
+      }
+    }
+    rv.push(ch);
+  }
+  // the last syllable is not followed by a space, so close it here
+  if in_syllable {
+    rv.push_str(&tone.to_string());
+  }
+  rv
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  /// Parses `line` and compares every field of the result with the expected value.
+  fn check_parse(line: &str, trad: &str, simp: &str, pinyin: &str) {
+    let parsed = parse_line(line).unwrap();
+    assert_eq!(parsed.trad, trad);
+    assert_eq!(parsed.simp, simp);
+    assert_eq!(parsed.pinyin, pinyin);
+  }
+
+  #[test]
+  fn jilupian_line_parses_correctly() {
+    check_parse(
+        "紀錄片 [纪录-]     jì lù piàn       newsreel • documentary (film or TV program) • CL: 部 (bù)",
+        "紀錄片", "纪录片", "ji4 lu4 pian4");
+  }
+
+  #[test]
+  fn ouer_line_parses_correctly() {
+    // the tone mark is not on the last vowel of the first syllable
+    check_parse(
+        "偶爾 [-尔]        ǒu ěr            occasionally • once in a while • sometimes",
+        "偶爾", "偶尔", "ou3 er3");
+  }
+
+  #[test]
+  fn cu_line_parses_correctly() {
+    // traditional and simplified forms coincide, and the fields are separated by single spaces
+    check_parse(
+        "粗 cū coarse • rough • thick (for cylindrical objects) • unfinished • vulgar • rude • crude",
+        "粗", "粗", "cu1");
+  }
+
+  #[test]
+  fn test_pinyin_to_ascii() {
+    let converted = pinyin_to_ascii("hē diǎn lǜ chá ba");
+    assert_eq!(converted, "he1 dian3 lu:4 cha2 ba5");
+  }
+}
diff --git a/src/hsk.rs b/src/hsk.rs
@@ -11,8 +11,6 @@ use cedict;
 use std::collections::HashMap;
 use yaml::constructor::*;
 
-pub const ID_STR: &'static str = "kerrick hsk";
-
 #[derive(Clone)]
 struct HskWord {
   simp: String,
@@ -205,10 +203,7 @@ pub fn get_chinese_notes() -> Vec<chinese_note::ChineseNote<'static>> {
       println!("{} not in dict", word.simp);
       continue;
     }
-    let mut ce = best_entry(&word, &dict, &preferred).clone();
-    if ce.simp == ce.trad {
-      ce.trad = "";
-    }
+    let ce = best_entry(&word, &dict, &preferred).clone();
     rv.push(chinese_note::ChineseNote{ce: ce,
                                       tags: vec!(format!("HSK_Level_{}", word.level))});
   }