<a href="https://colab.research.google.com/github/kojiishi/contextual-spacing/blob/master/contextual_spacing_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preparations

In [57]:
!python -V
import numpy as np
import pandas as pd
import re
import unicodedata

# Start from a fresh clone so that contextual-spacing/features.txt (read
# further down) is present and up to date.
!rm -rf contextual-spacing
!git clone https://github.com/kojiishi/contextual-spacing.git

Python 3.6.7
Cloning into 'contextual-spacing'...
remote: Enumerating objects: 23, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 23 (delta 7), reused 7 (delta 3), pack-reused 0[K
Unpacking objects: 100% (23/23), done.


### Unicode

In [58]:
def u_hex(value):
  """Format an integer as upper-case hex, zero-padded to at least 4 digits.

  Values wider than four hex digits (for example supplementary-plane code
  points such as 0x1F600) are returned in full instead of being cut down to
  their last four digits.
  """
  return format(value, '04X')

def u_name_or_null(c):
  """Return the Unicode name of character `c`, or None if it has no name."""
  # With a default supplied, unicodedata.name() returns it rather than
  # raising ValueError for unnamed characters.
  return unicodedata.name(c, None)

def u_enc(c, encoding):
  """Return the hex code of `c` in `encoding`, or '' when there is none.

  The 'ignore' error handler drops characters the codec cannot encode, so
  they produce no bytes and give the empty string. An encoded value of zero
  is reported as '' as well.
  """
  encoded = c.encode(encoding, 'ignore')
  # Read the bytes as a single big-endian integer; no bytes at all gives 0.
  value = int.from_bytes(encoded, 'big')
  if not value:
    return ''
  return u_hex(value)
  
def get_unicode_list(range):
  """Build one row of character properties per code point in `range`.

  Each row holds, in order: the hex code point, the character, its Unicode
  name (or None), its general category, its East Asian Width, and its hex
  code in cp932, cp936, cp949 and cp950 ('' where the codec has none).

  Note: the parameter name shadows the built-in `range`; it is kept as is so
  that callers passing it by keyword keep working.
  """
  legacy_codecs = ('cp932', 'cp936', 'cp949', 'cp950')
  # Convert every code point up front, before any row is assembled.
  chars = [chr(code_point) for code_point in range]
  rows = []
  for char in chars:
    row = [
      u_hex(ord(char)),
      char,
      u_name_or_null(char),
      unicodedata.category(char),
      unicodedata.east_asian_width(char),
    ]
    row.extend(u_enc(char, codec) for codec in legacy_codecs)
    rows.append(row)
  return rows

def get_unicode_df(range):
  """Return the character-property table as a DataFrame indexed by code point.

  `range` is iterated once to build the rows and then reused as the index,
  so it has to be re-iterable (a `range` object or a list, not a generator).
  """
  column_names = ['hex', 'char', 'name', 'gc', 'eaw',
                  'cp932', 'cp936', 'cp949', 'cp950']
  rows = get_unicode_list(range)
  # The index is deliberately left unnamed (no `index.name = 'code'`).
  return pd.DataFrame(rows, index=range, columns=column_names)

# Code points U+0020 through U+1FFFE (the stop value of range() is exclusive).
r = range(0x20, 0x1FFFF)
# Property table with one row per code point in `r`, indexed by code point.
u = get_unicode_df(r)
u.head(11)

Unnamed: 0,hex,char,name,gc,eaw,cp932,cp936,cp949,cp950
32,0020,,SPACE,Zs,Na,0020,0020,0020,0020
33,0021,!,EXCLAMATION MARK,Po,Na,0021,0021,0021,0021
34,0022,"""",QUOTATION MARK,Po,Na,0022,0022,0022,0022
35,0023,#,NUMBER SIGN,Po,Na,0023,0023,0023,0023
36,0024,$,DOLLAR SIGN,Sc,Na,0024,0024,0024,0024
37,0025,%,PERCENT SIGN,Po,Na,0025,0025,0025,0025
38,0026,&,AMPERSAND,Po,Na,0026,0026,0026,0026
39,0027,',APOSTROPHE,Po,Na,0027,0027,0027,0027
40,0028,(,LEFT PARENTHESIS,Ps,Na,0028,0028,0028,0028
41,0029,),RIGHT PARENTHESIS,Pe,Na,0029,0029,0029,0029


### Read features.txt from the git repository, [original](http://blogs.adobe.com/CCJKType/files/2018/04/features.txt) from [Ken's CJK Type blog](https://blogs.adobe.com/CCJKType/2018/04/contextual-spacing.html)

In [59]:
class Feature:
  """A `feature <name> { ... }` block and the glyph classes declared in it.

  `classes` maps a class name (with any glyph-name suffix such as `.cn`
  appended) to the list of integer code points assigned to it.
  """

  def __init__(self, name):
    self.name = name
    self.classes = {}

  @staticmethod
  def parse_as_dict(path='contextual-spacing/features.txt'):
    """Parse a feature file into a dict of feature name -> `Feature`.

    Args:
      path: Feature file to read. Defaults to the copy inside the cloned
        repository, which is what the original hard-coded.

    Returns:
      A dict keyed by the name found on each `feature <name> {` line. Every
      later line is offered to that feature until the next such line; lines
      before the first feature are ignored.
    """
    features = {}
    feature = None
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(path) as f:
      for line in f:
        m = re.search(r'^feature (\w+) {', line)
        if m:
          name = m[1]
          feature = Feature(name)
          features[name] = feature
          continue
        if feature:
          feature.parse_line(line)
    return features

  def parse_line(self, line):
    """Record the class declared on `line` if it is a `@Name = [...];` line.

    Lines that do not match are silently skipped.
    """
    m = re.search(r'@(\w+) = \[(.+)\];', line)
    if m:
      name = m[1]
      # split() without an argument tolerates tabs and repeated spaces;
      # split(' ') would yield empty entries that fail the `uni` check.
      for code in m[2].split():
        self.add_code_to_class(name, code)

  def add_code_to_class(self, name, code):
    """Add one `uniXXXX` or `uniXXXX.suffix` glyph name to class `name`.

    A suffix is moved onto the class name: `uniFF1F.cn` in class `X` is
    stored as code point 0xFF1F under `X.cn`.

    Raises:
      AssertionError: if `code` does not start with `uni`.
      ValueError: if the part after `uni` is not hexadecimal.
    """
    assert code.startswith('uni'), 'code must starts with "uni"'
    code = code[3:]
    # If `code` has a suffix like `uniFF1F.cn`, remove it and append to `name`.
    code_and_lang = code.split('.')
    if len(code_and_lang) == 2:
      name = name + '.' + code_and_lang[1]
    # Append the code to the class, creating the class on first use.
    code = int(code_and_lang[0], 16)
    self.classes.setdefault(name, []).append(code)

  def to_series_dict(self):
    """Return `{feature name: series}`, ready for `DataFrame.assign(**...)`."""
    return {self.name: self.to_series()}

  def to_series(self):
    """Return a Series of space-separated class names indexed by code point.

    A code point that belongs to several classes gets all their names joined
    by single spaces. Returns None when the feature has no classes.
    """
    s = None
    for class_name, codes in self.classes.items():
      s1 = pd.Series(class_name, index=codes)
      if s is None:
        s = s1
      else:
        s = s.combine(s1, Feature._join_names)
    return s

  @staticmethod
  def _join_names(x, y):
    """Merge two class-name cells; a missing side yields the other side."""
    if pd.isnull(y):
      return x
    if pd.isnull(x):
      return y
    return str(x) + ' ' + str(y)

# Parse the feature file, then add one column per feature. Each Series is
# indexed by code point, as is `u`, so assign() lines the values up by index.
features = Feature.parse_as_dict()
u = u.assign(**features['cspc'].to_series_dict())
u = u.assign(**features['vcsp'].to_series_dict())
# `x == x` is False for NaN, so this keeps rows where either column is set.
u.query('cspc == cspc or vcsp == vcsp').head(10)

Unnamed: 0,hex,char,name,gc,eaw,cp932,cp936,cp949,cp950,cspc,vcsp
8216,2018,‘,LEFT SINGLE QUOTATION MARK,Pi,A,8165,A1AE,A1AE,A1A5,OpeningBracket,OpeningBracketVert.vert
8217,2019,’,RIGHT SINGLE QUOTATION MARK,Pf,A,8166,A1AF,A1AF,A1A6,ClosingBracket,ClosingBracketVert.vert
8220,201C,“,LEFT DOUBLE QUOTATION MARK,Pi,A,8167,A1B0,A1B0,A1A7,OpeningBracket,OpeningBracketVert.vert
8221,201D,”,RIGHT DOUBLE QUOTATION MARK,Pf,A,8168,A1B1,A1B1,A1A8,ClosingBracket,ClosingBracketVert.vert
12289,3001,、,IDEOGRAPHIC COMMA,Po,W,8141,A1A2,A1A2,A142,PeriodComma Centered.tw,CenteredVert.tw
12290,3002,。,IDEOGRAPHIC FULL STOP,Po,W,8142,A1A3,A1A3,A143,PeriodComma Centered.tw,CenteredVert.tw
12296,3008,〈,LEFT ANGLE BRACKET,Ps,W,8171,A1B4,A1B4,A171,OpeningBracket,
12297,3009,〉,RIGHT ANGLE BRACKET,Pe,W,8172,A1B5,A1B5,A172,ClosingBracket,
12298,300A,《,LEFT DOUBLE ANGLE BRACKET,Ps,W,8173,A1B6,A1B6,A16D,OpeningBracket,
12299,300B,》,RIGHT DOUBLE ANGLE BRACKET,Pe,W,8174,A1B7,A1B7,A16E,ClosingBracket,


### [CSS Text 4 Character classes](https://drafts.csswg.org/css-text-4/#text-pacing-classes)

In [60]:
def css4_classes(c):
  """Return the CSS Text 4 class names for code point `c`, space-separated.

  Possible names are 'open', 'close', 'middle', 'colon' and 'dot'. Returns
  None when `c` belongs to none of them.
  """
  char = chr(c)
  gc = unicodedata.category(char)
  eaw = unicodedata.east_asian_width(char)
  # Ps/Pe brackets count only inside U+3000..U+303F or when their East Asian
  # Width is F; the four curly quotes are listed explicitly.
  in_cjk_block_or_fullwidth = 0x3000 <= c <= 0x303F or eaw == 'F'
  v = []
  if (gc == 'Ps' and in_cjk_block_or_fullwidth) or c in (0x2018, 0x201C):
    v.append('open')
  if (gc == 'Pe' and in_cjk_block_or_fullwidth) or c in (0x2019, 0x201D):
    v.append('close')
  if c in (0x00B7, 0x2027, 0x30FB):
    v.append('middle')
  if c in (0xFF1A, 0xFF1B):
    v.append('colon')
  if c in (0x3001, 0x3002, 0xFF0C, 0xFF0E):
    v.append('dot')
  return ' '.join(v) if v else None

# Collect the classified code points first and build the Series in one step,
# instead of enlarging an empty Series one element at a time.
css4_by_code = {}
for c in r:
  names = css4_classes(c)
  if names is not None:
    css4_by_code[c] = names
s = pd.Series(css4_by_code, dtype=object)
u = u.assign(css4=s)
# `css4 == css4` is False for NaN, so only classified rows are shown.
u.query('css4 == css4').head(10)

Unnamed: 0,hex,char,name,gc,eaw,cp932,cp936,cp949,cp950,cspc,vcsp,css4
183,00B7,·,MIDDLE DOT,Po,A,,A1A4,A1A4,A150,,,middle
8216,2018,‘,LEFT SINGLE QUOTATION MARK,Pi,A,8165.0,A1AE,A1AE,A1A5,OpeningBracket,OpeningBracketVert.vert,open
8217,2019,’,RIGHT SINGLE QUOTATION MARK,Pf,A,8166.0,A1AF,A1AF,A1A6,ClosingBracket,ClosingBracketVert.vert,close
8220,201C,“,LEFT DOUBLE QUOTATION MARK,Pi,A,8167.0,A1B0,A1B0,A1A7,OpeningBracket,OpeningBracketVert.vert,open
8221,201D,”,RIGHT DOUBLE QUOTATION MARK,Pf,A,8168.0,A1B1,A1B1,A1A8,ClosingBracket,ClosingBracketVert.vert,close
8231,2027,‧,HYPHENATION POINT,Po,A,,,,A145,,,middle
12289,3001,、,IDEOGRAPHIC COMMA,Po,W,8141.0,A1A2,A1A2,A142,PeriodComma Centered.tw,CenteredVert.tw,dot
12290,3002,。,IDEOGRAPHIC FULL STOP,Po,W,8142.0,A1A3,A1A3,A143,PeriodComma Centered.tw,CenteredVert.tw,dot
12296,3008,〈,LEFT ANGLE BRACKET,Ps,W,8171.0,A1B4,A1B4,A171,OpeningBracket,,open
12297,3009,〉,RIGHT ANGLE BRACKET,Pe,W,8172.0,A1B5,A1B5,A172,ClosingBracket,,close


### Needs manual review

In [61]:
def compute_review(code, row):
  """Return 'OK' when the CSS4 class and both feature classes agree.

  `row` is indexed by 'css4', 'cspc' and 'vcsp'. The result is 'OK' only for
  an 'open' or 'close' CSS4 class whose horizontal and vertical feature
  classes are the matching bracket classes; everything else, including a
  missing CSS4 class, is 'Review'. `code` is accepted but not used.
  """
  css, cspc, vcsp = row['css4'], row['cspc'], row['vcsp']
  if pd.isnull(css):
    return 'Review'
  # (CSS4 class, expected cspc class, expected vcsp class)
  agreeing = (
    ('open', 'OpeningBracket', 'OpeningBracketVert.vert'),
    ('close', 'ClosingBracket', 'ClosingBracketVert.vert'),
  )
  for css_class, horizontal, vertical in agreeing:
    if css == css_class and cspc == horizontal and vcsp == vertical:
      return 'OK'
  return 'Review'
    
# Review only rows that carry at least one classification (`x == x` is False
# for NaN, so each comparison selects the rows where that column is set).
s = pd.Series(dtype=object)
for code, row in u.query('cspc == cspc or vcsp == vcsp or css4 == css4').iterrows():
  result =compute_review(code, row)
  # compute_review as defined in this cell always returns a string, so this
  # guard currently never skips a row.
  if result is not None:
    s.at[code] = result
u = u.assign(review=s)
# Show every classified row that was not marked "OK".
u.query('review != "OK" and (cspc == cspc or vcsp == vcsp or css4 == css4)')

Unnamed: 0,hex,char,name,gc,eaw,cp932,cp936,cp949,cp950,cspc,vcsp,css4,review
183,00B7,·,MIDDLE DOT,Po,A,,A1A4,A1A4,A150,,,middle,Review
8231,2027,‧,HYPHENATION POINT,Po,A,,,,A145,,,middle,Review
12289,3001,、,IDEOGRAPHIC COMMA,Po,W,8141,A1A2,A1A2,A142,PeriodComma Centered.tw,CenteredVert.tw,dot,Review
12290,3002,。,IDEOGRAPHIC FULL STOP,Po,W,8142,A1A3,A1A3,A143,PeriodComma Centered.tw,CenteredVert.tw,dot,Review
12296,3008,〈,LEFT ANGLE BRACKET,Ps,W,8171,A1B4,A1B4,A171,OpeningBracket,,open,Review
12297,3009,〉,RIGHT ANGLE BRACKET,Pe,W,8172,A1B5,A1B5,A172,ClosingBracket,,close,Review
12298,300A,《,LEFT DOUBLE ANGLE BRACKET,Ps,W,8173,A1B6,A1B6,A16D,OpeningBracket,,open,Review
12299,300B,》,RIGHT DOUBLE ANGLE BRACKET,Pe,W,8174,A1B7,A1B7,A16E,ClosingBracket,,close,Review
12300,300C,「,LEFT CORNER BRACKET,Ps,W,8175,A1B8,A1B8,A175,OpeningBracket,,open,Review
12301,300D,」,RIGHT CORNER BRACKET,Pe,W,8176,A1B9,A1B9,A176,ClosingBracket,,close,Review


### Full list of rows where either CSPC or CSS4 is set

In [62]:
# Remove the display row limit so the whole table is shown. This is a global
# pandas option; nothing in this cell restores it afterwards.
pd.options.display.max_rows = None
# Rows where any of the three classification columns is set (non-NaN).
u.query('cspc == cspc or vcsp == vcsp or css4 == css4')

Unnamed: 0,hex,char,name,gc,eaw,cp932,cp936,cp949,cp950,cspc,vcsp,css4,review
183,00B7,·,MIDDLE DOT,Po,A,,A1A4,A1A4,A150,,,middle,Review
8216,2018,‘,LEFT SINGLE QUOTATION MARK,Pi,A,8165,A1AE,A1AE,A1A5,OpeningBracket,OpeningBracketVert.vert,open,OK
8217,2019,’,RIGHT SINGLE QUOTATION MARK,Pf,A,8166,A1AF,A1AF,A1A6,ClosingBracket,ClosingBracketVert.vert,close,OK
8220,201C,“,LEFT DOUBLE QUOTATION MARK,Pi,A,8167,A1B0,A1B0,A1A7,OpeningBracket,OpeningBracketVert.vert,open,OK
8221,201D,”,RIGHT DOUBLE QUOTATION MARK,Pf,A,8168,A1B1,A1B1,A1A8,ClosingBracket,ClosingBracketVert.vert,close,OK
8231,2027,‧,HYPHENATION POINT,Po,A,,,,A145,,,middle,Review
12289,3001,、,IDEOGRAPHIC COMMA,Po,W,8141,A1A2,A1A2,A142,PeriodComma Centered.tw,CenteredVert.tw,dot,Review
12290,3002,。,IDEOGRAPHIC FULL STOP,Po,W,8142,A1A3,A1A3,A143,PeriodComma Centered.tw,CenteredVert.tw,dot,Review
12296,3008,〈,LEFT ANGLE BRACKET,Ps,W,8171,A1B4,A1B4,A171,OpeningBracket,,open,Review
12297,3009,〉,RIGHT ANGLE BRACKET,Pe,W,8172,A1B5,A1B5,A172,ClosingBracket,,close,Review


### Only in CSS

In [63]:
# `x != x` is True only for NaN: rows with a CSS4 class but no class in
# either feature.
u.query('cspc != cspc and vcsp != vcsp and css4 == css4')

Unnamed: 0,hex,char,name,gc,eaw,cp932,cp936,cp949,cp950,cspc,vcsp,css4,review
183,00B7,·,MIDDLE DOT,Po,A,,A1A4,A1A4,A150,,,middle,Review
8231,2027,‧,HYPHENATION POINT,Po,A,,,,A145,,,middle,Review
12318,301E,〞,DOUBLE PRIME QUOTATION MARK,Pe,W,,A895,,A1AA,,,close,Review


### Only in features.txt

In [64]:
# Rows with a class in at least one feature but no CSS4 class
# (`css4 != css4` is True only for NaN).
u.query('(cspc == cspc or vcsp == vcsp) and css4 != css4')

Unnamed: 0,hex,char,name,gc,eaw,cp932,cp936,cp949,cp950,cspc,vcsp,css4,review
65040,FE10,︐,PRESENTATION FORM FOR VERTICAL COMMA,Po,W,,,,,,PeriodCommaVert,,Review
65041,FE11,︑,PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA,Po,W,,,,,,PeriodCommaVert,,Review
65042,FE12,︒,PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FUL...,Po,W,,,,,,PeriodCommaVert,,Review
65047,FE17,︗,PRESENTATION FORM FOR VERTICAL LEFT WHITE LENT...,Ps,W,,,,,,OpeningBracketVert,,Review
65048,FE18,︘,PRESENTATION FORM FOR VERTICAL RIGHT WHITE LEN...,Pe,W,,,,,,ClosingBracketVert,,Review
65077,FE35,︵,PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS,Ps,W,,A6E0,,A15F,,OpeningBracketVert,,Review
65078,FE36,︶,PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS,Pe,W,,A6E1,,A160,,ClosingBracketVert,,Review
65079,FE37,︷,PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET,Ps,W,,A6F0,,A163,,OpeningBracketVert,,Review
65080,FE38,︸,PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRA...,Pe,W,,A6F1,,A164,,ClosingBracketVert,,Review
65081,FE39,︹,PRESENTATION FORM FOR VERTICAL LEFT TORTOISE S...,Ps,W,,A6E2,,A167,,OpeningBracketVert,,Review


### Both are set

In [65]:
# Rows with a class in at least one feature and a CSS4 class as well.
u.query('(cspc == cspc or vcsp == vcsp) and css4 == css4')

Unnamed: 0,hex,char,name,gc,eaw,cp932,cp936,cp949,cp950,cspc,vcsp,css4,review
8216,2018,‘,LEFT SINGLE QUOTATION MARK,Pi,A,8165,A1AE,A1AE,A1A5,OpeningBracket,OpeningBracketVert.vert,open,OK
8217,2019,’,RIGHT SINGLE QUOTATION MARK,Pf,A,8166,A1AF,A1AF,A1A6,ClosingBracket,ClosingBracketVert.vert,close,OK
8220,201C,“,LEFT DOUBLE QUOTATION MARK,Pi,A,8167,A1B0,A1B0,A1A7,OpeningBracket,OpeningBracketVert.vert,open,OK
8221,201D,”,RIGHT DOUBLE QUOTATION MARK,Pf,A,8168,A1B1,A1B1,A1A8,ClosingBracket,ClosingBracketVert.vert,close,OK
12289,3001,、,IDEOGRAPHIC COMMA,Po,W,8141,A1A2,A1A2,A142,PeriodComma Centered.tw,CenteredVert.tw,dot,Review
12290,3002,。,IDEOGRAPHIC FULL STOP,Po,W,8142,A1A3,A1A3,A143,PeriodComma Centered.tw,CenteredVert.tw,dot,Review
12296,3008,〈,LEFT ANGLE BRACKET,Ps,W,8171,A1B4,A1B4,A171,OpeningBracket,,open,Review
12297,3009,〉,RIGHT ANGLE BRACKET,Pe,W,8172,A1B5,A1B5,A172,ClosingBracket,,close,Review
12298,300A,《,LEFT DOUBLE ANGLE BRACKET,Ps,W,8173,A1B6,A1B6,A16D,OpeningBracket,,open,Review
12299,300B,》,RIGHT DOUBLE ANGLE BRACKET,Pe,W,8174,A1B7,A1B7,A16E,ClosingBracket,,close,Review
