In [1]:
import sys

# Location of the typetokenQLVL checkout; the active value is the one to use on r2d2.
qlvldir = "/home/aardvark/code/typetokenQLVL"
# qlvldir = "/home/enzocxt/Projects/QLVL/typetoken_workdir/typetokenQLVL"

# Make the `qlvl` package importable. Guard against duplicates so that
# re-running this cell does not keep growing sys.path.
if qlvldir not in sys.path:
    sys.path.append(qlvldir)

In [2]:
import pandas as pd

In [3]:
from qlvl.conf import ConfigLoader
from qlvl.utils import pickle, unpickle
from qlvl import Vocab, TypeTokenMatrix
from qlvl import ItemFreqHandler, ColFreqHandler, TokenHandler
from qlvl import compute_association, compute_cosine, compute_distance, compute_simrank
from qlvl.basics.mxcalc import compute_token_weights, compute_token_vectors
from collections import Counter, defaultdict
from qlvl.models.typetoken import build_tc_weight_matrix
from scipy.spatial.distance import squareform
from qlvl.basics.terms import TypeNode
from sklearn.manifold import MDS

Parameter settings
Create a `conf` object to tune the settings. On initialization, `conf` has already read the default settings file (/home/aardvark/code/typetokenQLVL/qlvl/config.ini), which contains the default parameter settings.

In [14]:
# Create the configuration loader (see the note above: the default config.ini
# is read during construction).
conf = ConfigLoader()
# Keep a handle on the settings as loaded from the defaults.
# NOTE(review): if update_config() mutates conf.settings in place, this name
# will also reflect later overrides -- confirm against ConfigLoader.
default_settings = conf.settings

In [15]:
# Project root: replace with your own model directory.
rootdir = "/home/projects/semmetrix/chilecto/model/football"
new_conf = "%s/config.football.ini" % rootdir
settings = conf.update_config(new_conf)

# Echo the settings that matter so they can be checked before anything runs.
for option in ('line-machine', 'line-format'):
    print(settings[option])
print(*(settings[option] for option in ('type', 'colloc', 'token')))
for option in ('file-encoding', 'outfile-encoding'):
    print(settings[option])

# Corpus location as configured.
# corpus_name = 'ConceptsJocelyne'
corpus_path = settings['corpus-path']
print(corpus_path)

# Output location as configured; to redirect it, assign
# settings['output-path'] before reading it, e.g.
# settings['output-path'] = "/home/semmetrix/collmtx-stefano/"
output_path = settings['output-path']
print(output_path)

Using the default one: '~/tmp'!
([^\t]+)\t(\w)[^\t]+\t([^\t]+)
word,pos,lemma
lemma/pos lemma/pos lemma/pos/fid/lid
utf-8
utf-8
/home/projects/semmetrix/chilecto/corp/giga20181203
/home/projects/semmetrix/chilecto/model/football


In [16]:
# Directory holding the file-name lists (*.fnames) of the sub-corpora.
# NOTE(review): the corpus-path printed above lives under /home/projects/...,
# while this directory is under /home/semmetrix/... -- confirm this is intended.
fnames_dir = "/home/semmetrix/chilecto/corp/giga20181203"

# One list per sub-corpus (ml, tw) plus the combined ml+tw list.
fnames_ml, fnames_tw, fnames_mltw = (
    "{}/{}.fnames".format(fnames_dir, stem)
    for stem in ("ml_250m", "tw_250m", "ml_tw_500m")
)
# The "full" corpus is the combined ml+tw list.
fnames_full = fnames_mltw

# Corpus labels.
corpus_t1, corpus_t2, corpus_full = (
    'giga_mltw_' + suffix for suffix in ('t1', 't2', 'full')
)

## Load files and matrices

### Mainland Chinese + Taiwan Chinese vocab (500m)

In [17]:
# Load the combined (ML + TW) vocabulary from the output directory and
# print its summary statistics.
vocab_fname = "%s/giga_mltw_full.vocab" % output_path
vocab = Vocab.load(vocab_fname)
print(vocab.describe())

Total items: 2495280
Total freqs: 501064226
count  2.495280e+06
mean   2.008048e+02
std    2.685521e+04
min    1.000000e+00
25%    1.000000e+00
50%    2.000000e+00
75%    4.000000e+00
max    3.180893e+07


### Mainland Chinese vocab (250m)

In [38]:
# Load the Mainland Chinese vocabulary from the output directory and
# print its summary statistics.
vocab_ml_fname = "%s/ml250.type.vocab" % output_path
vocab_ml = Vocab.load(vocab_ml_fname)
print(vocab_ml.describe())

Total items: 1405671
Total freqs: 249980195
count  1.405671e+06
mean   1.778369e+02
std    1.811502e+04
min    1.000000e+00
25%    1.000000e+00
50%    2.000000e+00
75%    5.000000e+00
max    1.440590e+07


### Taiwan Chinese vocab (250m)

In [39]:
# Load the Taiwan Chinese vocabulary from the output directory and
# print its summary statistics.
vocab_tw_fname = "%s/tw250.type.vocab" % output_path
vocab_tw = Vocab.load(vocab_tw_fname)
print(vocab_tw.describe())

Total items: 1493367
Total freqs: 251084031
count  1.493367e+06
mean   1.681328e+02
std    1.762829e+04
min    1.000000e+00
25%    1.000000e+00
50%    2.000000e+00
75%    4.000000e+00
max    1.740304e+07


## Concept_variant

In [10]:
# CSV file listing the football concepts and their variants
# (read below via the 'concept' and 'variant' columns).
file_name = "/home/projects/semmetrix/chilecto/model/football/football.csv"

In [11]:
# Read the concept/variant table into a DataFrame (comma-separated file).
data = pd.read_csv(file_name,sep=",")

In [12]:
# Map every concept to the list of its variants. The 'variant' column holds
# one comma-separated string per concept; if a concept appears on several
# rows, the last row wins.
ConceptsVariants = {
    concept: variants.split(',')
    for concept, variants in zip(data['concept'], data['variant'])
}

In [13]:
# Display the concept -> variants mapping for a visual sanity check.
ConceptsVariants

{'attacker': ['前锋', '锋线', '射手'],
 'kickoff': ['开球', '开赛'],
 'offside': ['越位'],
 'goalkeeper': ['守门员', '门将'],
 'goal': ['进球', '得分', '破门'],
 'hands': ['手球', '手触球'],
 'cornerkick': ['角球'],
 'foul': ['犯规'],
 'referee': ['裁判', '主裁判', '主裁', '黑衣人'],
 'penalitykick': ['点球', '极刑', '十二码球', '十二码罚球'],
 'counterattack': ['反击', '反攻'],
 'defender': ['后卫', '防守队员', '防守球员', '防守端', '防线'],
 'cross': ['横传', '传中', '长传'],
 'freekick': ['任意球', '自由球'],
 'game': ['比赛', '赛事', '交锋', '竞赛', '球赛', '对阵', '对垒']}

## Variant frequency

### variant freq

Search for each variant under all of its tags (every vocabulary key of the form `variant/tag`).

In [18]:
def search_variant(variant, vocab_dict):
    """Collect the entries of ``vocab_dict`` whose word part equals ``variant``.

    The word part of a key is everything before its first '/'; a key
    without any '/' is compared as a whole.

    Parameters
    ----------
    variant : str
        Word to look for (without tag).
    vocab_dict : mapping
        Object providing ``keys()`` and item access by key.

    Returns
    -------
    dict
        ``{key: vocab_dict[key]}`` for every matching key; empty if none match.
    """
    return {
        entry: vocab_dict[entry]
        for entry in vocab_dict.keys()
        if entry.partition('/')[0] == variant
    }
# Quick check: list every tagged entry of one variant in the combined vocabulary.
search_variant('反击', vocab)

{'反击/VC': 6026, '反击/Nv': 1198}

In [122]:
# For every concept, list each variant and, for every tagged vocabulary key of
# that variant, its frequency in the Mainland (ML) and Taiwan (TW) vocabularies.
for key in ConceptsVariants:
    print(key, ':')
    for var in ConceptsVariants[key]:
        print('  - ', var)
        frq_ml = search_variant(var, vocab_ml)
        frq_tw = search_variant(var, vocab_tw)
        # Union of the keys found in either vocabulary. Sorted because plain
        # set iteration order can change between kernel sessions (string hash
        # randomisation), which made the printed order non-reproducible.
        for key_2 in sorted(set(frq_ml) | set(frq_tw)):
            # A key missing from one vocabulary counts as frequency 0 there.
            f1 = frq_ml.get(key_2, 0)
            f2 = frq_tw.get(key_2, 0)

            p_str = '     * %8s: ML=%6d   TW=%6d' % (key_2, f1, f2)
            print(p_str)
    print('')

attacker :
  -  进攻队员
  -  进攻球员
  -  前锋
     *    前锋/Na: ML=  7091   TW=  2398
  -  锋线
     *    锋线/Na: ML=   774   TW=    14
  -  射手
     *    射手/Na: ML=  2273   TW=   355

kickoff :
  -  开球
     *    开球/Nv: ML=    60   TW=   302
     *    开球/VA: ML=   494   TW=   846
  -  开赛
     *    开赛/Nv: ML=  8470   TW=  2771
     *    开赛/VH: ML=   135   TW=    28

offside :
  -  越位
     *    越位/VA: ML=   589   TW=    65

goalkeeper :
  -  守门员
     *   守门员/Na: ML=  3510   TW=   217
  -  门将
     *    门将/Na: ML=  3970   TW=   487

goal :
  -  进球
     *    进球/Na: ML=  5214   TW=   549
  -  得分
     *    得分/Nv: ML=  7403   TW=  3222
     *    得分/VH: ML=   174   TW=    53
     *    得分/Na: ML=   932   TW=   413
  -  破门
     *    破门/Nb: ML=  2818   TW=    63
     *    破门/Na: ML=   194   TW=     9

hands :
  -  手球
     *    手球/Na: ML=  2348   TW=   480
  -  手触球
     *   手触球/Na: ML=     4   TW=     1

cornerkick :
  -  角球
     *    角球/Na: ML=  1611   TW=    58

foul :
  -  犯规
     *    犯规/Nv: ML=   572   TW

[]

In [73]:
# Report the frequency of every variant in the combined vocabulary. Each
# variant string is looked up verbatim as a vocabulary key; absent keys are
# reported with frequency 0.
for concept, variants in ConceptsVariants.items():
    print(concept, ':')
    for variant in variants:
        frq = vocab[variant] if variant in vocab else 0
        print('    %s: %d' % (variant, frq))

attacker :
    进攻队员/Na: 0
    进攻球员/Na: 0
    前锋/Na: 9489
    锋线/Na: 788
    射手/Na: 2628
kickoff :
    开球/Na: 0
    开赛/Na: 0
offside :
    越位/Na: 0
goalkeeper :
    守门员/Na: 3727
    门将/Na: 4457
goal :
    进球/Na: 5763
    得分/Na: 1345
    破门/Na: 203
hands :
    手球/Na: 2828
    手触球/Na: 5
cornerkick :
    角球/Na: 1669
foul :
    犯规/Na: 0
referee :
    裁判/Na: 14730
    主裁判/Na: 0
    主裁/Na: 0
    黑衣人/Na: 0
penalitykick :
    点球/Na: 3408
    极刑/Na: 387
    十二码球/Na: 86
    十二码罚球/Na: 0
counterattack :
    防守反击/Na: 0
    防反/Na: 16
    反击/Na: 0
    反攻/Na: 0
defender :
    后卫/Na: 7942
    防守队员/Na: 0
    防守球员/Na: 0
    防守端/Na: 0
    防线/Na: 2926
cross :
    横传/Na: 0
    传中/Na: 0
    长传/Na: 0
    转移/Na: 0
freekick :
    任意球/Na: 1904
    自由球/Na: 180
game :
    比赛/Na: 315098
    赛事/Na: 19891
    交锋/Na: 0
    竞赛/Na: 25890
    球赛/Na: 4631
    对阵/Na: 0
    对垒/Na: 0


### variant freq in ML / TW

In [74]:
# Report, for every variant, its frequency in the Mainland (ML) and Taiwan (TW)
# vocabularies. Each variant string is looked up verbatim as a vocabulary key;
# absent keys are reported with frequency 0.
for concept, variants in ConceptsVariants.items():
    print(concept, ':')
    for variant in variants:
        frq_ml = vocab_ml[variant] if variant in vocab_ml else 0
        frq_tw = vocab_tw[variant] if variant in vocab_tw else 0
        print('    %s: \tML=%d \tTW=%d' % (variant, frq_ml, frq_tw))

attacker :
    进攻队员/Na: 	ML=0 	TW=0
    进攻球员/Na: 	ML=0 	TW=0
    前锋/Na: 	ML=7091 	TW=2398
    锋线/Na: 	ML=774 	TW=14
    射手/Na: 	ML=2273 	TW=355
kickoff :
    开球/Na: 	ML=0 	TW=0
    开赛/Na: 	ML=0 	TW=0
offside :
    越位/Na: 	ML=0 	TW=0
goalkeeper :
    守门员/Na: 	ML=3510 	TW=217
    门将/Na: 	ML=3970 	TW=487
goal :
    进球/Na: 	ML=5214 	TW=549
    得分/Na: 	ML=932 	TW=413
    破门/Na: 	ML=194 	TW=9
hands :
    手球/Na: 	ML=2348 	TW=480
    手触球/Na: 	ML=4 	TW=1
cornerkick :
    角球/Na: 	ML=1611 	TW=58
foul :
    犯规/Na: 	ML=0 	TW=0
referee :
    裁判/Na: 	ML=10564 	TW=4166
    主裁判/Na: 	ML=0 	TW=0
    主裁/Na: 	ML=0 	TW=0
    黑衣人/Na: 	ML=0 	TW=0
penalitykick :
    点球/Na: 	ML=3400 	TW=8
    极刑/Na: 	ML=193 	TW=194
    十二码球/Na: 	ML=0 	TW=86
    十二码罚球/Na: 	ML=0 	TW=0
counterattack :
    防守反击/Na: 	ML=0 	TW=0
    防反/Na: 	ML=16 	TW=0
    反击/Na: 	ML=0 	TW=0
    反攻/Na: 	ML=0 	TW=0
defender :
    后卫/Na: 	ML=6388 	TW=1554
    防守队员/Na: 	ML=0 	TW=0
    防守球员/Na: 	ML=0 	TW=0
    防守端/Na: 	ML=0 	TW=0
    防线/Na: 	ML=2056 	TW=