# PCA  on Order Features

In [1]:
import yaml,logging,numpy,pandas
from optparse import OptionParser
from feature import languages,Feature
from matplotlib import pyplot as plt
%matplotlib inline
features = yaml.load(open('features.yml'))
logging.basicConfig(level=logging.ERROR)

In [2]:
order_features = pandas.read_csv('word-order-features.csv')
order_features.describe()

Unnamed: 0,representation
count,56.0
mean,568.821429
std,511.034427
min,5.0
25%,66.0
50%,446.0
75%,1144.75
max,1519.0


Meaning there are 56 word-order related features according to WALS. They cover 568 languages on average with very large variance. We can crop the top half and see what gives.

In [3]:
# get the order features corresponding to quantile q
def qfeatures(q):
    hingh = order_features['representation'].quantile(q)
    return order_features[order_features['representation'] >= hingh]['id'].values
    
# get the submatrix corresponding to a quantile
def qlangs(q):
    cols = [c for c in languages.columns if c.split(" ")[0] in qfeatures(q)]
    qlangs = languages[cols]
    qlangs = qlangs.replace(to_replace=".+",regex=True,value=1)
    qlangs = qlangs.replace(to_replace='',value=0)
    qlangs['features_count'] = qlangs.apply(lambda x: sum(x), axis=1)
    return qlangs[qlangs['features_count'] == len(cols)]

for q in [0.7,0.8,0.9]:
    print("Number of languages that Share the top",q, "of order features:",len(qlangs(q)))

Number of languages that Share the top 0.7 of order features: 421
Number of languages that Share the top 0.8 of order features: 780
Number of languages that Share the top 0.9 of order features: 1016


sanity check:

In [4]:
qlangs09 = qlangs(0.9)
qlangs09

Unnamed: 0,"81A Order of Subject, Object and Verb",82A Order of Subject and Verb,83A Order of Object and Verb,87A Order of Adjective and Noun,143F Postverbal Negative Morphemes,143E Preverbal Negative Morphemes,143A Order of Negative Morpheme and Verb,143G Minor morphological means of signaling negation,features_count
0,1,1,1,1,1,1,1,1,8
7,1,1,1,1,1,1,1,1,8
8,1,1,1,1,1,1,1,1,8
10,1,1,1,1,1,1,1,1,8
12,1,1,1,1,1,1,1,1,8
13,1,1,1,1,1,1,1,1,8
16,1,1,1,1,1,1,1,1,8
17,1,1,1,1,1,1,1,1,8
20,1,1,1,1,1,1,1,1,8
23,1,1,1,1,1,1,1,1,8


In [33]:
mat09 = numpy.zeros((0,len(qlangs09)))
numfeats = 0
for name,data in features.items():
    if data['wals_id'] in order_features['id'].values:
        print(name+":",data['desc'])
        numfeats += 1
        feat =  Feature(name,data)
        mat09 = numpy.vstack((mat09,feat.get_languages(qlangs09.index)))

isvso: binarising 81A on VSO (3)
word_order1: binarising the wals feature to 1 if there is some order, -1 if not (that the wals 7 for this feature)
isvs: binarising 82A  on the VS value (2)
isosv: binarising 81A on OSV (6)
issv: binarising 82A  on the SV value (1)
issvo: binarising 81A on the SVO value (2), meaning it returns 1 if the language is SVO, -1 otherwise
word_order2: binarizing 82A on existance of value (3)
isvos: binarising 81A on VOS (4)
adj_noun_adj: binarizing 87A on the case where both orders are found (3)
word_order3: binarizing 87A on existance of value (3)
issov: binarising 81A on the SOV value (1)
isovs: binarising 81A on OVS (5)
adj_noun: binarizing 87A on AdjN (1)
noun_adj: binarizing 87A on NAdj (2)


In [34]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(mat09)
PCA(copy=True, iterated_power='auto', n_components=3, random_state=None, svd_solver='auto', tol=0.0, whiten=False)
print(pca.explained_variance_ratio_)

[ 0.56475388  0.19397961  0.09862021]


In [32]:
print(mat09.shape)
cov = numpy.cov(mat09)
print(cov.shape)
numpy.cov(mat09)[:5,:5]

(14, 1016)
(14, 14)


array([[  2.87122299e-01,   3.67712657e-02,   1.49230053e-01,
         -6.12854428e-04,  -2.48206043e-01],
       [  3.67712657e-02,   4.17051317e-01,   7.67697141e-02,
          9.30918118e-04,   2.07563710e-01],
       [  1.49230053e-01,   7.67697141e-02,   9.99274660e-01,
         -4.10379737e-03,  -2.27555176e-01],
       [ -6.12854428e-04,   9.30918118e-04,  -4.10379737e-03,
          7.86625810e-03,   1.59807610e-03],
       [ -2.48206043e-01,   2.07563710e-01,  -2.27555176e-01,
          1.59807610e-03,   6.47220822e-01]])