In [103]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture as GMM
from sklearn.decomposition import PCA, FastICA
from sklearn import linear_model
from matplotlib import pyplot as plt, cm as cm, mlab as mlab
import seaborn as sns; sns.set()
import progressbar as pb
import time 
import math

In [104]:
# read csv/excel data files 
pnas_data1 = pd.read_csv('/home/jaeweon/research/data/pnas_data1.csv')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [118]:
# format data 

# extract 9 Complexity Characteristic variables 
features = ['PolPop', 'PolTerr', 'CapPop', 'levels', 'government','infrastr', 'writing', 'texts', 'money']
feature_time = ['Time'] + features

# take subset of original data table with 9 CCs and change it into numpy array 
data_mat = StandardScaler().fit_transform(pnas_data1.loc[:, features].values)
time = pnas_data1.loc[:, ['Time']].values

In [133]:
#Gaussian Mixture Model 
#fit GMM
gmm = GMM(n_components=2).fit(data_mat)
cov = gmm.covariances_
prob_distr = gmm.predict_proba(data_mat)
print(gmm.weights_[0], gmm.weights_[1])

# determine to which of the two gaussians each data point belongs by looking at probability distribution 
gauss1_idx = [i for i in range(len(prob_distr)) if prob_distr[i][0] >= prob_distr[i][1]]
gauss2_idx = [j for j in range(len(prob_distr)) if prob_distr[j][1] >= prob_distr[j][0]]

gauss1_time = [time[i] for i in gauss1_idx] # time for the first gaussian data
gauss2_time = [time[j] for j in gauss2_idx] # time for the second gaussian data

gauss1_point = [data_mat[i] for i in gauss1_idx] # 9-d data point for the first gaussian
gauss2_point = [data_mat[j] for j in gauss2_idx] # 9-d data point for the second gaussian

0.397523705222 0.602476294778


In [137]:
# main eigenvectors for covariances of each gaussians
eigval1, eigvec1 = np.linalg.eig(cov[0])
eigval2, eigvec2 = np.linalg.eig(cov[1])

# find the eigenvector corresponding to the largest eigenvalue for each of the two gaussians
max_eigvec1 = eigvec1[:, np.argmax(max(eigval1))] 
max_eigvec2 = eigvec2[:, np.argmax(max(eigval2))]

gauss1_proj = np.matmul(gauss1_point, max_eigvec1)
gauss2_proj = np.matmul(gauss2_point, max_eigvec2)



In [135]:
# Multiple linear regression over time
ols1 = linear_model.LinearRegression()
ols2 = linear_model.LinearRegression()
model1 = ols1.fit(gauss1_time, gauss1_proj)
model2 = ols2.fit(gauss2_time, gauss2_proj)

print("coefficients for the first gaussian: ", model1.coef_)
print("intercept for the first gaussian: ", model1.intercept_)
print("coefficients for the second gaussian: ",  model2.coef_)
print("intercept for the second gaussian: ", model2.intercept_)

coefficients for the first gaussian:  [-0.00018419]
intercept for the first gaussian:  2.62177559821
coefficients for the second gaussian:  [ 0.00054835]
intercept for the second gaussian:  1.35529871633


In [83]:
# pca components 
pca = PCA(n_components=9)
pca.fit(data_mat)
components = pca.components_

In [84]:
# calculate angle between two vectors 
def angle(vec1, vec2):
    """
    Given two vectors, compute the angle between the vectors
    """
    assert vec1.shape == vec2.shape
    
    cos_vec = np.inner(vec1, vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2))
    angle = math.acos(cos_vec)
    in_deg = math.degrees(angle)
    if in_deg >= 90:
        return (180-in_deg)
    return in_deg


In [138]:
# examine where the angle between the two main eigenvectors for each gaussian comes from
comp1 = np.matmul(max_eigvec1.T, components)
comp2 = np.matmul(max_eigvec2.T, components)

print(comp1)
print(comp2)
for i in range(1, len(components)): # angle using only some components
    print("angle using first %s components: " %(i+1), angle(comp1[:i+1], comp2[:i+1]))

[ 0.44786061  0.06604639  0.1315556   0.53394828 -0.19907041  0.21932552
  0.0070829   0.39071195 -0.50220753]
[-0.35292858 -0.1346638  -0.38086179 -0.2442904  -0.12845045 -0.03122347
 -0.08851483 -0.47627445  0.63279296]
angle using first 2 components:  12.495913311568557
angle using first 3 components:  30.86984463794994
angle using first 4 components:  34.137423612232794
angle using first 5 components:  43.81706490089974
angle using first 6 components:  45.12623621762637
angle using first 7 components:  45.621579754337915
angle using first 8 components:  39.658912681329156
angle using first 9 components:  33.56245604869352


In [None]:
# 