In [None]:
import fix_notebook_imports

from src import util

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt
import seaborn as sns
# Notebook-wide plotting defaults: default figure size (width, height in
# inches) and a 10-colour "bright" palette.
sns.set(rc={"figure.figsize": (11.7, 8.27)})
palette = sns.color_palette(palette="bright", n_colors=10)


In [None]:
# Load the per-country World Values Survey table from the raw-data directory.

wvs_csv_path = f"{util.DATA_RAW_DIR}/WVS_per_country.csv"
df = pd.read_csv(wvs_csv_path)


In [None]:
# Keep one row per country using only the most recent data about that country.
# Only waves 6, 5 and 4 are considered; rows from any other wave are discarded.

CONSIDERED_WAVES = [6, 5, 4]

in_considered_wave = df["Wave"].isin(CONSIDERED_WAVES)

# Order candidate rows newest-wave-first. mergesort is stable, so rows of the
# same wave keep their original relative order and drop_duplicates therefore
# retains the first-listed row of each country's newest wave.
latest_rows = (
    df[in_considered_wave]
    .sort_values("Wave", ascending=False, kind="mergesort")
    .drop_duplicates(subset="Country", keep="first")
)

# Selecting through a mask (rather than using latest_rows directly) preserves
# the original row order of the table. Relies on df having a unique index,
# which is what pd.read_csv produces by default.
df = df[df.index.isin(latest_rows.index)]


In [None]:
# Only include meaningful categories: a column survives when the first
# character of its lower-cased name is one of the prefixes listed below.

starts_with = [
    "a",  # Perceptions of life
    "b",  # Environment
    "c",  # Work
    "d",  # Family
    "e",  # Politics and Society
    "f",  # Religion and Morale
    "g",  # National Identity
    "h",  # Security
    "i",  # Science
]

drop_columns = []
for col in df.columns:
    first_letter = col.lower()[0]
    if first_letter not in starts_with:
        drop_columns.append(col)

df = df.drop(columns=drop_columns)


In [None]:
# Drop specialty columns except standard deviation data: any column whose name
# contains an underscore is removed unless it ends in "_sd" (case-insensitive).

drop_columns = [
    col
    for col in df.columns
    if "_" in col and not col.lower().endswith("_sd")
]
df = df.drop(columns=drop_columns)


In [None]:
# Drop columns with too much missing information.
#
# At most MAX_IMPUTED_PER_COLUMN missing values per column are replaced by that
# column's mean; a column with more gaps than that still contains NaN after the
# fill and is removed by dropna.

MAX_IMPUTED_PER_COLUMN = 6

# numeric_only=True is required because the frame still carries text columns
# (e.g. "Country"): pandas >= 2.0 raises TypeError when asked for their mean,
# while older versions silently skipped them -- which is what this reproduces.
column_means = df.mean(numeric_only=True)

df = df.fillna(column_means, limit=MAX_IMPUTED_PER_COLUMN)
df = df.dropna(axis=1)


In [None]:
# Drop non-numeric columns (besides Country label)

# np.float was merely an alias of the builtin float and was removed in
# NumPy 1.24; np.float64 selects exactly the same (float64) columns.
# NOTE(review): integer-typed columns are excluded by this filter as well --
# confirm that is intended.
numeric_columns = list(df.select_dtypes(include=[np.float64]).columns)
keep_columns = ["Country"] + numeric_columns
df = df.filter(keep_columns)


In [None]:
# Normalize columns (mean=0, std=1)

def _standardize(column):
    """Centre a column on its mean and divide by its standard deviation."""
    centered = column - column.mean()
    return centered / column.std()


df_normalized = df.filter(numeric_columns).apply(_standardize)
df_normalized


In [None]:
# Number of feature columns that feed the embeddings below.
n_dimensions = df_normalized.filter(numeric_columns).shape[1]
print(f"Dimensionality: {n_dimensions}")


In [None]:
def plot_embeddings_2D(df_embedded, method):
    """Draw a labelled 2-D scatter plot of the embedded countries.

    Parameters
    ----------
    df_embedded : pandas.DataFrame
        Must provide the columns ``dim_1`` and ``dim_2`` (point coordinates)
        and ``Country`` (text written next to each point).
    method : str
        Name of the embedding method; only used in the plot title.
    """
    fig, ax = plt.subplots(figsize=(20, 12))
    sns.scatterplot(data=df_embedded, x="dim_1", y="dim_2", ax=ax)
    ax.set_title(f'Culture Embeddings (Data: World Values Survey, Method: {method})')

    # Write each country's name slightly to the right of its marker.
    label_offset = .02
    coordinates_and_names = zip(
        df_embedded.dim_1, df_embedded.dim_2, df_embedded.Country
    )
    for x_pos, y_pos, country in coordinates_and_names:
        ax.text(x_pos + label_offset, y_pos, str(country))


In [None]:
def plot_embeddings_3D(df_embedded, method):
    """Draw a labelled 3-D scatter plot of the embedded countries.

    Parameters
    ----------
    df_embedded : pandas.DataFrame
        Must provide the columns ``dim_1``, ``dim_2`` and ``dim_3`` (point
        coordinates) and ``Country`` (text written next to each point).
    method : str
        Name of the embedding method; only used in the plot title.
    """
    fig = plt.figure(figsize=(20, 12))
    # Axes3D(fig) no longer attaches itself to the figure on current Matplotlib
    # releases, which left the figure empty and made plt.gca() hand back a new
    # 2-D axes. Requesting the projection through add_subplot works on both
    # old and new versions.
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(
        df_embedded["dim_1"],
        df_embedded["dim_2"],
        df_embedded["dim_3"]
    )

    ax.set_title(f'Culture Embeddings (Data: World Values Survey, Method: {method})')

    # Write each country's name slightly offset along x from its marker, on the
    # same 3-D axes that holds the scatter (not whatever plt.gca() returns).
    label_offset = .02
    coordinates_and_names = zip(
        df_embedded["dim_1"],
        df_embedded["dim_2"],
        df_embedded["dim_3"],
        df_embedded["Country"],
    )
    for x_pos, y_pos, z_pos, country in coordinates_and_names:
        ax.text(x_pos + label_offset, y_pos, z_pos, str(country))
    

In [None]:
# Plot PCA: fit on the normalized features, then plot the first three
# principal components in 2-D (first two) and 3-D.

pca = PCA()
normalized_features = df_normalized.filter(numeric_columns).values
X_embedded = pca.fit_transform(normalized_features)

leading_components = X_embedded[:, :3]
df_pca = pd.DataFrame(leading_components, columns=["dim_1", "dim_2", "dim_3"])
# Labels are attached positionally, so row order must match df.
df_pca["Country"] = list(df["Country"])

plot_embeddings_2D(df_pca, method="PCA")
plot_embeddings_3D(df_pca, method="PCA")


In [None]:
# Plot t-SNE

# t-SNE is stochastic: without a fixed seed the layout changes on every run,
# so the figure could not be reproduced with Restart & Run All.
TSNE_RANDOM_STATE = 0

tsne_2D = TSNE(
    n_components=2,
    perplexity=10,
    learning_rate=100,
    random_state=TSNE_RANDOM_STATE,
)
X_embedded_2D = tsne_2D.fit_transform(df_normalized.filter(numeric_columns).values)
df_tsne_2D = pd.DataFrame(X_embedded_2D, columns=["dim_1", "dim_2"])
# Labels are attached positionally, so row order must match df.
df_tsne_2D["Country"] = list(df["Country"])
plot_embeddings_2D(df_tsne_2D, method="t-SNE")
