In [1]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import binned_statistic_dd
import pandas as pd
from itertools import product
import os
import sys
from scipy.spatial.distance import pdist
from sklearn.manifold import MDS
from scipy.stats import spearmanr
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity
sys.path.append("../data_extraction/")
from utils import flatten_logic, Serialization
from functools import reduce
from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D
from multiprocessing import Pool
tqdm.pandas()

FEATURE_COLUMNS = ['Valence', 'Arousal', 'Dominance', 'Politeness', 'Formality']
NUM_QUANTILES = 4



In [7]:
def get_nonzero_prop(df):
    print(np.round(np.count_nonzero(df)/df.size, 2))

def pmi(df, positive=True):
    col_totals = df.sum(axis=0)
    total = col_totals.sum()
    row_totals = df.sum(axis=1)
    expected = np.outer(row_totals, col_totals) / total
    df = df / expected
    # Silence distracting warnings about log(0):
    with np.errstate(divide='ignore'):
        df = np.log(df)
    df[np.isinf(df)] = 0.0  # log(0) = 0
    if positive:
        df[df < 0] = 0.0
        df = np.nan_to_num(df)
    return df


def create_derived_representation(matrix):
    matrix_np = matrix.to_numpy()
    print(matrix_np.shape)
    ppmi = pmi(matrix_np)
    get_nonzero_prop(ppmi)
    P, D, Q = np.linalg.svd(ppmi, full_matrices=False)
    return matrix_np, ppmi, P, D, Q

def scree_plot(sing_val_matrix):
    var_explained = np.round(sing_val_matrix**2/np.sum(sing_val_matrix**2), decimals=3)
    total_var_explained = np.concatenate(([0], np.cumsum(var_explained)))
    print(var_explained[:10])
    print(total_var_explained[:10])
    plt.clf()
    plt.plot(np.arange(len(var_explained[:10])), var_explained[:10])
    plt.xlabel("Component Number")
    plt.ylabel("Variance Explained")
    plt.title("Proportion of Variance Explained by Each Singular Value")
    plt.show()
    plt.clf()
    plt.plot(np.arange(-1, len(total_var_explained[:10])), total_var_explained[:11])
    plt.ylim(0, np.max(total_var_explained[:11]) + 0.02)
    plt.xlabel("Component Number")
    plt.ylabel("Cumulative Variance Explained")
    plt.title("Cumulative Variance Explained by Each Singular Value")
    plt.show()


In [4]:
df = Serialization.load_obj(f"stance_pipeline_full_data_{NUM_QUANTILES}_quantiles_full_data")
x = df.groupby("bin").mean()[FEATURE_COLUMNS]

# Get all unique markers
all_markers = sorted(df['rel_marker'].unique())
all_markers = [marker for marker in all_markers if marker not in ["'d", "10x"]]
df = df[df['rel_marker'].isin(all_markers)]
# Combine the subreddit and marker and aggregate
df['sub_marker'] = df["subreddit"] + "_" + df['rel_marker']
# agg = df.groupby(["bin", "sub_marker"]).count()

# Get all unique communities and bins
comms = df['subreddit'].unique()
markers = df['rel_marker'].unique()
bins = df['bin'].unique()
com_markers = list(product(comms, markers))
com_markers = ["_".join(pair) for pair in com_markers]
len(com_markers)

102304

In [5]:
# Need probability
sem_sit_counts_per_community = df.groupby(["subreddit", "bin"]).count()[['sub_marker', "Valence"]]
all_sub_counts = pd.DataFrame(0, index=pd.MultiIndex.from_product([bins, comms], names=["bin", "subreddit"]), columns=sem_sit_counts_per_community.columns)
sem_sit_counts_per_community = sem_sit_counts_per_community.add(all_sub_counts, fill_value=0)
sem_sit_counts_per_community['percent'] = sem_sit_counts_per_community.groupby(level=0)['sub_marker'].transform(lambda x: (x / x.sum()))
com_to_need = {}
for sub in comms:
    need_vec = sem_sit_counts_per_community.loc[sub]['percent']
    com_to_need[sub] = need_vec.to_numpy()
need_df = pd.DataFrame(com_to_need).T
need_df.columns = sem_sit_counts_per_community.loc[sub].index
need_df.describe()

bin,V1A1D1P1F1,V1A1D1P1F2,V1A1D1P1F3,V1A1D1P1F4,V1A1D1P2F1,V1A1D1P2F2,V1A1D1P2F3,V1A1D1P2F4,V1A1D1P3F1,V1A1D1P3F2,...,V4A4D4P2F3,V4A4D4P2F4,V4A4D4P3F1,V4A4D4P3F2,V4A4D4P3F3,V4A4D4P3F4,V4A4D4P4F1,V4A4D4P4F2,V4A4D4P4F3,V4A4D4P4F4
count,92.0,92.0,92.0,92.0,92.0,92.0,92.0,92.0,92.0,92.0,...,92.0,92.0,92.0,92.0,92.0,92.0,92.0,92.0,92.0,92.0
mean,0.002242,0.001492,0.001071,0.000823,0.002292,0.001789,0.001374,0.001056,0.002047,0.001745,...,0.001082,0.001279,0.001109,0.001494,0.001845,0.002198,0.003359,0.003206,0.003687,0.004154
std,0.000793,0.000523,0.000456,0.000621,0.000692,0.000452,0.000405,0.000707,0.000603,0.000389,...,0.000492,0.000641,0.000556,0.000786,0.001063,0.001346,0.002517,0.001872,0.002484,0.003216
min,0.000256,0.00048,0.000215,6.4e-05,0.000428,0.000652,0.000391,0.000201,0.000624,0.000715,...,0.000271,0.000199,8.1e-05,0.000231,0.000359,0.000231,0.000319,0.000393,0.00068,0.000864
25%,0.001727,0.001114,0.000678,0.000384,0.001901,0.001484,0.001109,0.000587,0.001703,0.001515,...,0.00073,0.000784,0.000652,0.000885,0.001072,0.001267,0.001901,0.001737,0.001892,0.00208
50%,0.002203,0.001507,0.001046,0.000712,0.002257,0.001824,0.001352,0.000923,0.002007,0.001697,...,0.000918,0.001241,0.000952,0.001174,0.001484,0.001792,0.002993,0.002674,0.002711,0.002887
75%,0.002706,0.001788,0.001381,0.001072,0.002725,0.002047,0.001656,0.001253,0.002287,0.00204,...,0.001363,0.001689,0.001509,0.002097,0.002545,0.00291,0.004382,0.004316,0.005333,0.005505
max,0.005237,0.003102,0.002742,0.003895,0.004494,0.003225,0.002556,0.005554,0.00489,0.002832,...,0.002731,0.003125,0.002527,0.003472,0.006084,0.007752,0.021749,0.009998,0.014249,0.021208


In [6]:
pav_matrix = Serialization.load_obj("pavalanathan_cooc_data_full_data")
cooc_matrix = Serialization.load_obj("our_cooc_data_full_data")

In [9]:
cooc_sum = cooc_matrix.sum()
adjusted_cooc_matrix = cooc_matrix[cooc_sum[cooc_sum != 0].index]

In [10]:
cooc_matrix.shape

(1024, 102304)

In [34]:
our_cooc, our_ppmi, our_p, our_d, our_q = create_derived_representation(cooc_matrix)
# scree_plot(our_d)
# adjusted_cooc, adjusted_ppmi, adjusted_p, adjusted_d, adjusted_q = create_derived_representation(adjusted_cooc_matrix)

(1024, 102304)


  if __name__ == "__main__":


0.07


In [37]:
new_Q = our_q[:, :3]
com_to_stance_usage = {}
comms = sorted(comms)
for i in range(len(comms)):
    sub = new_Q[i*len(markers):(i+1)*len(markers), :]
    # sub.columns = [col[col.index("_") + 1:] for col in sub.columns]
    com_to_stance_usage[comms[i]] = sub


pairs = []
need_sims = []
stance_sims = []
adjusted_stance_sims = []
com_to_markers = {}
for com_1 in tqdm(comms):
    for com_2 in comms:
        if com_1 != com_2:
            if sorted((com_1, com_2)) in pairs:
                continue
            pairs.append(sorted((com_1, com_2)))
            need_sims.append(cosine_similarity(com_to_need[com_1].reshape(1, -1), com_to_need[com_2].reshape(1, -1))[0][0])
            stance_sims.append(np.mean(cosine_similarity(com_to_stance_usage[com_1], com_to_stance_usage[com_2]).diagonal()))

            if com_1 not in com_to_markers:
                shdf = cooc_matrix[[col for col in cooc_matrix.columns if col[:col.rfind("_")] == com_1]]
                markers_in_shdf = cosine_similarity(shdf.to_numpy().T, shdf.to_numpy().T).diagonal().nonzero()
                com_1_markers = set(markers_in_shdf[0])
                com_to_markers[com_1] = com_1_markers
            if com_2 not in com_to_markers:
                d3df = cooc_matrix[[col for col in cooc_matrix.columns if col[:col.rfind("_")] == com_2]]
                markers_in_d3df = cosine_similarity(d3df.to_numpy().T, d3df.to_numpy().T).diagonal().nonzero()
                com_2_markers = set(markers_in_d3df[0])
                com_to_markers[com_2] = com_2_markers

            common_markers = list(com_to_markers[com_1].intersection(com_to_markers[com_2]))
            adjusted_stance_sims.append(np.mean(cosine_similarity(com_to_stance_usage[com_1][common_markers, :], com_to_stance_usage[com_2][common_markers, :]).diagonal()))

  0%|          | 0/92 [00:00<?, ?it/s]


ValueError: Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required by check_pairwise_arrays.