In [1]:
import ast
from collections import Counter
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import binned_statistic_dd, truncnorm
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

In [2]:
# Names of the DataFrame columns that span the feature space used for binning.
FEATURE_COLUMNS = [
    'Valence',
    'Arousal',
    'Dominance',
    'Politeness',
    'Formality',
]

In [3]:
def get_bins(data, num_bins, value_range=(0.0, 1.0)):
    """Assign every row of ``data`` to a cell of a regular grid.

    Each dimension of the feature space is cut into ``num_bins`` equal-width
    bins spanning ``value_range``.  Bin indices follow the convention of
    ``scipy.stats.binned_statistic_dd``: along one dimension, an index of ``i``
    means the value lies between ``bin_edges[i-1]`` and ``bin_edges[i]``;
    index ``0`` and index ``num_bins + 1`` are the out-of-range bins below and
    above ``value_range``.

    Parameters
    ----------
    data : np.ndarray of shape (n_points, n_dims)
        Points to bin; one row per point, one column per dimension.
    num_bins : int
        Number of equally sized bins per dimension.
    value_range : tuple of (lower, upper), optional
        Interval covered by the bins in every dimension.  Defaults to
        ``(0.0, 1.0)``, the range that was previously hard-coded.

    Returns
    -------
    binnumber : np.ndarray of shape (n_points,)
        One unique integer per grid cell (the linearised bin index).
    unraveled_binnumber : np.ndarray of shape (n_points, n_dims)
        The same assignment expressed as one bin index per dimension.
    """
    # Initialize uniformly-sized bins, shared by all dimensions
    lower, upper = value_range
    bin_edges = np.linspace(lower, upper, num_bins + 1)
    n_dims = data.shape[1]

    # Only the bin assignment is used, so bin once with the cheap "count"
    # statistic instead of running the binning twice (once per output format).
    _, _, unraveled_binnumber = binned_statistic_dd(
        data, np.arange(len(data)),
        statistic="count",
        bins=[bin_edges] * n_dims,
        expand_binnumbers=True)

    # scipy linearises bin numbers over a grid that has two extra (outlier)
    # bins per dimension, i.e. num_bins + 2 slots; reproduce that numbering.
    # atleast_2d covers n_dims == 1, where scipy returns a flat array.
    grid_shape = (num_bins + 2,) * n_dims
    binnumber = np.ravel_multi_index(tuple(np.atleast_2d(unraveled_binnumber)),
                                     grid_shape)

    # Return the bin IDs
    return binnumber, unraveled_binnumber.transpose()

In [46]:
# Load the per-comment metadata (with VAD scores) and index it by comment id.
df = pd.read_csv("/ais/hal9000/datasets/reddit/jai_stance_embeddings/unmasked/2014_01_files_metadata_vad.csv")
df = df.set_index("id")
# rel_marker is stored as the text of a Python sequence (presumably a list of
# marker strings -- confirm against the CSV); keep only its first element.
# ast.literal_eval parses literals without executing code, unlike eval(), which
# would run whatever expression a CSV cell happens to contain.
df['rel_marker'] = df['rel_marker'].apply(lambda marker_repr: ast.literal_eval(marker_repr)[0])

In [47]:
# Min-max rescale Politeness so that its smallest value maps to 0 and its largest to 1.
politeness = df["Politeness"]
politeness_min, politeness_max = politeness.min(), politeness.max()
df['Politeness'] = (politeness - politeness_min) / (politeness_max - politeness_min)

In [48]:
# Bin every row on a grid with 2 bins per feature dimension.
# `bins` holds one flat bin id per row, `ubins` one bin index per dimension.
feature_matrix = df[FEATURE_COLUMNS].to_numpy()
bins, ubins = get_bins(feature_matrix, 2)

In [49]:
def get_bin_centroids(data, bin_idx):
    """Calculate the centroid of all the points that lie within each bin.

    Use get_bins on the data first to get the bin_idx for each point.

    Parameters
    ----------
    data : pandas.DataFrame or array-like of shape (n_points, n_dims)
        The points, one per row.  Anything exposing ``to_numpy()`` is
        converted through it; everything else goes through ``np.asarray``.
    bin_idx : sequence of length n_points
        The bin id of each point, in the same order as the rows of ``data``.
        Ids are used as dictionary keys, so they must be hashable scalars
        (the flat bin numbers, not the per-dimension index rows).

    Returns
    -------
    dict
        Maps each bin id that occurs in ``bin_idx`` to the mean of its points
        (an array of length n_dims).  Bins without points are absent.

    Raises
    ------
    ValueError
        If ``data`` and ``bin_idx`` do not have the same number of entries.
    """
    points = data.to_numpy() if hasattr(data, "to_numpy") else np.asarray(data)
    labels = np.asarray(bin_idx)

    # zip() would silently drop the surplus entries of the longer input and
    # produce centroids from misaligned data, so reject mismatches up front.
    if len(points) != len(labels):
        raise ValueError(
            "data and bin_idx must have the same length, got "
            f"{len(points)} and {len(labels)}")

    # Collect the member points of every bin that actually occurs
    members = defaultdict(list)
    for point, label in zip(points, labels):
        members[label].append(point)

    # Average the members of each bin along the point axis
    return {label: np.mean(rows, axis=0) for label, rows in members.items()}

In [33]:
# Mean feature vector of the rows in each occupied bin, keyed by flat bin id.
centroids = get_bin_centroids(data=df[FEATURE_COLUMNS], bin_idx=bins)

In [50]:
# Attach each row's flat bin id as a new 'bin' column.
df = df.assign(bin=bins)

In [56]:
# Count the non-null entries of every remaining column per
# (subreddit, bin, rel_marker) combination.
group_keys = ["subreddit", "bin", "rel_marker"]
df.groupby(group_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,author,body,created_utc,marker_category,len,Valence,Arousal,Dominance,Politeness,Formality
subreddit,bin,rel_marker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
actrade,341,afraid,1,1,1,1,1,1,1,1,1,1
actrade,341,disgusting,1,1,1,1,1,1,1,1,1,1
actrade,341,fancy,1,1,1,1,1,1,1,1,1,1
actrade,341,happily,1,1,1,1,1,1,1,1,1,1
actrade,341,hate,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
writingprompts,682,unexpected,1,1,1,1,1,1,1,1,1,1
writingprompts,682,want,32,32,32,32,32,32,32,32,32,32
writingprompts,682,wish,30,30,30,30,30,30,30,30,30,30
writingprompts,682,worried,1,1,1,1,1,1,1,1,1,1


In [45]:
# Show the frame, including the 'bin' column (pandas abbreviates long frames).
df

Unnamed: 0_level_0,author,subreddit,body,created_utc,rel_marker,marker_category,len,Valence,Arousal,Dominance,Politeness,Formality,bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
cei20h4-0,Ultimate_Cocktease15,randomactsofmakeup,I just have so many accessories idk what to Do...,1388952663,just,emphatic,14,0.488717,0.583994,0.377098,0.388359,0.211197,405
cei20v3-0,Oneireus,mma,"He did really well against amateurs, then the ...",1388952683,really,emphatic,21,0.430257,0.705089,0.554081,0.535587,0.515630,426
cei20v3-1,Oneireus,mma,Uriah Hall was almost fired by Dana for not ha...,1388952683,just,emphatic,26,0.415873,0.621402,0.459339,0.413025,0.537336,406
cei20v3-2,Oneireus,mma,"I think Uriah looked amazing on TUF, but his m...",1388952683,amazing,positive_affect_adjective,27,0.552055,0.656746,0.480590,0.591292,0.572426,666
cei22au-0,n1cotine,snowboarding,"As a soft top wrangler owner, my only option i...",1388952768,like,positive_affect_verbs,21,0.440273,0.579550,0.481438,0.359443,0.304839,405
...,...,...,...,...,...,...,...,...,...,...,...,...,...
cesozr5-0,SlinDev,gamedev,"Sure they work for testing, as do my own but i...",1390057796,just,emphatic,21,0.374378,0.616472,0.559918,0.343059,0.535756,422
cesozr5-1,SlinDev,gamedev,Sure there are a couple of nice and extremely ...,1390057796,nice,positive_affect_adjective,32,0.509609,0.513486,0.467336,0.464866,0.599340,662
cesozr5-2,SlinDev,gamedev,"Actually, I solved my lack of test models for ...",1390057796,want,positive_affect_verbs,29,0.381132,0.570159,0.413610,0.454367,0.629680,406
cesp0f4-0,PurulentExudate,unity3d,Gold isn't really valuable in D2 but it was co...,1390057864,really,emphatic,15,0.477959,0.568833,0.502877,0.463143,0.309913,421
