In [1]:
import json
import os

import pandas as pd
from scipy.stats import spearmanr
from sklearn.decomposition import PCA

from src.features.perspective import parse_summary_scores


In [None]:

# Columns retained from the raw contribution records.
comment_columns = ['author', 'created_utc', 'subreddit', 'text',
                   'fullname', 'parent_fullname', 'link_fullname']

# Stream the contributions file in chunks of 500 records and keep only the
# rows whose contribution_type is 'comment'. The generator has to be consumed
# while the reader is still open, hence the concat inside the `with`.
with pd.read_json('../data/interim/labeling_contributions_preprocessed_no_bot.jsonl',
                  orient='records', lines=True, chunksize=500) as reader:
    comment_chunks = (
        chunk.loc[chunk.contribution_type == 'comment', comment_columns]
        for chunk in reader
    )
    df = pd.concat(comment_chunks)

# Perspective scores: only the first (key, value) pair of each JSON line is
# used, and the value is reduced with parse_summary_scores.
perspectives = {}
with open('../data/interim/perspective/labeling_contributions_preprocessed_no_bot_perspective.jsonl', encoding='utf8') as f:
    for line in f:
        record = json.loads(line)
        key, raw_scores = list(record.items())[0]
        perspectives[key] = parse_summary_scores(raw_scores)
# Transpose so that each dict key becomes a row label.
perspective_df = pd.DataFrame(perspectives).T

# LIWC features: every JSON line is a mapping that is merged into one dict.
liwcs = {}
with open('../data/interim/liwc/labeling_contributions_preprocessed_no_bot_liwc.jsonl', encoding='utf8') as f:
    for line in f:
        liwcs.update(json.loads(line))
liwc_df = pd.DataFrame(liwcs).T

# Social-dimension features: same line-by-line merge as for LIWC.
social_dimensions = {}
with open('../data/interim/social_dimensions/labeling_contributions_preprocessed_no_bot_social_dimensions.jsonl', encoding='utf8') as f:
    for line in f:
        social_dimensions.update(json.loads(line))
social_dimensions_df = pd.DataFrame(social_dimensions).T

# Left-join each feature table onto the comments, matching the comment's
# 'fullname' column against the feature table's index.
for feature_frame in (perspective_df, social_dimensions_df, liwc_df):
    df = df.merge(feature_frame, how='left', left_on='fullname', right_index=True)

# Drop the references to the per-source tables; only the merged frame is used
# afterwards. The loop variable is removed too so it does not keep liwc_df alive.
del feature_frame
del perspective_df, social_dimensions_df, liwc_df

In [2]:
# Display the column index of the merged frame (comment fields plus the joined feature columns).
df.columns

Index(['author', 'created_utc', 'subreddit', 'text', 'fullname',
       'parent_fullname', 'link_fullname', 'ATTACK_ON_AUTHOR',
       'ATTACK_ON_COMMENTER', 'FLIRTATION',
       ...
       'Assent', 'family', 'FocusFuture', 'Ingest', 'female', 'feel', 'Death',
       'Money', 'home', 'Filler'],
      dtype='object', length=106)

In [6]:
# feature_groups = [["toxicity","threat","likely_to_reject","insult","attack_on_commenter","attack_on_author",],
# ["social","you","social_support","respect","sexually_explicit","death","flirtation",],
# ["unsubstantial","incoherent","cogproc","knowledge",],
# ["i","negemo","power"],
# ["conflict","differ","relativ","compare",],]
# feature_groups = [[k for k in df.columns if k.lower() in kk] for kk in feature_groups]
# feature_groups

In [34]:
# Feature columns grouped for the per-group PCA. The order matters: the PCA
# cell pairs these groups positionally with its list of output column names.
feature_groups = [
    ['ATTACK_ON_AUTHOR', 'ATTACK_ON_COMMENTER'],
    ['INSULT', 'LIKELY_TO_REJECT', 'THREAT', 'TOXICITY'],
    ['FLIRTATION', 'SEXUALLY_EXPLICIT', 'respect', 'social_support', 'Social', 'You', 'Death'],
    ['INCOHERENT', 'UNSUBSTANTIAL', 'knowledge', 'CogProc'],
    ['power', 'I', 'Negemo'],
    ['conflict', 'Relativ', 'differ', 'Compare'],
]

In [41]:
# Names of the derived one-dimensional features. They are paired positionally
# with `feature_groups`: the i-th name labels the first principal component of
# the i-th group.
column_names = ['attack', 'toxic', 'bonding', 'rational', 'status']

# zip() stops at the shorter sequence, so any trailing feature group without a
# matching name is not processed. Report that explicitly instead of dropping
# the group silently.
unnamed_groups = feature_groups[len(column_names):]
if unnamed_groups:
    print(f'No column name for {len(unnamed_groups)} feature group(s), skipped: {unnamed_groups}')

# NOTE(review): the features are not standardized before PCA. The correlation
# table at the end of the notebook shows `rational` ~ CogProc and `status` ~ I
# (r > 0.999), i.e. the component is dominated by a single column. Confirm
# this is intended.
for fg_name, fg in zip(column_names, feature_groups):
    X = df[fg]

    # Full decomposition, used only to report how much variance the leading
    # components explain for this group.
    pca = PCA()
    pca.fit(X)
    print(pca.explained_variance_ratio_[:3])

    # Project the group onto its first principal component. random_state pins
    # the result in case scikit-learn selects a randomized solver; the (n, 1)
    # output is flattened so that a plain 1-D column is stored.
    pca = PCA(1, random_state=0)
    df[fg_name] = pca.fit_transform(X)[:, 0]


[0.80979289 0.19020711]
[0.76108988 0.2108897  0.01892985]
[0.71799547 0.27003238 0.01048289]
[0.97412193 0.01441718 0.00821468]
[0.94821381 0.04788143 0.00390477]


In [42]:
# fg = feature_groups[0]
# X = df[fg]
# pca = PCA(2)
# Xt = pca.fit_transform(X)
# for col in fg:
#     print(col, spearmanr(X[col].values, Xt[:, 0]))
#     print(col, spearmanr(X[col].values, Xt[:, 1]))
#
# pca.components_

In [48]:
# Write the derived features, indexed by comment fullname, to CSV.
# The directory is derived from the output path so that the folder that gets
# created is the one the file is written into (previously
# '../data/sampling_features' was created while the CSV targets
# '../data/interim/sampling_features').
sampling_features_path = '../data/interim/sampling_features/sampling_features.csv'
os.makedirs(os.path.dirname(sampling_features_path), exist_ok=True)
df.set_index('fullname')[column_names].to_csv(sampling_features_path)

In [50]:
# Flatten the grouped feature names into one list, preserving group order.
all_feats = []
for group in feature_groups:
    all_feats.extend(group)

In [55]:
# Correlate the raw features with the derived components, then show only the
# cross block: one row per derived component, one column per raw feature.
corr_matrix = df[all_feats + column_names].corr()
corr_matrix.loc[column_names, all_feats]

Unnamed: 0,ATTACK_ON_AUTHOR,ATTACK_ON_COMMENTER,INSULT,LIKELY_TO_REJECT,THREAT,TOXICITY,FLIRTATION,SEXUALLY_EXPLICIT,respect,social_support,...,UNSUBSTANTIAL,knowledge,CogProc,power,I,Negemo,conflict,Relativ,differ,Compare
attack,0.759356,0.964423,0.022013,0.094273,-0.066928,-0.034992,-0.177251,-0.163309,0.095079,0.128455,...,0.390445,-0.146526,-0.091091,-0.052911,-0.154517,-0.14497,0.066954,-0.166129,-0.174329,-0.196055
toxic,-0.036776,0.077574,0.856558,0.885659,0.224487,0.892439,0.224501,0.534365,0.004652,0.03046,...,-0.172566,-0.19873,-0.016542,-0.042875,-0.011668,0.155366,0.180895,-0.023665,-0.027739,-0.020619
bonding,0.160219,0.126288,0.059234,0.159719,0.036918,0.055417,0.181332,0.102931,-0.112094,-0.171629,...,-0.346594,0.226923,0.204115,0.084673,0.447732,0.253113,-0.006781,0.276935,0.563794,0.459215
rational,-0.024013,-0.107833,0.000711,-0.029412,0.017186,0.000285,0.131161,0.058749,-0.127713,-0.194583,...,-0.289762,0.179425,0.999992,0.045021,0.218746,0.190036,0.030394,0.206309,0.311327,0.237791
status,-0.041763,-0.181574,-0.051172,0.028478,0.01277,-0.039219,0.201272,0.070449,-0.047638,-0.144366,...,-0.300781,0.172134,0.219727,0.055205,0.999914,0.250943,-0.118007,0.266859,0.585312,0.506034
