In [47]:
import json
import os
import warnings

import pandas as pd
from scipy.stats import spearmanr
from sklearn.decomposition import PCA

from src.features.perspective import parse_summary_scores


In [1]:

# Build `df`: one row per comment, with the per-comment feature scores from the
# perspective, social_dimensions and liwc files attached as extra columns.

# Columns kept from the contributions file.
comment_columns = ['author', 'created_utc', 'subreddit', 'text',
                   'fullname', 'parent_fullname', 'link_fullname']

# The contributions file is read in chunks of 500 records; from every chunk only
# the rows whose contribution_type is 'comment' are kept.
with pd.read_json(
    '../data/interim/labeling_contributions_preprocessed_no_bot.jsonl',
    orient='records', lines=True, chunksize=500,
) as reader:
    comment_chunks = []
    for chunk in reader:
        is_comment = chunk['contribution_type'] == 'comment'
        comment_chunks.append(chunk.loc[is_comment, comment_columns])
    df = pd.concat(comment_chunks)

# Perspective scores: only the first key/value pair of each JSON line is used,
# and the value is reduced with parse_summary_scores before being stored.
with open('../data/interim/perspective/labeling_contributions_preprocessed_no_bot_perspective.jsonl', encoding='utf8') as f:
    perspectives = {}
    for line in f:
        k, v = list(json.loads(line).items())[0]
        perspectives[k] = parse_summary_scores(v)
# Transpose so that the dict keys become the row index.
perspective_df = pd.DataFrame(perspectives).T

# LIWC scores: every JSON line is merged into one dict as-is.
with open('../data/interim/liwc/labeling_contributions_preprocessed_no_bot_liwc.jsonl', encoding='utf8') as f:
    liwcs = {}
    for line in f:
        liwcs.update(json.loads(line))
liwc_df = pd.DataFrame(liwcs).T

# Social-dimension scores: same layout and handling as the LIWC file.
with open('../data/interim/social_dimensions/labeling_contributions_preprocessed_no_bot_social_dimensions.jsonl', encoding='utf8') as f:
    social_dimensions = {}
    for line in f:
        social_dimensions.update(json.loads(line))
social_dimensions_df = pd.DataFrame(social_dimensions).T

# Left-join each feature frame on the comment's fullname (the feature frames are
# indexed by it); comments without an entry keep their row with missing values.
merge_on_fullname = dict(how='left', left_on='fullname', right_index=True)
df = (
    df.merge(perspective_df, **merge_on_fullname)
      .merge(social_dimensions_df, **merge_on_fullname)
      .merge(liwc_df, **merge_on_fullname)
)

# The standalone feature frames are no longer needed once merged.
del perspective_df, social_dimensions_df, liwc_df

In [2]:
# Show the columns of the merged frame: the comment columns first, followed by
# the feature columns added by the three merges.
df.columns

Index(['author', 'created_utc', 'subreddit', 'text', 'fullname',
       'parent_fullname', 'link_fullname', 'ATTACK_ON_AUTHOR',
       'ATTACK_ON_COMMENTER', 'FLIRTATION',
       ...
       'Assent', 'family', 'FocusFuture', 'Ingest', 'female', 'feel', 'Death',
       'Money', 'home', 'Filler'],
      dtype='object', length=106)

In [6]:
# feature_groups = [["toxicity","threat","likely_to_reject","insult","attack_on_commenter","attack_on_author",],
# ["social","you","social_support","respect","sexually_explicit","death","flirtation",],
# ["unsubstantial","incoherent","cogproc","knowledge",],
# ["i","negemo","power"],
# ["conflict","differ","relativ","compare",],]
# feature_groups = [[k for k in df.columns if k.lower() in kk] for kk in feature_groups]
# feature_groups

In [34]:
# Feature columns of `df`, grouped for the per-group PCA in the cell below.
# Groups are paired positionally with `column_names` in that cell.
# NOTE(review): there are six groups here but the PCA cell lists only five
# names, so the last group is not turned into a component - confirm intent.
feature_groups = [
    ['ATTACK_ON_AUTHOR', 'ATTACK_ON_COMMENTER'],
    ['INSULT', 'LIKELY_TO_REJECT', 'THREAT', 'TOXICITY'],
    ['FLIRTATION', 'SEXUALLY_EXPLICIT', 'respect', 'social_support',
     'Social', 'You', 'Death'],
    ['INCOHERENT', 'UNSUBSTANTIAL', 'knowledge', 'CogProc'],
    ['power', 'I', 'Negemo'],
    ['conflict', 'Relativ', 'differ', 'Compare'],
]

In [41]:
# Reduce every feature group to a single column of `df`: the score of each row
# on the group's first principal component. Also prints the explained-variance
# ratio of (up to) the first three components of every group.
column_names = ['attack', 'toxic', 'bonding', 'rational', 'status']

# zip() stops at the shorter sequence, so a feature group without a matching
# name (or a name without a group) would be skipped silently. Surface that.
# NOTE(review): feature_groups has one more entry than column_names, so its
# last group gets no component - confirm whether it needs its own name.
if len(feature_groups) != len(column_names):
    n_paired = min(len(feature_groups), len(column_names))
    warnings.warn(
        f'{len(feature_groups)} feature groups but {len(column_names)} column names; '
        f'unpaired groups: {feature_groups[n_paired:]}, '
        f'unpaired names: {column_names[n_paired:]}'
    )

for fg_name, fg in zip(column_names, feature_groups):
    X = df[fg]
    # One full fit provides both the variance ratios and the projection, so a
    # separate single-component fit is not needed.
    pca = PCA()
    scores = pca.fit_transform(X)
    print(pca.explained_variance_ratio_[:3])
    # Column 0 of the projection is the score on the first component; storing
    # it as a 1-D array avoids assigning an (n, 1) array to a single column.
    df[fg_name] = scores[:, 0]


[0.80979289 0.19020711]
[0.76108988 0.2108897  0.01892985]
[0.71799547 0.27003238 0.01048289]
[0.97412193 0.01441718 0.00821468]
[0.94821381 0.04788143 0.00390477]


In [42]:
# fg = feature_groups[0]
# X = df[fg]
# pca = PCA(2)
# Xt = pca.fit_transform(X)
# for col in fg:
#     print(col, spearmanr(X[col].values, Xt[:, 0]))
#     print(col, spearmanr(X[col].values, Xt[:, 1]))
#
# pca.components_

In [48]:
# Write the per-group component columns to CSV, indexed by the comment fullname.
os.makedirs('../data/sampling_features', exist_ok=True)  # no error if it already exists
sampling_features = df.set_index('fullname')[column_names]
sampling_features.to_csv('../data/sampling_features/sampling_features.csv')