In [None]:
import json
import os

import pandas as pd
from scipy.stats import spearmanr
from sklearn.decomposition import PCA

from src.features.perspective import parse_summary_scores


In [None]:

# Columns kept from every comment row of the contributions file.
comment_columns = [
    'author', 'created_utc', 'subreddit', 'text',
    'fullname', 'parent_fullname', 'link_fullname',
]

# Stream the JSON-lines file 500 records at a time and keep only the rows
# whose contribution_type is 'comment'; the filtered chunks are concatenated
# into a single frame.
with pd.read_json('../data/interim/labeling_contributions_preprocessed_no_bot.jsonl',
                  orient='records', lines=True, chunksize=500) as reader:
    df = pd.concat(
        chunk.loc[chunk['contribution_type'] == 'comment', comment_columns]
        for chunk in reader
    )

# Perspective scores: one JSON object per line. Only the FIRST key/value pair
# of each object is used; the value is passed through parse_summary_scores.
perspectives = {}
with open('../data/interim/perspective/labeling_contributions_preprocessed_no_bot_perspective.jsonl', encoding='utf8') as f:
    for line in f:
        record = json.loads(line)
        record_key, raw_scores = list(record.items())[0]
        perspectives[record_key] = parse_summary_scores(raw_scores)
# Transpose so that the dict keys become the row index.
perspective_df = pd.DataFrame(perspectives).T


# LIWC output: every JSON line is folded into one dict (later keys overwrite
# earlier duplicates), then transposed so the dict keys become the row index.
liwcs = {}
with open('../data/interim/liwc/labeling_contributions_preprocessed_no_bot_liwc.jsonl', encoding='utf8') as f:
    for line in f:
        liwcs.update(json.loads(line))
liwc_df = pd.DataFrame(liwcs).T
# Divide every column by the row's 'Tokens' value, then drop the denominator.
liwc_df = liwc_df.divide(liwc_df['Tokens'], axis=0).drop(columns='Tokens')

In [3]:
# Social-dimension output: fold every JSON line into one dict (later keys
# overwrite earlier duplicates) and transpose so the keys become the index.
social_dimensions = {}
with open('../data/interim/social_dimensions/labeling_contributions_preprocessed_no_bot_social_dimensions.jsonl', encoding='utf8') as f:
    for line in f:
        social_dimensions.update(json.loads(line))
social_dimensions_df = pd.DataFrame(social_dimensions).T


# Left-join each feature table onto the comments: the comment's 'fullname'
# column is matched against the feature table's index. `df` is rebound after
# every merge, so the previous intermediate can be released immediately.
for feature_df in (perspective_df, social_dimensions_df, liwc_df):
    df = df.merge(feature_df, how='left', left_on='fullname', right_index=True)

# The per-source tables are not needed once merged; drop the references.
del feature_df
del perspective_df, social_dimensions_df, liwc_df

In [31]:
# Feature-column groups of the merged comment frame.
#
# LIWC category columns. 'Tokens' is intentionally NOT listed: it is the
# denominator used to turn LIWC counts into per-token rates and is deleted
# from the frame once the division is done, so it is not a feature column
# (indexing the frame with a list that contains it raises KeyError).
liwc_cols = [
    'Adverb', 'Affect', 'Article', 'Auxverb', 'Cause', 'CogProc', 'Conj',
    'Discrep', 'FocusPresent', 'Function', 'Insight', 'Ipron', 'Posemo',
    'Ppron', 'Pronoun', 'Quant', 'Social', 'Tentat', 'Verb', 'You', 'friend',
    'Achieve', 'Adj', 'Compare', 'Drives', 'I', 'Motion', 'Prep', 'Relativ',
    'Reward', 'Space', 'they', 'FocusFuture', 'FocusPast', 'Interrog',
    'SheHe', 'differ', 'male', 'Anx', 'Assent', 'Bio', 'Certain', 'Health',
    'Informal', 'Money', 'Negate', 'Negemo', 'Netspeak', 'Percept', 'Power',
    'Relig', 'Risk', 'Sad', 'Time', 'feel', 'Anger', 'Body', 'Death',
    'Nonflu', 'See', 'Swear', 'female', 'Affiliation', 'Leisure', 'Sexual',
    'family', 'Ingest', 'Work', 'hear', 'We', 'home', 'Filler',
]

# Social-dimension columns.
social_cols = [
    'conflict', 'fun', 'identity', 'knowledge', 'power',
    'respect', 'romance', 'similarity', 'social_support', 'trust',
]

# Perspective attribute columns.
toxic_cols = [
    'ATTACK_ON_AUTHOR', 'ATTACK_ON_COMMENTER', 'FLIRTATION',
    'IDENTITY_ATTACK', 'INCOHERENT', 'INFLAMMATORY', 'INSULT',
    'LIKELY_TO_REJECT', 'OBSCENE', 'PROFANITY', 'SEVERE_TOXICITY',
    'SEXUALLY_EXPLICIT', 'SPAM', 'THREAT', 'TOXICITY', 'UNSUBSTANTIAL',
]

In [32]:
# Convert LIWC counts to per-token rates.
#
# The loading cell already divides the LIWC table by 'Tokens' and removes that
# column before the merge, so on a fresh kernel `df` has no 'Tokens' column
# and an unconditional division would crash here (and, if it did run, would
# normalise the values a second time). The presence of 'Tokens' is therefore
# used as the marker that the counts are still raw.
if 'Tokens' in df.columns:
    # The denominator itself is excluded from the columns being rescaled.
    rate_cols = [col for col in liwc_cols if col != 'Tokens']
    df[rate_cols] = df[rate_cols].divide(df['Tokens'], axis=0)
    del df['Tokens']

In [15]:
# Fold every JSON line of the dimensions file into one dict (later keys
# overwrite earlier duplicates), then transpose so the keys become the index.
dims = {}
with open('../data/interim/labeler_sub_dimensions.jsonl', encoding='utf8') as f:
    for line in f:
        dims.update(json.loads(line))
dims_df = pd.DataFrame(dims).T


In [None]:
# Fold every JSON line of the conspiracy file into a single dict; a key that
# appears on several lines keeps the value from the last one.
consp = {}
with open('../data/interim/labeler_sub_conspiracy.jsonl', encoding='utf8') as f:
    for line in f:
        consp.update(json.loads(line))


In [None]:
# One row per key of `consp`; the dict keys become the index.
consp_df = pd.DataFrame.from_dict(consp, orient='index')

In [22]:
# Name the single value column; this raises if consp_df has any other number
# of columns.
consp_df.columns = ['conspiracy']

In [24]:
# Index-on-index join with the default (inner) strategy: only keys present in
# both consp_df and dims_df are kept.
user_df = consp_df.merge(dims_df, left_index=True, right_index=True)

In [25]:
# Preview the first rows of the merged per-user table.
user_df.head()

Unnamed: 0,conspiracy,affluence,age,age B,age neutral,edginess,gender,gender B,gender neutral,partisan,partisan B,partisan B neutral,partisan neutral,sociality,time
GallowFroot,0.162281,0.080235,-0.046192,-0.059004,0.493092,0.063367,-0.02441,0.02642,0.431664,0.076811,0.063858,0.416249,0.42044,0.033946,0.124584
lie-twerker,0.334199,-0.078243,-0.052892,-0.01621,0.42309,-0.004301,0.187111,0.211298,0.403782,0.05358,0.105483,0.349728,0.306543,0.138508,0.121835
arduous_raven,0.062335,0.188243,-0.023091,-0.01127,0.521908,0.0364,-0.033094,0.018579,0.441465,0.01457,-0.026937,0.401205,0.440431,0.043603,0.083971
thrawnpop,0.086708,-0.00474,0.022127,0.030912,0.065832,0.017135,0.012276,-0.003803,0.051026,-0.002335,0.010144,0.060803,0.055966,-0.017333,-0.00411
SPACEMANTIMEZ,0.297257,0.162604,0.061887,0.02139,0.519869,0.001225,0.097639,0.090844,0.483203,-0.008656,-0.042643,0.513038,0.491142,-0.039786,0.033664


In [33]:
# Per-comment metadata that must not be averaged. 'author' is deliberately
# absent from this set because it is the grouping key.
non_feature_cols = {
    'created_utc', 'subreddit', 'text',
    'fullname', 'parent_fullname', 'link_fullname',
}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

# Mean of every remaining column per author.
mean_df = df[feature_cols].groupby('author').mean()

In [34]:
# Preview the first rows of the per-author feature means.
mean_df.head()

Unnamed: 0_level_0,ATTACK_ON_AUTHOR,ATTACK_ON_COMMENTER,FLIRTATION,IDENTITY_ATTACK,INCOHERENT,INFLAMMATORY,INSULT,LIKELY_TO_REJECT,OBSCENE,PROFANITY,...,Affiliation,Leisure,Sexual,family,Ingest,Work,hear,We,home,Filler
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
----------------fuck,0.081933,0.696846,0.306964,0.367026,0.686348,0.780436,0.498691,0.704208,0.051485,0.492594,...,0.029851,0.0,0.0,0.0,0.0,0.014925,0.029851,0.029851,0.0,0.0
---------_----_---_,0.329614,0.261148,0.320017,0.043226,0.21424,0.364906,0.519861,0.561594,0.338142,0.243875,...,0.012821,0.0,0.0,0.0,0.0,0.018668,0.0,0.012821,0.0,0.0
--------V--------,0.62092,0.727617,0.203574,0.00555,0.890323,0.639493,0.072701,0.818072,0.127282,0.019964,...,0.028571,0.0,0.0,0.0,0.0,0.0,0.028571,0.028571,0.0,0.0
-------_-----,0.008301,0.160138,0.198003,0.059766,0.085357,0.436962,0.39298,0.996588,0.993693,0.6446,...,0.009346,0.018692,0.009346,0.0,0.0,0.009346,0.0,0.0,0.0,0.0
-----1,0.358992,0.355179,0.178002,0.004532,0.501877,0.650783,0.024124,0.337144,0.010805,0.0132,...,0.023256,0.023256,0.0,0.0,0.0,0.023256,0.023256,0.046512,0.0,0.0


In [35]:
import seaborn as sns

In [38]:
# First rows of mean_df.
# NOTE(review): `dh` is not referenced by any other visible cell; it looks
# like leftover scratch state. Confirm before removing.
dh = mean_df.head()

In [50]:
# Split users into two equal-frequency bins of their 'conspiracy' value;
# labels=False yields the integer bin codes 0 and 1.
conspiracy_bin = pd.qcut(user_df['conspiracy'], 2, labels=False)

# Reshape the per-author means to long format (one row per author/variable
# pair, indexed by author) and attach each author's bin. The left join keeps
# authors that have no entry in user_df; their 'conspiracy' value is NaN.
toplot = (
    mean_df
    .reset_index()
    .melt(id_vars=['author'])
    .set_index('author')
    .merge(conspiracy_bin, how='left', left_index=True, right_index=True)
)

In [None]:
# NOTE(review): `user_characteristics` is defined here but not used by the
# plot below or by any other visible cell.
user_characteristics = [
    'affluence', 'age', 'partisan', 'sociality',
    'time', 'edginess', 'gender', 'conspiracy',
]

# One strip per feature ('variable'), per-author mean on the x axis, points
# coloured by the author's conspiracy bin.
sns.stripplot(data=toplot, x='value', y='variable', hue='conspiracy')