<a href="https://colab.research.google.com/github/ivavuko/ivasphd/blob/main/Amplified_and_attenuated_dictionary_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import requests
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import io
import spacy
from sklearn import metrics
from sklearn.pipeline import Pipeline
from statsmodels.stats.contingency_tables import mcnemar
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import stats

In [2]:
dictsignals = pd.read_csv("python_signals_dictionary - condensed.csv", encoding="utf-8")
dictsignals.head()

FileNotFoundError: [Errno 2] No such file or directory: 'python_signals_dictionary - condensed.csv'

In [None]:
signals = dictsignals["signals"].tolist()

In [None]:
essays = pd.read_csv("CROSSDA_data_Iva_only_text.xlsx - data_recoded.csv", index_col= "ID", encoding="utf-8")

In [None]:
# converting wildcard signals to regex patterns because of the wildcards from LIWC
# Map every dictionary signal to a whole-word regex. A leading and/or trailing
# "*" (LIWC-style wildcard) becomes r"\w*" on that side; the rest of the signal
# is escaped so it is matched literally.
regex_patterns = {}
for phrase in signals:
    wild_start = phrase.startswith("*")
    wild_end = phrase.endswith("*")

    # strip the wildcard markers to get the literal part of the signal
    if wild_start and wild_end:
        literal = phrase[1:-1]
    elif wild_start:
        literal = phrase[1:]
    elif wild_end:
        literal = phrase[:-1]
    else:
        literal = phrase

    head = r"\w*" if wild_start else ""
    tail = r"\w*" if wild_end else ""
    regex_patterns[phrase] = rf"\b{head}{re.escape(literal)}{tail}\b"

In [None]:
# sorting instances by length (longest first) to ensure priority of longer signals
sorted_instances = sorted(regex_patterns.keys(), key=len, reverse=True)

In [None]:
# function for counting the numbers of occurrences of signals in texts
def process_column(df, column_name):
    """Count non-overlapping occurrences of every dictionary signal in one text column.

    Signals are tried in the order given by the module-level ``sorted_instances``
    using the patterns in ``regex_patterns`` (case-insensitive). A match is only
    counted if none of its character positions were already claimed by an
    earlier match in the same text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    column_name : str
        Name of the text column to scan.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (index named "Index", holding the original
        index values) and one count column per signal.
    """
    records = []

    for idx, raw_text in df[column_name].items():
        # missing texts are scanned as an empty string, so every count is 0
        text = "" if pd.isna(raw_text) else raw_text

        record = {"Index": idx}
        claimed = set()  # character positions already used by a counted match

        for phrase in sorted_instances:
            hits = 0
            for found in re.finditer(regex_patterns[phrase], text, flags=re.IGNORECASE):
                span = range(*found.span())
                # count the match only if it does not overlap an earlier one
                if claimed.isdisjoint(span):
                    hits += 1
                    claimed.update(span)
            record[phrase] = hits

        records.append(record)

    # build the frame once and restore the original index values
    return pd.DataFrame(records).set_index("Index")

In [None]:
# process each column separately
pos_char_signals = process_column(essays, "Personal characteristics POSITIVE")
neg_char_signals = process_column(essays, "Personal characteristics NEGATIVE")
pos_healthcare_signals = process_column(essays, "Universal healthcare POSITIVE")
neg_healthcare_signals = process_column(essays, "Universal healthcare NEGATIVE")

In [None]:
pos_char_signals["ID"] = pos_char_signals.index

Dictionary features based on counts per signal

In [None]:
# Signal meta-data: the "signals" and "subgroup" columns are used later to
# aggregate the per-signal counts.
# NOTE(review): this file is decoded as latin-1, whereas the condensed signal
# list is read as utf-8 — confirm the meta-data CSV really is latin-1. If the
# decodings disagree, signals containing non-ASCII characters will not match
# the count columns by name and will silently drop out of the aggregation.
dictmeta = pd.read_csv("python_signals_dictionary - meta-data.csv", encoding="latin-1")
dictmeta.head()

In [None]:
# a function that takes into account the metadata of signals and aggregates signals into the counts of attenuated and amplified language and their subgroups

def aggregate_ngrams_by_subgroup_and_group(ngram_df, dictmeta):
    """Roll per-signal counts up into subgroup totals and amplified/attenuated totals.

    Parameters
    ----------
    ngram_df : pandas.DataFrame
        One row per text, one column per signal count. Columns that are not
        listed in ``dictmeta["signals"]`` are ignored.
    dictmeta : pandas.DataFrame
        Must contain the columns "signals" and "subgroup".

    Returns
    -------
    pandas.DataFrame
        Indexed like ``ngram_df``; one column per subgroup found in ``dictmeta``
        plus "amplified" and "attenuated", with columns sorted by label.
    """
    amplified_parts = ("generalization", "intensifier", "certainty")
    attenuated_parts = ("specification", "detensifier", "uncertainty")

    # keep only the count columns that the meta-data knows about
    known_signals = set(dictmeta["signals"])
    counts = ngram_df[[name for name in ngram_df.columns if name in known_signals]]

    # one total per subgroup; a subgroup without any counted signal is all zeros
    subgroup_totals = pd.DataFrame(index=counts.index)
    for label in dictmeta["subgroup"].unique():
        listed = dictmeta.loc[dictmeta["subgroup"] == label, "signals"]
        present = [sig for sig in listed if sig in counts.columns]
        subgroup_totals[label] = counts[present].sum(axis=1) if present else 0

    # the two top-level groups; a subgroup missing from the meta-data contributes 0
    group_totals = pd.DataFrame(index=subgroup_totals.index)
    group_totals["amplified"] = sum(subgroup_totals.get(part, 0) for part in amplified_parts)
    group_totals["attenuated"] = sum(subgroup_totals.get(part, 0) for part in attenuated_parts)

    # subgroup and group columns side by side, ordered by column label
    return pd.concat([subgroup_totals, group_totals], axis=1).sort_index(axis=1)

In [None]:
# a function that normalizes the counts into percentages relative to the word count of that person's text

def normalize_by_column(df, denominator_col="WC"):
    """Express every column except the denominator as a percentage of the denominator.

    Parameters
    ----------
    df : pandas.DataFrame
        Counts plus the denominator column.
    denominator_col : str, default "WC"
        Column to divide by (row-wise).

    Returns
    -------
    pandas.DataFrame
        ``count / denominator * 100`` for every other column, in the sorted
        column order produced by ``Index.difference``. Infinite and NaN
        results (e.g. from a zero denominator) are returned as 0.
    """
    # every column but the denominator (Index.difference also sorts the labels)
    numerator_cols = df.columns.difference([denominator_col])

    ratios = df[numerator_cols].div(df[denominator_col], axis=0)
    as_percent = ratios * 100

    # x/0 gives +/-inf and 0/0 gives NaN; both are reported as 0
    infinite = [float("inf"), -float("inf")]
    return as_percent.replace(infinite, 0).fillna(0)



In [None]:
# Word count (whitespace-separated tokens) of each essay column; cells that
# are not strings (e.g. missing answers) get a word count of 0.
wc_source_columns = {
    "WC_pos_char": "Personal characteristics POSITIVE",
    "WC_neg_char": "Personal characteristics NEGATIVE",
    "WC_health_pos": "Universal healthcare POSITIVE",
    "WC_health_neg": "Universal healthcare NEGATIVE",
}
for wc_name, essay_col in wc_source_columns.items():
    essays[wc_name] = essays[essay_col].apply(lambda n: len(n.split()) if isinstance(n, str) else 0)

In [None]:
pos_char_dict_counts = aggregate_ngrams_by_subgroup_and_group(pos_char_signals, dictmeta)
pos_char_dict_counts = pos_char_dict_counts.join(essays["WC_pos_char"])

## Custom features (the complicated ones that can't be simply counted)



In [None]:
df_good_text = essays.copy()

*Exclamation marks*

In [None]:
def count_exclamation_mark(text):
    """Count runs of one or more consecutive exclamation marks in ``text``.

    ``text`` is passed through ``str()`` first, so non-string values are
    accepted. A run such as "!!!" counts once.
    """
    runs = re.finditer(r'\!{1,}', str(text))
    return sum(1 for _ in runs)

In [None]:
df_good_text["exclamation_count_neg_char"] = df_good_text["Personal characteristics NEGATIVE"].apply(count_exclamation_mark)
df_good_text["exclamation_count_healthcare_pos"] = df_good_text["Universal healthcare POSITIVE"].apply(count_exclamation_mark)
df_good_text["exclamation_count_healthcare_neg"] = df_good_text["Universal healthcare NEGATIVE"].apply(count_exclamation_mark)

In [None]:
df_good_text["exclamation_count_pos_char"] = df_good_text["Personal characteristics POSITIVE"].apply(count_exclamation_mark)

# *...*

In [None]:
def count_three_dots(text):
    """Count runs of three or more consecutive full stops in ``text``.

    ``text`` is passed through ``str()`` first, so non-string values are
    accepted. A longer run such as "....." still counts once.
    """
    runs = re.finditer(r'\.{3,}', str(text))
    return sum(1 for _ in runs)

In [None]:
df_good_text["three_dots_count_pos_char"] = df_good_text["Personal characteristics POSITIVE"].apply(count_three_dots)

In [None]:
df_good_text['three_dots_count_pos_char'].sum()

In [None]:
df_good_text['three_dots_count_neg_char'] = df_good_text['Personal characteristics NEGATIVE'].apply(count_three_dots)
df_good_text['three_dots_count_healthcare_pos'] = df_good_text['Universal healthcare POSITIVE'].apply(count_three_dots)
df_good_text['three_dots_count_healthcare_neg'] = df_good_text['Universal healthcare NEGATIVE'].apply(count_three_dots)

*Superlatives*

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
def count_superlatives(text):
    """Count tokens that the module-level spaCy pipeline ``nlp`` tags as JJS or RBS.

    Missing values (anything ``pd.isna`` reports as missing) yield 0.
    """
    if pd.isna(text):
        return 0
    superlative_tags = ("JJS", "RBS")
    return len([tok for tok in nlp(text) if tok.tag_ in superlative_tags])

In [None]:
for col in ["Personal characteristics POSITIVE", "Personal characteristics NEGATIVE", "Universal healthcare POSITIVE", "Universal healthcare NEGATIVE"]:
    df_good_text[f'{col}_superlatives'] = df_good_text[col].apply(count_superlatives)

*CAPSed words*

In [None]:
# exclude texts where everything is in caps, and skip words in caps that are likely abbreviations
# if two consecutive words are in caps, like "EXTREMELY GOOD coffee", count them as one occurrence

In [None]:
import re

In [None]:
# function that counts CAPSed words while excluding potential abbreviations (i.e., words with 4 or fewer letters)

def count_capslock(text, max_abbrev_len=4):
    """Count runs of emphatic ALL-CAPS words in ``text``.

    A run of consecutive upper-case words counts once ("EXTREMELY HAPPY" -> 1).
    Upper-case words with at most ``max_abbrev_len`` letters are treated as
    potential abbreviations ("NASA", "I", "OK"): they never start a run, but
    they no longer break one either, so "EXTREMELY GOOD PERSON" is counted
    once rather than twice.

    Parameters
    ----------
    text : object
        The text to scan. Anything that is not a non-blank string yields 0.
    max_abbrev_len : int, default 4
        Upper-case words with this many letters or fewer are treated as
        abbreviations.

    Returns
    -------
    int
        Number of CAPS runs; 0 if the whole text is upper case.
    """
    if not isinstance(text, str) or not text.strip():
        return 0

    # a text written entirely in CAPS carries no word-level emphasis
    if text.isupper():
        return 0

    count = 0
    in_caps_run = False

    for word in text.split():
        if not word.isupper():
            # any word that is not upper case ends the current run
            in_caps_run = False
            continue

        # count letters only, so punctuation/digits ("WOW!!!") do not inflate the
        # length; str.isalpha also handles non-ASCII capitals
        n_letters = sum(ch.isalpha() for ch in word)
        if n_letters <= max_abbrev_len:
            # potential abbreviation: neither starts nor interrupts a run
            continue

        if not in_caps_run:
            count += 1
            in_caps_run = True

    return count



In [None]:
for col in ["Personal characteristics POSITIVE", "Personal characteristics NEGATIVE", "Universal healthcare POSITIVE", "Universal healthcare NEGATIVE"]:
    df_good_text[f'{col}_caps'] = df_good_text[col].apply(count_capslock)

*POS tagging*

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import sent_tokenize, word_tokenize, pos_tag

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
def get_pos_tags(text):
    """Sentence-split, tokenize and POS-tag ``text`` with NLTK.

    Returns a list with one entry per sentence, each entry being the
    ``pos_tag`` output for that sentence's tokens. Missing values (as reported
    by ``pd.isna``) return ``None``.
    """
    if pd.isna(text):
        return None
    return [pos_tag(word_tokenize(sentence)) for sentence in sent_tokenize(text)]

In [None]:
df_good_text["POS_tags_pos_char"] = df_good_text["Personal characteristics POSITIVE"].apply(get_pos_tags)

In [None]:
df_good_text["POS_tags_neg_char"] = df_good_text["Personal characteristics NEGATIVE"].apply(get_pos_tags)

In [None]:
df_good_text["POS_tags_health_pos"] = df_good_text["Universal healthcare POSITIVE"].apply(get_pos_tags)

In [None]:
df_good_text["POS_tags_health_neg"] = df_good_text["Universal healthcare NEGATIVE"].apply(get_pos_tags)

*SAME ADJ and SAME ADJ*

In [None]:
# search for an adjective repeated around "and": ("word", JJ*), ("and", any tag), (same "word", JJ*)

In [None]:
def count_repeated_adj(tags):
    """Count "<adjective> and <same adjective>" trigrams in tagged sentences.

    Parameters
    ----------
    tags : list of sentences, each a list of ``(word, tag)`` pairs, or ``None``.

    Returns
    -------
    int
        Number of three-token windows whose first and last tags start with
        "JJ", whose first and last words are equal ignoring case, and whose
        middle word is "and" (ignoring case). ``None`` yields 0.
    """
    if tags is None:
        return 0

    hits = 0
    for sentence in tags:
        # slide a three-token window over the sentence
        windows = zip(sentence, sentence[1:], sentence[2:])
        for (first, first_tag), (middle, _), (last, last_tag) in windows:
            if not (first_tag.startswith("JJ") and last_tag.startswith("JJ")):
                continue
            if first.lower() == last.lower() and middle.lower() == "and":
                hits += 1
    return hits

In [None]:
df_good_text["pos_char_repeated_adj"] = df_good_text["POS_tags_pos_char"].apply(count_repeated_adj)

In [None]:
df_good_text["neg_char_repeated_adj"] = df_good_text["POS_tags_neg_char"].apply(count_repeated_adj)
df_good_text["health_pos_repeated_adj"] = df_good_text["POS_tags_health_pos"].apply(count_repeated_adj)
df_good_text["health_neg_repeated_adj"] = df_good_text["POS_tags_health_neg"].apply(count_repeated_adj)

*at ADJ times*

In [None]:
#search for ("at", 'IN'), ("anyword", 'ADJ'), ("times", 'NOUN')

In [None]:
def count_at_adj_times(tags):
    """Count "at <adjective> times" trigrams (e.g. "at certain times").

    Parameters
    ----------
    tags : list of sentences, each a list of ``(word, tag)`` pairs, or ``None``.
        The tags are expected to be Penn Treebank tags, which is what NLTK's
        ``pos_tag`` returns when no ``tagset`` is given.

    Returns
    -------
    int
        Number of three-token windows made of "at" tagged IN, any word tagged
        JJ, and "times" tagged as a noun (NN*). ``None`` yields 0.
    """
    if tags is None:
        return 0

    count = 0
    for sent in tags:
        for (w1, p1), (_, p2), (w3, p3) in zip(sent, sent[1:], sent[2:]):
            if (
                w1.lower() == "at"
                and p1 == "IN"
                and p2 == "JJ"
                and w3.lower() == "times"
                # Penn Treebank has no "NOUN" tag ("times" is NNS), so a
                # comparison against "NOUN" can never match alongside IN/JJ
                and p3.startswith("NN")
            ):
                count += 1
    return count

In [None]:
df_good_text["pos_char_at_adj_times"] = df_good_text["POS_tags_pos_char"].apply(count_at_adj_times)

In [None]:
df_good_text["neg_char_at_adj_times"] = df_good_text["POS_tags_neg_char"].apply(count_at_adj_times)
df_good_text["health_pos_at_adj_times"] = df_good_text["POS_tags_health_pos"].apply(count_at_adj_times)
df_good_text["health_neg_at_adj_times"] = df_good_text["POS_tags_health_neg"].apply(count_at_adj_times)

In [None]:
df_good_text.to_csv("df_good_text.csv")

In [None]:
#rename
df_good_text = df_good_text.rename(columns={"Personal characteristics POSITIVE_superlatives" : "superlatives_pos_char",
                            "Personal characteristics NEGATIVE_superlatives" : "superlatives_neg_char",
                             "Universal healthcare POSITIVE_superlatives" : "superlatives_healthcare_pos",
                             "Universal healthcare NEGATIVE_superlatives": "superlatives_healthcare_neg",
                                            "Personal characteristics POSITIVE_caps" : "caps_pos_char",
                                            "Personal characteristics NEGATIVE_caps" : "caps_neg_char",
                                           "Universal healthcare POSITIVE_caps" : "caps_health_pos",
                                             "Universal healthcare NEGATIVE_caps" : "caps_health_neg"})


In [None]:
#add those features to the dict (sum them in the total count)

# add exclamation_count, superlatives, CAPSed words, SAME ADJ and SAME ADJ to intensifiers and amplified
# add three_dots_count to detensifiers and attenuated
# add at ADJ times to specification and attenuated

In [None]:
new_pos_char = df_good_text[["exclamation_count_pos_char", "three_dots_count_pos_char", "superlatives_pos_char", "caps_pos_char", "pos_char_at_adj_times", "pos_char_repeated_adj"]]
new_neg_char = df_good_text[["exclamation_count_neg_char", "three_dots_count_neg_char", "superlatives_neg_char", "caps_neg_char", "neg_char_at_adj_times", "neg_char_repeated_adj"]]
new_healthcare_pos = df_good_text[["exclamation_count_healthcare_pos", "three_dots_count_healthcare_pos", "superlatives_healthcare_pos", "caps_health_pos", "health_pos_at_adj_times", "health_pos_repeated_adj"]]
new_healthcare_neg = df_good_text[["exclamation_count_healthcare_neg", "three_dots_count_healthcare_neg", "superlatives_healthcare_neg", "caps_health_neg", "health_neg_at_adj_times", "health_neg_repeated_adj"]]

In [None]:
pos_char_all_features = pd.concat([new_pos_char, pos_char_dict_counts], axis=1, join="inner")

In [None]:
# Fold the custom (non-dictionary) counts for the positive-characteristics essay
# into the dictionary counts. Each custom count is added to its subgroup AND to
# the parent group, so the group total keeps including its subgroups.

# exclamation-mark runs, superlatives, CAPSed words and repeated adjectives -> intensifier and amplified
pos_char_all_features["intensifier"] = pos_char_all_features["intensifier"] + pos_char_all_features["exclamation_count_pos_char"] + pos_char_all_features["superlatives_pos_char"] + pos_char_all_features["caps_pos_char"] + pos_char_all_features["pos_char_repeated_adj"]
pos_char_all_features["amplified"] = pos_char_all_features["amplified"] + pos_char_all_features["exclamation_count_pos_char"] + pos_char_all_features["superlatives_pos_char"] + pos_char_all_features["caps_pos_char"] + pos_char_all_features["pos_char_repeated_adj"]

# three-dots count -> detensifier and attenuated
pos_char_all_features["detensifier"] = pos_char_all_features["detensifier"] + pos_char_all_features["three_dots_count_pos_char"]
pos_char_all_features["attenuated"] = pos_char_all_features["attenuated"] + pos_char_all_features["three_dots_count_pos_char"]

# "at ADJ times" count -> specification and attenuated
pos_char_all_features["specification"] = pos_char_all_features["specification"] + pos_char_all_features["pos_char_at_adj_times"]
pos_char_all_features["attenuated"] = pos_char_all_features["attenuated"] + pos_char_all_features["pos_char_at_adj_times"]

In [None]:
pos_char_all_features = pos_char_all_features.drop(columns=["exclamation_count_pos_char", "three_dots_count_pos_char", "superlatives_pos_char", "caps_pos_char", "pos_char_repeated_adj", "pos_char_at_adj_times"])

neg char dictionary

In [None]:
neg_char_dict_counts = aggregate_ngrams_by_subgroup_and_group(neg_char_signals, dictmeta)
neg_char_dict_counts = neg_char_dict_counts.join(essays["WC_neg_char"])

In [None]:
neg_char_all_features = pd.concat([new_neg_char, neg_char_dict_counts], axis=1, join="inner")

In [None]:
# Fold the custom (non-dictionary) counts for the negative-characteristics essay
# into the dictionary counts; each is added to its subgroup and to the parent group.

# exclamation-mark runs, superlatives, CAPSed words and repeated adjectives -> intensifier and amplified
neg_char_all_features["intensifier"] = neg_char_all_features["intensifier"] + neg_char_all_features["exclamation_count_neg_char"] + neg_char_all_features["superlatives_neg_char"] + neg_char_all_features["caps_neg_char"] + neg_char_all_features["neg_char_repeated_adj"]
neg_char_all_features["amplified"] = neg_char_all_features["amplified"] + neg_char_all_features["exclamation_count_neg_char"] + neg_char_all_features["superlatives_neg_char"]  + neg_char_all_features["caps_neg_char"] + neg_char_all_features["neg_char_repeated_adj"]

# three-dots count -> detensifier and attenuated
neg_char_all_features["detensifier"] = neg_char_all_features["detensifier"] + neg_char_all_features["three_dots_count_neg_char"]
neg_char_all_features["attenuated"] = neg_char_all_features["attenuated"] + neg_char_all_features["three_dots_count_neg_char"]

# "at ADJ times" count -> specification and attenuated
neg_char_all_features["specification"] = neg_char_all_features["specification"] + neg_char_all_features["neg_char_at_adj_times"]
neg_char_all_features["attenuated"] = neg_char_all_features["attenuated"] + neg_char_all_features["neg_char_at_adj_times"]

In [None]:
neg_char_all_features = neg_char_all_features.drop(columns=["exclamation_count_neg_char", "three_dots_count_neg_char", "superlatives_neg_char", "caps_neg_char", "neg_char_repeated_adj", "neg_char_at_adj_times"])

universal healthcare positive

In [None]:
pos_healthcare_dict_counts = aggregate_ngrams_by_subgroup_and_group(pos_healthcare_signals, dictmeta)
pos_healthcare_dict_counts = pos_healthcare_dict_counts.join(essays["WC_health_pos"])

In [None]:
health_pos_all_features = pd.concat([new_healthcare_pos, pos_healthcare_dict_counts], axis=1, join="inner")

In [None]:
# Fold the custom (non-dictionary) counts for the positive-healthcare essay
# into the dictionary counts; each is added to its subgroup and to the parent group.

# exclamation-mark runs, superlatives, CAPSed words and repeated adjectives -> intensifier and amplified
health_pos_all_features["intensifier"] = health_pos_all_features["intensifier"] + health_pos_all_features["exclamation_count_healthcare_pos"] + health_pos_all_features["superlatives_healthcare_pos"] + health_pos_all_features["caps_health_pos"] + health_pos_all_features["health_pos_repeated_adj"]
health_pos_all_features["amplified"] = health_pos_all_features["amplified"] + health_pos_all_features["exclamation_count_healthcare_pos"] + health_pos_all_features["superlatives_healthcare_pos"] + health_pos_all_features["caps_health_pos"] + health_pos_all_features["health_pos_repeated_adj"]

# three-dots count -> detensifier and attenuated
health_pos_all_features["detensifier"] = health_pos_all_features["detensifier"] + health_pos_all_features["three_dots_count_healthcare_pos"]
health_pos_all_features["attenuated"] = health_pos_all_features["attenuated"] + health_pos_all_features["three_dots_count_healthcare_pos"]

# "at ADJ times" count -> specification and attenuated
health_pos_all_features["specification"] = health_pos_all_features["specification"] + health_pos_all_features["health_pos_at_adj_times"]
health_pos_all_features["attenuated"] = health_pos_all_features["attenuated"] + health_pos_all_features["health_pos_at_adj_times"]

In [None]:
health_pos_all_features = health_pos_all_features.drop(columns=["exclamation_count_healthcare_pos", "three_dots_count_healthcare_pos", "superlatives_healthcare_pos", "caps_health_pos", "health_pos_repeated_adj", "health_pos_at_adj_times"])

universal healthcare negative

In [None]:
neg_healthcare_dict_counts = aggregate_ngrams_by_subgroup_and_group(neg_healthcare_signals, dictmeta)
neg_healthcare_dict_counts = neg_healthcare_dict_counts.join(essays["WC_health_neg"])

In [None]:
health_neg_all_features = pd.concat([new_healthcare_neg, neg_healthcare_dict_counts], axis=1, join="inner")

In [None]:
# Fold the custom (non-dictionary) counts for the negative-healthcare essay
# into the dictionary counts; each is added to its subgroup and to the parent group.

# exclamation-mark runs, superlatives, CAPSed words and repeated adjectives -> intensifier and amplified
health_neg_all_features["intensifier"] = health_neg_all_features["intensifier"] + health_neg_all_features["exclamation_count_healthcare_neg"] + health_neg_all_features["superlatives_healthcare_neg"] + health_neg_all_features["caps_health_neg"] + health_neg_all_features["health_neg_repeated_adj"]
health_neg_all_features['amplified'] = health_neg_all_features["amplified"] + health_neg_all_features["exclamation_count_healthcare_neg"] + health_neg_all_features["superlatives_healthcare_neg"] + health_neg_all_features["caps_health_neg"] + health_neg_all_features["health_neg_repeated_adj"]

# three-dots count -> detensifier and attenuated
health_neg_all_features["detensifier"] = health_neg_all_features["detensifier"] + health_neg_all_features["three_dots_count_healthcare_neg"]
health_neg_all_features["attenuated"] = health_neg_all_features["attenuated"] + health_neg_all_features["three_dots_count_healthcare_neg"]

# "at ADJ times" count -> specification and attenuated
health_neg_all_features["specification"] = health_neg_all_features["specification"] + health_neg_all_features["health_neg_at_adj_times"]
health_neg_all_features["attenuated"] = health_neg_all_features["attenuated"] + health_neg_all_features["health_neg_at_adj_times"]

In [None]:
health_neg_all_features = health_neg_all_features.drop(columns=["exclamation_count_healthcare_neg", "three_dots_count_healthcare_neg", "superlatives_healthcare_neg", "caps_health_neg", "health_neg_repeated_adj", "health_neg_at_adj_times"])

Normalized dictionary features

In [None]:
pos_char_dict = normalize_by_column(pos_char_all_features, denominator_col="WC_pos_char")

In [None]:
neg_char_dict = normalize_by_column(neg_char_all_features, denominator_col="WC_neg_char")

In [None]:
pos_healthcare_dict = normalize_by_column(health_pos_all_features, denominator_col="WC_health_pos")

In [None]:
neg_healthcare_dict = normalize_by_column(health_neg_all_features, denominator_col="WC_health_neg")

Rename signals

In [None]:
pos_char_signals = pos_char_signals.drop(columns = ["ID"])

In [None]:
pos_char_signals = pos_char_signals.add_suffix("_pos_char")
neg_char_signals = neg_char_signals.add_suffix("_neg_char")

In [None]:
pos_healthcare_signals = pos_healthcare_signals.add_suffix("_pos_healthcare")
neg_healthcare_signals = neg_healthcare_signals.add_suffix("_neg_healthcare")

In [None]:
df_signals = pd.concat([pos_char_signals, neg_char_signals, pos_healthcare_signals, neg_healthcare_signals], axis=1, join="inner")

rename dictionary

In [None]:
pos_char_dict = pos_char_dict.add_suffix("_dict_pos_char")
neg_char_dict = neg_char_dict.add_suffix("_dict_neg_char")
pos_healthcare_dict = pos_healthcare_dict.add_suffix("_dict_pos_healthcare")
neg_healthcare_dict = neg_healthcare_dict.add_suffix("_dict_neg_healthcare")

In [None]:
df_dict = pd.concat([pos_char_dict, neg_char_dict, pos_healthcare_dict, neg_healthcare_dict], axis=1, join="inner")

In [None]:
df_dict.head()

Merge all text features

In [None]:
df_lang_features = pd.concat([df_dict, df_signals], axis=1)

Merge with the survey dataframe and keep only good-quality data

In [None]:
survey = pd.read_excel("CROSSDA_data_Iva.xlsx", index_col= "ID", engine="openpyxl")

In [None]:
df_all = pd.concat([survey, df_lang_features], axis=1)

In [None]:
# Keep only the rows that pass every quality filter:
#   - "IPIP120_120.r" is not missing,
#   - "attention_check" is PASS or MIXED,
#   - "num_answers" equals 4,
#   - "ivas_verdict_REFRESHIT" is none of the listed flag values.
# .copy() makes df_good an independent frame: the following cells add and
# overwrite columns on it, which on a bare boolean-mask selection of df_all
# triggers pandas' SettingWithCopyWarning.
df_good = df_all[
    df_all["IPIP120_120.r"].notna() &
    df_all["attention_check"].isin(["PASS", "MIXED"]) &
    (df_all["num_answers"] == 4) &
    ~df_all["ivas_verdict_REFRESHIT"].isin(["gpt", "gpt pitaj", "gpt/ full", "maybe", "yes"])
].copy()

In [None]:
df_good[["attenuated_dict_pos_char", "attenuated_dict_neg_char", "amplified_dict_pos_char", "amplified_dict_neg_char"]].describe()

In [None]:
df_good[["neg_in_pos_num", "pos_in_neg_num"]] = df_good[["neg_in_pos_num", "pos_in_neg_num"]].fillna(0)


Controlling for general use: Target - avg of controls

In [None]:
df_good[["attenuated_dict_pos_healthcare", "attenuated_dict_neg_healthcare", "amplified_dict_pos_healthcare", "amplified_dict_neg_healthcare"]].describe()

In [None]:
df_good["attenuated_dict_healthcare_avg"] = (df_good["attenuated_dict_pos_healthcare"] + df_good["attenuated_dict_neg_healthcare"]) / 2

In [None]:
df_good["amplified_dict_healthcare_avg"] = (df_good["amplified_dict_pos_healthcare"] + df_good["amplified_dict_neg_healthcare"]) / 2

In [None]:
# Per-subgroup healthcare baseline: the mean of the positive and the negative
# healthcare essay scores (columns are created in the listed order).
for subgroup_name in ["generalization", "certainty", "intensifier", "specification", "detensifier", "uncertainty"]:
    pos_scores = df_good[f"{subgroup_name}_dict_pos_healthcare"]
    neg_scores = df_good[f"{subgroup_name}_dict_neg_healthcare"]
    df_good[f"{subgroup_name}_dict_healthcare_avg"] = (pos_scores + neg_scores) / 2

In [None]:
# Controlled group scores: characteristics-essay score minus the healthcare
# baseline of the same group (columns are created in the original order).
for group_name in ("amplified", "attenuated"):
    for essay_name in ("pos_char", "neg_char"):
        target_scores = df_good[f"{group_name}_dict_{essay_name}"]
        baseline_scores = df_good[f"{group_name}_dict_healthcare_avg"]
        df_good[f"{group_name}_dict_{essay_name}_control"] = target_scores - baseline_scores

In [None]:
# Controlled subgroup scores: characteristics-essay score minus the healthcare
# baseline of the same subgroup. The loop nesting reproduces the original
# column order: amplified subgroups (pos, then neg), then attenuated subgroups.
subgroup_families = (
    ("generalization", "intensifier", "certainty"),
    ("specification", "detensifier", "uncertainty"),
)
for family in subgroup_families:
    for essay_name in ("pos_char", "neg_char"):
        for subgroup_name in family:
            target_scores = df_good[f"{subgroup_name}_dict_{essay_name}"]
            baseline_scores = df_good[f"{subgroup_name}_dict_healthcare_avg"]
            df_good[f"{subgroup_name}_dict_{essay_name}_control"] = target_scores - baseline_scores

In [None]:
df_good.to_csv("df_good.csv")

Descriptive analyses and plotting

In [None]:
#plot 4 distributions

# amplified pos

ampl_pos = df_good["amplified_dict_pos_char_control"]
ampl_pos.describe()

In [None]:
plt.hist(ampl_pos)
plt.ylabel("Frequency of participants")
plt.xlabel("ampl_pos")
plt.show()

In [None]:
# amplified neg char

ampl_neg = df_good["amplified_dict_neg_char_control"]
ampl_neg.describe()

In [None]:
plt.hist(ampl_neg)
plt.ylabel("Frequency of participants")
plt.xlabel("ampl_neg")
plt.show()

In [None]:
# attenuated pos char

atten_pos = df_good["attenuated_dict_pos_char_control"]
atten_pos.describe()

In [None]:
plt.hist(atten_pos)
plt.ylabel("Frequency of participants")
plt.xlabel("atten_pos")
plt.show()

In [None]:
# attenuated neg char

atten_neg = df_good["attenuated_dict_neg_char_control"]
atten_neg.describe()

In [None]:
plt.hist(atten_neg)
plt.ylabel("Frequency of participants")
plt.xlabel("atten_neg")
plt.show()

In [None]:
SE_avg = df_good["SE_avg"]
plt.hist(SE_avg)
plt.ylabel("Frequency of participants")
plt.xlabel("SE_avg")
plt.show()

In [None]:
SE_avg.describe()

In [None]:
SESPM_favorable_construals_avg = df_good["SESPM_favorable_construals_avg"]
plt.hist(SESPM_favorable_construals_avg)
plt.ylabel("Frequency of participants")
plt.xlabel("SESPM_favorable_construals_avg")
plt.show()

In [None]:
SESPM_favorable_construals_avg.describe()

In [None]:
SESPM_positivity_embarcement_avg = df_good["SESPM_positivity_embarcement_avg"]
SESPM_positivity_embarcement_avg.describe()

In [None]:
SD4_NARC_avg = df_good["SD4_NARC_avg"]
SD4_NARC_avg.describe()

In [None]:
SPI_avg = df_good["SPI_avg"]
plt.hist(SPI_avg)
plt.ylabel("Frequency of participants")
plt.xlabel("SPI_avg")
plt.show()

In [None]:
SPM_avg = df_good["SPM_avg"]
plt.hist(SPM_avg)
plt.ylabel("Frequency of participants")
plt.xlabel("SPM_avg")
plt.show()

In [None]:
SPM_avg.describe()

In [None]:
ext  = df_good["extraversion_avg"]

In [None]:
plt.hist(ext)
plt.ylabel("Frequency of participants")
plt.xlabel("ext")
plt.show()

In [None]:
dep = df_good["DASS21_dep_sum"]
dep.describe()

In [None]:
plt.hist(dep)
plt.ylabel("Frequency of participants")
plt.xlabel("dep")
plt.show()

In [None]:
RSE_avg = df_good["Rosenberg_avg"]
RSE_avg.describe()

In [None]:
dep_n = df_good["N3_depression_avg"]
dep_n.describe()

In [None]:
sise = df_good["SISE"]
sise.describe()

In [None]:
plt.hist(sise)
plt.ylabel("Frequency of participants")
plt.xlabel("sise")
plt.show()

In [None]:
sar = df_good["SESPM_self.affirming_reflections_avg"]
sar.describe()

In [None]:
plt.hist(sar)
plt.ylabel("Frequency of participants")
plt.xlabel("sar")
plt.show()

In [None]:
from scipy.stats import skew

print(skew(dep))
print(skew(ext))
print(skew(ampl_neg))
print(skew(ampl_pos))
print(skew(atten_neg))
print(skew(atten_pos))

In [None]:
plt.hist(ampl_neg)
plt.ylabel("Frequency of participants")
plt.xlabel("ampl_neg")
plt.show()

In [None]:
RME = df_good["Rosenberg_avg"]
RME.describe()

In [None]:
plt.hist(RME)
plt.ylabel("Frequency of participants")
plt.xlabel("RME")
plt.show()

In [None]:
print(skew(RME))

# *Cronbach alpha*

In [None]:
SE_all = df_good[["SE01.r", "SE02", "SE03.r", "SE04", "SE05.r", "SE06", "SE07.r", "SE08", "SE09.r", "SE10", "SE11", "SE12.r"]]

In [None]:
 ext_all = df_good[["IPIP120_002", "IPIP120_032", "IPIP120_062.r", "IPIP120_092.r", "IPIP120_007", "IPIP120_037", "IPIP120_067.r", "IPIP120_097.r", "IPIP120_012", "IPIP120_042", "IPIP120_072", "IPIP120_102.r", "IPIP120_017", "IPIP120_047", "IPIP120_077", "IPIP120_107.r", "IPIP120_022", "IPIP120_052", "IPIP120_082", "IPIP120_112", "IPIP120_027", "IPIP120_057", "IPIP120_087", "IPIP120_117"]]

In [None]:
!pip install pingouin
import pingouin as pg

In [None]:
pg.cronbach_alpha(data=SE_all)

In [None]:
pg.cronbach_alpha(data=ext_all)

In [None]:
SPM_all = df_good[["SPM1", "SPM2", "SPM3", "SPM4", "SPM5", "SPM6", "SPM7", "SPM8", "SPM9", "SPM10", "SPM11", "SPM12", "SPM13"]]
pg.cronbach_alpha(data=SPM_all)

In [None]:
DEP_n_all = df_good[["IPIP120_011", "IPIP120_041", "IPIP120_071", "IPIP120_101.r"]]
pg.cronbach_alpha(data=DEP_n_all)

In [None]:
DEP_all = df_good[["DASS21_D01", "DASS21_D02", "DASS21_D03", "DASS21_D04", "DASS21_D05", "DASS21_D06", "DASS21_D07"]]
pg.cronbach_alpha(data=DEP_all)

In [None]:
NARC_all = df_good[["SD4_NARC01", "SD4_NARC03", "SD4_NARC04", "SD4_NARC05", "SD4_NARC06", "SD4_NARC07"]]
pg.cronbach_alpha(data=NARC_all)

In [None]:
PE_all = df_good[["SESPMOther_PE001", "SESPMOther_PE002", "SESPMOther_PE003", "SESPMOther_PE004", "SESPMOther_PE005", "SESPMOther_PE006", "SESPMOther_PE007", "SESPMOther_PE008", "SESPMOther_PE009", "SESPMOther_PE010"]]
pg.cronbach_alpha(data=PE_all)

In [None]:
FC_all = df_good[["SESPMOther_FC01", "SESPMOther_FC02", "SESPMOther_FC03", "SESPMOther_FC04", "SESPMOther_FC05", "SESPMOther_FC06"]]
pg.cronbach_alpha(data=FC_all)

In [None]:
SAR_all = df_good[["SESPMOther_SAR01", "SESPMOther_SAR02", "SESPMOther_SAR03", "SESPMOther_SAR04", "SESPMOther_SAR05", "SESPMOther_SAR06"]]
pg.cronbach_alpha(data=SAR_all)

In [None]:
RSE_all = df_good[["Rosenberg_01", "Rosenberg_02.r", "Rosenberg_03", "Rosenberg_04", "Rosenberg_05.r", "Rosenberg_06.r", "Rosenberg_07", "Rosenberg_08.r", "Rosenberg_09.r", "Rosenberg_10"]]
pg.cronbach_alpha(data=RSE_all)

# *McDonald Omega*

In [None]:
!pip install reliabiliPy
from reliabilipy import reliability_analysis

In [None]:
ra = reliability_analysis(raw_dataset=SE_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)

In [None]:
ra = reliability_analysis(raw_dataset=FC_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)

In [None]:
ra = reliability_analysis(raw_dataset=SAR_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)

In [None]:
ra = reliability_analysis(raw_dataset=PE_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)

In [None]:
ra = reliability_analysis(raw_dataset=SPM_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)

In [None]:
ra = reliability_analysis(raw_dataset=NARC_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)

In [None]:
ra = reliability_analysis(raw_dataset=RSE_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)

In [None]:
ra = reliability_analysis(raw_dataset=DEP_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)

In [None]:
ra = reliability_analysis(raw_dataset=DEP_n_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)

In [None]:
ra = reliability_analysis(raw_dataset=ext_all, is_corr_matrix=False, impute='median')
ra.fit()
print(ra.omega_total)