In [43]:
import pandas as pd
import numpy as np
import pickle
from openai import OpenAI
import time
import os
# 2.3 million rows
drug_sub = pickle.load(open('../data/all_data_filtered_delauthor.pkl', 'rb'))
# 1,5 million
non_drug_sub = pickle.load(open('../data/non_drug_data_filtered_delauthor.pkl', 'rb'))


In [57]:
# how many records from [deleted] authors in both subreddits
print('drug_sub: ', drug_sub[drug_sub['author'] == '[deleted]'].shape[0])
print('non_drug_sub: ', non_drug_sub[non_drug_sub['author'] == '[deleted]'].shape[0])


drug_sub:  412808
non_drug_sub:  157702


In [34]:
prompt = f"""
**Instructions:**\
You are an expert sociologist on stigma among people who use drugs. You have been asked to label posts from various Reddit communities on the disclosure of a stigma utterance.\
1. Provide one label for each post: S-D, S-ND, NS-D, NS-ND. If you are unsure, provide your best guess. \
2. If the post is labeled S-D or S-ND, extract the exact utterance of stigma and provide a brief description of the stigma.\

**Definitions:**\
The inclusion of stigma can be explicit or implicit and can be directed at an individual or a group using stigmatizing language or stereotypes.\
Stigma usually includes four intrinsic features: marks, labels, etiology, and peril. Marks are nonverbal cues that identify members of a stigmatized group. Labels arouse and reflect social cognitions, such as considering stigmatized people to be a distinct group, highlighting intergroup differences, encouraging categorization, and promoting stereotypes. Etiology content describes stigmatized people’s voluntary decisions to violate their social responsibilities and engage in taboo activities. Peril content links stigmatized people to physical, neurological, and social threats, such as pain, death, aggression, and taboo behavior\
The post author can describe an instance where they felt stigmatized by another person or a system or describe feelings of shame or guilt because of their drug use.\
If the post author describes a situation where they feel unsure or worried because of how they will be perceived by others, anticipating being discrimminated against because of their membership in the group, this can also be considered a post about stigma.\
You can label a post in one of four ways: \
1. S-D: The post contains an utterance of stigma. Either language that is stigmatizing or a description of an experience related to stigma *AND* the stigma is related to drug use.\
2. S-ND: The post contains an utterance of stigma. Either language that is stigmatizing or a description of an experience related to stigma *AND* the stigma is *NOT* related to drug use.\
3. NS-D: The post does *NOT* contain any utterance of stigma *AND* the post *IS* related to drug use.\
4. NS-ND: The post does *NOT* contain any utterance of stigma *AND* the post is *NOT* related to drug use.\

**Considerations:**\
1. Sometimes people use coded language to refer to their drug use (e.g. 'stoner' to refer to being high on cannabis, 'ents' for a person who enjoys cannabis, or '[5]' to refer to the degree of the user's high out of 10) \
2. Focus on Intent: Stigmatizing language aims to degrade, shame, or perpetuate negative stereotypes about people who use drugs. \
3. Context Matters: Consider the overall tone and purpose of the post. Is it intended to inform, share an experience, or genuinely promote negativity towards people who use drugs?\

"""

In [28]:
from openai import OpenAI
api_key = "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"
client = OpenAI(api_key = api_key)

In [37]:
example1 = "You did it. Take responsibility for it, US Government. We didn't want to break the law. We only wanted to do something harmless and fun. But we got used to the idea of breaking the law on a regular basis. We learned to be afraid of the police.You taught us that the justice system is not our friend.And so we got very comfortable living outside the law."
answer1 = "S-D, 'Take responsibility for it, US Government. We didn't want to break the law. We only wanted to do something harmless and fun. But we got used to the idea of breaking the law on a regular basis. We learned to be afraid of the police.You taught us that the justice system is not our friend.And so we got very comfortable living outside the law.', The author is describing a situation where they feel there are systemic barriers that prevent them from living a normal life and are blaming the US government for their situation. The author is also describing a situation where they feel they are being discriminated against because of their drug use."
example2 = "I hope my junkie sister OD's or disappears out of our lives My sister is an alcoholic junkie who has 2 DUIs under her belt as well as loves taking Xanax and alcohol together and wreaking havoc for our family and even strangers."
answer2 = "S-D, 'junkie', The author is using the term junkie to describe their sister and is perpetuating the stereotype that people who use drugs are dangerous and harmful."
example3 = "Moral of the story is never trust a low life no matter how much you feed them and house them and be friendly with them, at the end of the day they don't have shit and if you have shit they're gonna fucking take it."
answer3 = "S-ND, 'never trust a low life', The author is labeling a person who is unhoused as a 'low-life' and is perpetuating the stereotype that they are thieves and including a message of peril that your belongings will get stolen if you choose to support them."
example4 = "My mom is going to kick me out. She graciously gave me the choice of getting dropped off in a shelter in either San Diego or the desert area (Palm Springs and surrounding areas). I would choose the desert because that is one of my old stomping grounds. The dope is phenomenal and cheap (3g's for $100) and the homeless population is a majority young people. I can also hustle up $350 and rent a room at a buddy's place. I have a few options that I can look at but I have to figure it out soon."
answer4 = "NS-D"
example5 = "So, I got in touch with a friend who grows. I smoked some weed, and I never felt better. Not the high, but after. After the high I was finally happy. Finally. My anxiety was gone and I never felt better. So I decide to smoke for two months then tell my mom and get my green card. She said I seem happier and stuff, then I told her why and she flipped."
answer5 = "NS-D"
example6 = "I think I genuinely may have developed some sort of PTSD due to the incident. I have had an anxiety disorder of some kind for my entire life but over the past few years it has been getting steadily worse and I wonder if it is due to that incident."
answer6 = "NS-ND"
example7 = "Death with dignity people are wrong. I know the argument that people will be in lots of pain near the end of their life so they should kill themselves early on when they get cancer or whatever. I don't think it's very dignified to kill yourself to avoid temporal pain. It also begs the question of what is considered enough pain or chance of not getting better is enough be think killing yourself is a good idea. I've been in extreme pain before, sometimes for extended periods of time, but I didn't think killing myself was the best solution to my problems."
answer7 = "S-ND, 'I don't think it's very dignified to kill yourself to avoid temporal pain', The author is insenuating that people with a terminal disease have no dignity when they choose a compassionate death and is assigning blame to that person."
example8 = "When I'm dating a solid 10/10 girl there is close to nothing she can do wrong during step 2 that won't still merit her the joys of step 3. I, on the other hand, can kill the whole thing deader than Hitler by making one tiny mistake. Perhaps I'll text her back too quickly giving her the impression I'm desperate. Well princesses can't we all be equal and stop this stupid fucking game? No you say? Well why not? Men everywhere have to suffer your shit like slaves to pussy! But it's obvious why it must be this way; because 10/10s don't date guys they want to fuck! Why would they? Cock runs from the taps for these princesses! Wait, that's it... Don't date 10/10s."
answer8 = "S-ND, 'I'm dating a solid 10/10 girl; princesses; because 10/10s don't date guys they want to fuck', The author is labeling attractive women as princesses and perpetuating a stereotype. They are also issuing a warning about dating attractive women."

def get_categories(post, prompt, retries = 2, model = "gpt-4-0125-preview"):
    while retries > 0:
        try:
            response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": prompt
                },
                {
                    "role": "user",
                    "content": example1

                },
                {
                    "role": "system",
                    "content": answer1
                },
                {
                    "role": "user",
                    "content": example2
                },
                {
                    "role": "system",
                    "content": answer2
                },
                {
                    "role": "user",
                    "content": example3
                },
                {
                    "role": "system",
                    "content": answer3
                },
                {
                    "role": "user",
                    "content": example4
                },
                {
                    "role": "system",
                    "content": answer4
                },
                {
                    "role": "user",
                    "content": example5
                },
                {
                    "role": "system",
                    "content": answer5
                },
                {
                    "role": "user",
                    "content": example6
                },
                {
                    "role": "system",
                    "content": answer6
                },
                {
                    "role": "user",
                    "content": example7
                },
                {
                    "role": "system",
                    "content": answer7
                },
                {
                    "role": "user",
                    "content": post
                }

            ],
            model=model,
            temperature=0
        )
            label = response.choices[0].message.content.lower().strip()
            return label
        except Exception as e:
            if e:
                print(e)
                retries -= 1
                time.sleep(5)
            else:
                raise e
    print("Retrying...")
    return "skipped"

In [30]:
# sample 500 from each
seed = 2024
drug_sample = drug_sub.sample(500, random_state = seed)
non_drug_sample = non_drug_sub.sample(500, random_state = seed)
sample2 = pd.concat([drug_sample, non_drug_sample])

In [31]:
sample2.to_csv('../data/sample2.csv', index = False)

---
Run GPT4

In [14]:
sample['label_2'] = sample['text'].apply(lambda x: get_categories(x, prompt))

In [27]:
print(sample['label_2'].value_counts())
sample.to_csv('../data/stigma_drug_categories_1k.csv', index=False)


ns-nd    450
ns-d     423
s-nd     109
s-d       18
Name: label_2, dtype: int64


In [21]:
# 5/8/24 sample was run on self text not text
# get rows where self text is empty
just_title = sample[sample['selftext'] == '']

In [23]:
just_title['label_2'] = just_title['text'].apply(lambda x: get_categories(x, prompt))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  just_title['label_2'] = just_title['text'].apply(lambda x: get_categories(x, prompt))


In [24]:
# merge back on index - replace label_2 with new labels
sample.loc[just_title.index, 'label_2'] = just_title['label_2']

In [25]:
print(sample['label_2'].value_counts())

ns-nd    450
ns-d     423
s-nd     109
s-d       18
Name: label_2, dtype: int64


In [26]:
# remove label1
sample = sample.drop(columns = ['label_1'])

In [16]:
# for review:
# weighted for each label
review_sample = sample.sample(200)
review_sample.to_csv('../data/stigma_drug_categories_review_200.csv', index=False)

---
NEW 1k posts after agreement review 05/09/2024

In [40]:
sample2['label2'] = sample2['text'].apply(lambda x: get_categories(x, prompt))

In [41]:
sample2.to_csv('../data/stigma_drug_categories_1k_2.csv', index=False)

In [42]:
review_sample2 = sample2.sample(200)
review_sample2.to_csv('../data/stigma_drug_categories_review_200_2.csv', index=False)

In [44]:
sample2 = pd.read_csv('../data/stigma_drug_categories_1k_2.csv')

In [45]:
# stigma drug
stigma_drug = sample2[sample2['label2'].str.contains('s-d')]

In [46]:
# agreement
labeled = pd.read_excel('../data/stigma_drug_categories_review_200_2_combined.xlsx')

In [47]:
# cohens kappa
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(labeled['agree_LB'], labeled['agree_EA'])

0.32515337423312884

In [53]:
# krippendorff's alpha
import krippendorff

# data needs to be lists of lists
data = {
    'agree_LB': labeled['agree_LB'].tolist(),
    'agree_EA': labeled['agree_EA'].tolist()

}
df = pd.DataFrame(data)
formatted_data = [df[col].tolist() for col in df.columns]

alpha = krippendorff.alpha(formatted_data, level_of_measurement='nominal')
print(alpha)

0.3259099984641377


In [55]:
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa


In [56]:
data_transposed = np.array([df[col].values for col in df.columns]).T

# Using aggregate_raters to prepare data for Fleiss' Kappa, which is a similar measure
table, n_ij = aggregate_raters(data_transposed)

# Compute Fleiss' Kappa as an approximation
kappa = fleiss_kappa(table, method='fleiss')

print(f"Fleiss' Kappa: {kappa}")

Fleiss' Kappa: 0.3242205498387334
