In [1]:
import pandas as pd
import numpy as np
import re

dat = pd.read_csv('../data/destigma_pipeline/task2/stigma-posts.csv')

## Stigma by type

In [8]:
# label types of stigma types
# keywords
self = ['self-stigma', 'internalization', 'Self-Stigma', 'self-stigmatizing']
structural = ['structural stigma', 'institutional stigma', 'systemic stigma']

# regex
self_re = re.compile(r'\bself-stigma\b|\binternalization\b|\bSelf-Stigma\b|\bself-stigmatizing\b')
structural_re = re.compile(r'\bstructural stigma\b|\binstitutional stigma\b|\bsystemic stigma\b')

# create new columns
dat['stigma_type'] = np.nan
dat['stigma_type'] = np.where(dat['stigmaExplanation'].str.contains(self_re), 'self-stigma', dat['stigma_type'])
dat['stigma_type'] = np.where(dat['stigmaExplanation'].str.contains(structural_re), 'structural stigma', dat['stigma_type'])
# make np.nan to 'directed stigma'
dat['stigma_type'] = np.where(dat['stigma_type']=='nan', 'directed stigma', dat['stigma_type'])

In [9]:
dat['stigma_type'].value_counts()

directed stigma      1949
self-stigma          1199
structural stigma      59
Name: stigma_type, dtype: int64

In [5]:
# manually correct some of the labels
# ids= ['ej9n8h', 'e0f34z', 'aopd9s', 'iijeac', 'xz3ah2', 'aasrzu', '3gj2ac', '4mdyst', '41v8m5', '1t0y0y', 'ez09dy', 'uvwabs', '9sjnir', 'awmgr2', 'zr34f0', '6eie6n', '4zwshz', 'nocycv', 'ylife', 'mofShe', 'Swiw0z', 'aOylmk']
# dat_subs = dat[dat['id'].isin(ids)]

## Stigma by substance

In [10]:
narcotics = ['narcotics', 'opioids', 'heroin', 'fentanyl', 'opiates', 'opiate', 'opioid', 'opiods', 'oxy', 'oxycodone', 'hydrocodone', 'codeine', 'morphine', 'methadone', 'buprenorphine', 'suboxone', 'subutex',
             'dilaudid', 'vicodin', 'percocet', 'percocets', 'roxicodone', 'roxy', 'roxies', 'oxymorphone', 'oxymorphones', 
             'hydromorphone', 'hydromorphones', 'pain pills', 'opium', 'opoid','fent', 'dope','painkillers', 'pain killers']
hallucinogens = ['hallucinogens', 'lsd', 'acid', 'mushrooms', 'psilocybin', 'dmt', 'ayahuasca', 'peyote', 'mdma', 'ecstasy', 'molly','ketamine','salvia', 
                 'dissociatives', 'pcp', 'angel dust', 'shrooms']
depressants = ['depressants', 'benzos', 'xanax', 'valium', 'ativan', 'klonopin', 'librium', 'tranquilizers', 'barbiturates', 'sleeping pills', 'sedatives',
               'benzodiazepines', 'benzodiazepine', 'sedative']
stims = ['stimulants', 'meth', 'methamphetamine', 'cocaine', 'crystal meth', 'speed', 'adderall', 'ritalin', 'amphetamine', 'amphetamines',
         'blow', 'coke', 'crack', 'crack cocaine', 'crack-cocaine']
drugs_of_concern = ['dxm', 'dextromethorphan', 'kratom', 'fake pills']
designer_drugs = ['bath salts', 'flakka','gravel', 'cloud nine', 'snow blow', 'vanilla sky', 'lunar wave', 'white lightning']
cannabis = ['cannabis', 'weed', 'marijuana', 'pot', 'thc', 'cbd']
synthetic_cannabinoids = ['k2', 'spice', 'synthetic cannabinoids', 'synthetic marijuana']
reversal_agents = ['naloxone', 'narcan', 'nalmefene', 'kloxxado', 'evzio', 'naloxone hydrochloride', 'naloxone hcl']
other = ['inhalants', 'steroids', 'jenkem']

In [11]:
# if any of the keywords are in the language posts
dat['substance_type'] = np.nan
dat['substances'] = np.nan
dat['text'] = dat['text'].str.lower()

for index, row in dat.iterrows():
    substance_types = []
    substances = []
    for drug in narcotics:
        if drug in row['text']:
            substance_types.append('narcotics')
            substances.append(drug)
    for drug in hallucinogens:
        if drug in row['text']:
            substance_types.append('hallucinogens')
            substances.append(drug)
    for drug in depressants:
        if drug in row['text']:
            substance_types.append('depressants')
            substances.append(drug)
    for drug in stims:
        if drug in row['text']:
            substance_types.append('stimulants')
            substances.append(drug)
    for drug in drugs_of_concern:
        if drug in row['text']:
            substance_types.append('drugs_of_concern')
            substances.append(drug)
    for drug in designer_drugs:
        if drug in row['text']:
            substance_types.append('designer_drugs')
            substances.append(drug)
    for drug in cannabis:
        if drug in row['text']:
            substance_types.append('cannabis')
            substances.append(drug)
    for drug in synthetic_cannabinoids:
        if drug in row['text']:
            substance_types.append('synthetic_cannabinoids')
            substances.append(drug)
    for drug in other:
        if drug in row['text']:
            substance_types.append('other')
            substances.append(drug)
    for drug in reversal_agents:
        if drug in row['text']:
            substance_types.append('reversal_agents')
            substances.append(drug)
    if len(substance_types) == 0:
        substance_types.append('unspecified')
    dat.at[index, 'substance_type'] = ', '.join(set(substance_types))
    dat.at[index, 'substances'] = ', '.join(set(substances))

In [12]:
dat['substance_type'].value_counts()

unspecified                                                       1021
stimulants                                                         591
cannabis                                                           379
narcotics                                                          309
stimulants, narcotics                                              200
                                                                  ... 
narcotics, hallucinogens, stimulants, cannabis, designer_drugs       1
stimulants, designer_drugs                                           1
reversal_agents, cannabis                                            1
narcotics, drugs_of_concern                                          1
stimulants, narcotics, reversal_agents, depressants                  1
Name: substance_type, Length: 66, dtype: int64

In [13]:
# count the number of stigma types in all posts
narcotics = 0
hallucinogens = 0
depressants = 0
stims = 0
drugs_of_concern = 0
designer_drugs = 0
cannabis = 0
synthetic_cannabinoids = 0
reversal_agents = 0
other = 0
unspecified = 0
for index, row in dat.iterrows():
    if 'narcotics' in row['substance_type']:
        narcotics += 1
    if 'hallucinogens' in row['substance_type']:
        hallucinogens += 1
    if 'depressants' in row['substance_type']:
        depressants += 1
    if 'stimulants' in row['substance_type']:
        stims += 1
    if 'drugs_of_concern' in row['substance_type']:
        drugs_of_concern += 1
    if 'designer_drugs' in row['substance_type']:
        designer_drugs += 1
    if 'cannabis' in row['substance_type']:
        cannabis += 1
    if 'synthetic_cannabinoids' in row['substance_type']:
        synthetic_cannabinoids += 1
    if 'reversal_agents' in row['substance_type']:
        reversal_agents += 1
    if 'other' in row['substance_type']:
        other += 1
    if 'unspecified' in row['substance_type']:
        unspecified += 1

print('narcotics:', narcotics)
print('hallucinogens:', hallucinogens)
print('depressants:', depressants)
print('stims:', stims)
print('drugs_of_concern:', drugs_of_concern)
print('designer_drugs:', designer_drugs)
print('cannabis:', cannabis)
print('synthetic_cannabinoids:', synthetic_cannabinoids)
print('reversal_agents:', reversal_agents)
print('other:', other)
print('unspecified:', unspecified)


narcotics: 769
hallucinogens: 162
depressants: 200
stims: 1218
drugs_of_concern: 14
designer_drugs: 6
cannabis: 818
synthetic_cannabinoids: 14
reversal_agents: 41
other: 8
unspecified: 1021


In [21]:
dat2 = dat.copy()
dat2["substance_type"] = dat2["substance_type"].str.split(", ")
dat2 = dat2.explode("substance_type")
dat2["substance_type"] = dat2["substance_type"].str.strip()

table = pd.crosstab(dat2['substance_type'], dat2['stigma_type'])
subs_totals = table.sum(axis=1)
stigma_totals = table.sum(axis=0)
sorted_table = table.reindex(subs_totals.sort_values(ascending=False).index, axis=0)

table.to_csv('../data/destigma_pipeline/task2/stigma-substance-table.csv')
print(sorted_table)

stigma_type             directed stigma  self-stigma  structural stigma
substance_type                                                         
stimulants                          818          380                 20
unspecified                         537          475                  9
cannabis                            515          276                 27
narcotics                           501          250                 18
depressants                          92          102                  6
hallucinogens                        90           68                  4
reversal_agents                      38            3                  0
drugs_of_concern                      7            7                  0
synthetic_cannabinoids               11            3                  0
other                                 4            3                  1
designer_drugs                        6            0                  0


In [23]:
# group by stigma type and subreddit and count the number of posts
dat['subreddit'].value_counts()


offmychest          2565
unpopularopinion     460
nursing              156
medicine              26
Name: subreddit, dtype: int64

In [25]:
dat.to_csv('../data/destigma_pipeline/task2/stigma-posts.csv', index=False)

---
## Phrase extraction

In [14]:
directed = dat[dat['stigma_type']=='directed stigma']

In [26]:
self = dat[dat['stigma_type']=='self-stigma']
structural = dat[dat['stigma_type']=='structural stigma']

self.to_csv('../data/destigma_pipeline/task2/self-stigma-posts.csv', index=False)
structural.to_csv('../data/destigma_pipeline/task2/structural-stigma-posts.csv', index=False)

In [24]:
directed['subreddit'].value_counts()

offmychest          1339
unpopularopinion     445
nursing              143
medicine              22
Name: subreddit, dtype: int64

In [None]:
import spacy
import spacy_transformers
from collections import Counter

# meaninful phrases
nlp = spacy.load("en_core_web_trf")

def extract_meaningful_phrases(text):
    doc = nlp(text)
    # Filter out pronouns and single-character words in noun chunks
    return [chunk.text for chunk in doc.noun_chunks if chunk.root.pos_ not in ['PRON'] and len(chunk.text) > 1]

# Apply the function to extract phrases
directed['phrases'] = directed['text'].apply(extract_meaningful_phrases)
phrases_flat = [phrase for sublist in directed['phrases'] for phrase in sublist]
phrase_counts = Counter(phrases_flat)

# Display the most common phrases
print(phrase_counts.most_common(20))

In [None]:
# refine noun extraction
def extract_complex_phrases(text):
    doc = nlp(text)
    # Combine noun chunks and named entities for richer context
    phrases = set(chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1)
    entities = set(ent.text for ent in doc.ents)
    return list(phrases.union(entities))

directed['phrases2'] = directed['text'].apply(extract_complex_phrases)
phrases_flat2 = [phrase for sublist in directed['phrases2'] for phrase in sublist]
phrase_counts2 = Counter(phrases_flat2)

print(phrase_counts2.most_common(20))

[('one', 335), ('first', 241), ('my life', 186), ('my mom', 175), ('two', 175), ('a lot', 174), ('today', 155), ('my brother', 141), ('my family', 130), ('my dad', 117), ('no one', 116), ('my mother', 111), ('the time', 109), ('my parents', 102), ('a job', 99), ('the house', 99), ('my sister', 98), ('the way', 92), ('an addict', 91), ('your life', 90)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  directed['phrases2'] = directed['text'].apply(extract_complex_phrases)


In [None]:
# extract based on dependency parsing

def extract_dependency_phrases(text):
    doc = nlp(text)
    phrases = []
    for token in doc:
        # Extract phrases based on syntactic dependencies
        if token.dep_ in ('nsubj', 'dobj', 'pobj', 'attr', 'ROOT'):
            phrases.append(token.head.text + ' ' + token.text)
    return phrases

directed['phrases_dp'] = directed['text'].apply(extract_dependency_phrases)
phrases_flat_dp = [phrase for sublist in directed['phrases_dp'] for phrase in sublist]
phrase_counts_dp = Counter(phrases_flat_dp)

print(phrase_counts_dp.most_common(20))

[('is is', 1705), ('was was', 1090), ("'m i", 987), ("'s 's", 809), ("'s it", 805), ('know i', 745), ('have i', 655), ('have have', 647), ('was i', 624), ('are are', 622), ('know know', 607), ('want i', 540), ('was it', 522), ("'m 'm", 470), ('had had', 462), ('am i', 426), ('feel i', 423), ("'re you", 423), ('had i', 412), ('was he', 402)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  directed['phrases_dp'] = directed['text'].apply(extract_dependency_phrases)


In [None]:
# extract subtree
def extract_subtrees(text):
    doc = nlp(text)
    phrases = []
    for token in doc:
        if token.dep_ in ['nsubj', 'obj', 'iobj']:  # You can modify the list of dependency tags
            # Join the tokens in the subtree into a single string
            subtree = ' '.join([t.text for t in token.subtree])
            phrases.append(subtree)
    return phrases

# Apply the function to extract phrases
directed['phrases_subtr'] = directed['text'].apply(extract_subtrees)
phrases_flat_s = [phrase for sublist in directed['phrases_subtr'] for phrase in sublist]
phrase_counts_s = Counter(phrases_flat_s)

# Display the most common phrases
print(phrase_counts_s.most_common(20))

[('i', 16933), ('you', 6257), ('he', 4593), ('she', 4128), ('it', 3426), ('they', 2437), ('we', 1861), ('that', 1554), ('who', 1233), ('this', 647), ('me', 595), ('her', 301), ('people', 300), ('\\n\\ni', 283), ('him', 265), ('what', 237), ('which', 234), ('my mom', 218), ('them', 157), ('my dad', 152)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  directed['phrases_subtr'] = directed['text'].apply(extract_subtrees)


In [None]:
from rake_nltk import Rake 

rake = Rake()

def extract_keywords(text):
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

directed['keywords_rake'] = directed['text'].apply(extract_keywords)
keywords_flat = [keyword for sublist in directed['keywords_rake'] for keyword in sublist]
keyword_counts = Counter(keywords_flat)

print(keyword_counts.most_common(20))


[('n', 3203), ('’', 2931), ('know', 1051), ('get', 1017), ('want', 781), ('drugs', 702), ('life', 668), ('people', 641), ('going', 613), ('fuck', 581), ('time', 562), ('.\\ n', 558), ('like', 516), ('ni', 501), ('think', 500), ('go', 469), ('shit', 458), ('one', 456), ('see', 429), ('family', 422)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  directed['keywords'] = directed['text'].apply(extract_keywords)


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/laylabouzoubaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/laylabouzoubaa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import yake

yake_kw = yake.KeywordExtractor(lan='en', n=4, dedupLim=0.9, dedupFunc='seqm', windowsSize=1, top=20, features=None)

def extract_keywords_yake(text):
    keywords = yake_kw.extract_keywords(text)
    return [keyword[0] for keyword in keywords]

directed['keywords_yake'] = directed['text'].apply(extract_keywords_yake)
keywords_flat_yake = [keyword for sublist in directed['keywords_yake'] for keyword in sublist]
keyword_counts_yake = Counter(keywords_flat_yake)

print(keyword_counts_yake.most_common(20))

[('people', 317), ('drugs', 281), ('drug', 248), ('time', 218), ('fucking', 211), ('n’t', 209), ('life', 202), ('years', 163), ('fuck', 149), ('weed', 149), ('shit', 136), ('mom', 130), ('feel', 129), ('hate', 127), ('friend', 121), ('addict', 120), ('friends', 118), ('brother', 114), ('family', 109), ('back', 109)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  directed['keywords_yake'] = directed['text'].apply(extract_keywords_yake)


In [None]:
seed_words = ['addict', 'drugs', 'junkie', 'overdose', 'clean', 'rehab']

dict_of_phrases = {}

for word in seed_words:
    seed_posts = []
    for index, row in directed.iterrows():
        if word in row['text']:
            seed_posts.append(row['text'])
    
    ngram_max = 4
    numOfKeywords = 20
    dedupFunc='seqm'
    dedupLim=0.1
    windowSize=1

    kw_extractor = yake.KeywordExtractor(lan='en', 
                                         n=ngram_max, 
                                         dedupLim=dedupLim, 
                                         dedupFunc=dedupFunc, 
                                         windowsSize=windowSize, 
                                         top=numOfKeywords, features=None)
    
    keywords = kw_extractor.extract_keywords(' '.join(seed_posts))
    dict_of_phrases[word] = [keyword[0] for keyword in keywords]

    print("done for ", word)

done for  addict
done for  drugs
done for  junkie
done for  overdose
done for  clean
done for  rehab


In [None]:
for key, value in dict_of_phrases.items():
    print(key, value)

addict ['sister that i ’ve', 'rehab a few weeks', 'fuck up. i thought', 'n’t', 'day', 'mom', 'ago', 'end', 'run', '’ll', 'idk', 'wtf', 'dog witnessed a drug', 'harrassing my family', 'leg', '5-10', 'gpa', 'yrs', 'completely psychotic', 'xxx']
drugs ['mom will never find', 'shit took our daughter', 'weeks later she calls', 'n’t', 'back', 'guy', 'kids', 'job', 'hell drug addicts put', '’re', 'sex', 'dumbest things people', 'cpr', 'adhd', '5-10', 'usa', 'fbi', 'hmmm', '24-7', 'aaaaaaaaaaaaaaaaaaaaaaa']
junkie ['n’t had a job', 'junkie', 'drugs', 'year', 'mom', 'living', 'call', 'piece of shit', 'good people', 'etc.', 'addict today', '5-10', 'flew', 'bbq', 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaahh', 'ffffuuuuuccckk', 'ahhhhh', 'assessments', '04:52']
overdose ['made it in time', 'drug', 'years', 'high', 'good', 'fuck', 'knew', 'completely', 'nwhy', '5-10', 'bff', '24-7', 'initial labs are afu']
clean ['makes them a lot', 'girl i was friends', 'drug', 'back', 'n’t', 'high', 'job', 'full', '’v

In [None]:
# export dict
dict_df = pd.DataFrame.from_dict(dict_of_phrases, orient='index')
dict_df.to_csv('../data/destigma_pipeline/task2/stigma_posts_phrases.csv')

In [16]:
# export
directed.to_csv('../data/destigma_pipeline/task2/directed_stigma.csv', index=False)

---
## Extracting negative verbs and negative associations

In [10]:
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Assuming 'text' column in your DataFrame contains the posts
sia = SentimentIntensityAnalyzer()
directed['sentiment'] = directed['text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Filter out sentences with negative sentiment
negative_posts = directed[directed['sentiment'] < 0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  directed['sentiment'] = directed['text'].apply(lambda x: sia.polarity_scores(x)['compound'])


In [11]:
# extract verbs
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_verbs(text):
    doc = nlp(text)
    verbs = [token.lemma_ for token in doc if token.pos_ == 'VERB']
    return verbs

negative_posts['verbs'] = negative_posts['text'].apply(extract_verbs)
verb_counts = negative_posts['verbs'].explode().value_counts()
print(verb_counts.head(20))  # Print top 20 verbs


ImportError: cannot import name util

---
## semantic role labeling

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

from nltk import pos_tag, word_tokenize
from nltk.chunk import ne_chunk
from nltk.tree import Tree

def extract_np_vp_chunks(sentence):
    words = word_tokenize(sentence)
    pos_tags = pos_tag(words)

    # Chunk sentence into noun phrases (NP) and verb phrases (VP) using a basic grammar
    grammar = r"""
        NP: {<DT|PP\$>?<JJ>*<NN.*>+}  # Chunk sequences of DT, JJ, NN
        VP: {<VB.*><NP|PP|CLAUSE>+$}  # Chunk verbs and their arguments
        PP: {<IN><NP>}                # Chunk prepositions followed by NP
        CLAUSE: {<NP><VP>}            # Chunk NP, VP
    """
    cp = nltk.RegexpParser(grammar)
    tree = cp.parse(pos_tags)
    
    return tree

def simple_srl(sentence):
    tree = extract_np_vp_chunks(sentence)
    semantic_roles = []
    
    # Let's define ARG0 typically as the subject NP, ARG1 as the object NP, and V as the main verb
    for subtree in tree.subtrees():
        if subtree.label() == 'VP':
            verb = None
            args = []
            for s in subtree:
                if type(s) is Tree:
                    if s.label() == 'NP':
                        args.append(' '.join(word for word, tag in s.leaves()))
                else:
                    verb = s[0]
            if verb:
                semantic_roles.append(('V', verb))
                if args:
                    semantic_roles.append(('ARG0', args[0]))  # Simplistically considering the first NP as ARG0
                    if len(args) > 1:
                        semantic_roles.append(('ARG1', args[1]))  # And the second NP as ARG1

    return semantic_roles


In [15]:
# apply srl to the text
directed['srl'] = directed['text'].apply(simple_srl)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  directed['srl'] = directed['text'].apply(simple_srl)


In [16]:
directed_srl = directed[directed['srl'].apply(lambda x: len(x) > 0)][['text', 'srl']]
directed_srl.to_csv('../data/destigma_pipeline/task2/stigma-posts-srl.csv', index=False)

In [None]:
nlp = spacy.load("en_core_web_trf")

def analyze_sentence(sentence):
    doc = nlp(sentence)
    semantic_roles = []

    # Extract entities and their roles based on dependency parsing
    for token in doc:
        # Finding verb (predicate)
        if token.pos_ == 'VERB':
            subject = ''
            object = ''
            # Searching for subject and object relations
            for child in token.children:
                if child.dep_ in ('nsubj', 'nsubjpass'):
                    subject = child.text
                elif child.dep_ in ('dobj', 'attr', 'prep', 'ccomp'):
                    object = child.text
            if subject:
                semantic_roles.append(('V', token.text, 'ARG0', subject))
            if object:
                semantic_roles.append(('V', token.text, 'ARG1', object))
    
    return semantic_roles

In [10]:
directed_srl = pd.read_csv('../data/destigma_pipeline/task2/directed_srl.csv')

---
### junk

In [None]:
# phrase extraction 
from collections import Counter
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)

# define phrases
phrases = ['drug addict', 'drug addiction', 'addict', 'addiction', 'junkie', 
              'druggie', 'druggies', 'addicts', 'od', 'overdose', 'low-life']

# add phrases to matcher
patterns = [nlp.make_doc(text) for text in phrases]
matcher.add('stigma', None, *patterns)

# extract phrases
phrases = []
for index, row in directed.iterrows():
    doc = nlp(row['text'])
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        phrases.append(span.text)

# count phrases
phrase_counts = Counter(phrases)
print(phrase_counts)



In [None]:
import nltk
from nltk.text import Text

# Tokenize the complete text corpus
tokens = [word for sent in directed['text'] for word in nltk.word_tokenize(sent)]
text_obj = Text(tokens)

# Function to print concordances of phrases
def print_concordances(phrase, width=80):
    text_obj.concordance(phrase, width=width, lines=5)

# Example usage for the phrase 'substance use'
print_concordances('drugs')

Displaying 5 of 1274 matches:
ruggy . you can tell my sister is on drugs because she doesnt shut her yap and 
d my mom says my sister is n't doing drugs but she ca n't tell.\n\nno wonder i 
sister is a liar , steals , and does drugs . she tries to make you feel like yo
rve major kudos . hey i 've done the drugs , i did coke like nobodys business i
ast 13 years and now she wants to do drugs again .... drugs suck i do n't care 


In [None]:
from allennlp.predictors import Predictor
import allennlp_models.tagging

# Load the pre-trained Semantic Role Labeling model
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.11.19.tar.gz")

# Define a function to apply SRL to a sentence
def semantic_role_labeling(text):
    # Use the predictor on the provided text
    results = predictor.predict(sentence=text)
    
    # Extract and return the verbs and their roles
    verbs = results['verbs']
    formatted_results = []
    for verb in verbs:
        description = verb['description']
        formatted_results.append(description)
    
    return formatted_results