### model predictions
* fastText expects multiclass input data as a flat text file in which each line has the form ```__label__ + label_class + <space> + preprocessed_text```
* In this notebook we load an already-trained model and collect its predicted score for every class on each example

In [1]:
! pip install fasttext
! pip install pandas
! pip install gcsfs
! pip install jsonlines



In [2]:
# Import packages and create the GCS filesystem handle
import gcsfs
import fasttext
import pandas as pd
import string

# Filesystem handle bound to the 'sm4h-rxspace' GCP project.
fs = gcsfs.GCSFileSystem(project='sm4h-rxspace')

In [3]:
from datetime import datetime

# Remember when this run began (minute resolution) and echo it.
dt = f"{datetime.now():%Y-%m-%d %H:%M}"
print("starting at " + dt)

starting at 2020-03-25 16:02


In [12]:
# creating text_preprocessing with ekphrasis
import re
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Shared ekphrasis pipeline used to clean tweets before they are handed to fastText.
text_processor = TextPreProcessor(
    # terms that will be normalized (each term listed once)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],

    # no terms are annotated; candidates that were left disabled:
    # hashtag, allcaps, elongated, repeated, emphasis, censored
    annotate={},

    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries for replacing tokens extracted from the text
    # with other expressions. More than one dictionary can be passed.
    dicts=[emoticons]
)

# A handful of raw example tweets used to eyeball what the preprocessor does.
sentences = [
    'Can somebody tell me what morphine is for?',
    '1.2 mg .02 of my x 6 i feeeeeeeeeeel goooooooood like im on suboxone or methadone',
    'Oh hello crippling anxiety, let’s drive home from Pittsburgh shall we?! #ativan',
    'Pop a adderall we gone fuck all night',
    '#fakeexercisefacts a xanax a day keeps the trainer away',
]

# Show each raw sentence (with its type) followed by its preprocessed, space-joined tokens.
for sentence in sentences:
    print(type(sentence), sentence)
    processed_tokens = text_processor.pre_process_doc(sentence)
    print(" ".join(processed_tokens))

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
<class 'str'> Can somebody tell me what morphine is for?
can somebody tell me what morphine is for ?
<class 'str'> 1.2 mg .02 of my x 6 i feeeeeeeeeeel goooooooood like im on suboxone or methadone
<number> mg . <number> of my x <number> i feeeeeeeeeeel goooooooood like im on suboxone or methadone
<class 'str'> Oh hello crippling anxiety, let’s drive home from Pittsburgh shall we?! #ativan
oh hello crippling anxiety , let ’ s drive home from pittsburgh shall we ? ! ativan
<class 'str'> Pop a adderall we gone fuck all night
pop a adderall we gone fuck all night
<class 'str'> #fakeexercisefacts a xanax a day keeps the trainer away
fake exercise facts a xanax a day keeps the trainer away


In [13]:


def preprocess_fasttext(s, lower=True):
    """Preprocess one text with ``text_processor`` and return it as a single string.

    Args:
        s: the raw text to preprocess.
        lower: when true (the default), every token is lower-cased before joining.

    Returns:
        The tokens returned by ``text_processor.pre_process_doc(s)``, joined by
        single spaces.
    """
    pieces = text_processor.pre_process_doc(s)
    if not lower:
        return ' '.join(pieces)
    return ' '.join(piece.lower() for piece in pieces)

In [14]:
def create_fasttext_label(val):
    """Turn a class value into a fastText label token.

    Args:
        val: the class value; it is converted with ``str`` and stripped of
            surrounding whitespace.

    Returns:
        The string ``'__label__'`` followed by the cleaned class value.
    """
    return '__label__' + str(val).strip()

# NOTE: preprocess_fasttext is defined once, in the preceding cell. The identical
# second definition that used to follow here only shadowed it and was removed.

In [15]:

# CSV splits stored on GCS, and the file name of the trained fastText model.
train_pth, dev_pth = (
    "gs://sm4h-rxspace/task4/train.csv",
    "gs://sm4h-rxspace/task4/validation.csv",
)
model_pth = "fasttext_model_tweets.bin"

In [16]:
# Load the already-trained fastText model from the path stored in model_pth.
model = fasttext.load_model(model_pth)





In [21]:
# Single-letter class code -> spelled-out class name.
verbose_map = dict(
    a='ABUSE',
    m='MENTION',
    u='UNRELATED',
    c='CONSUMPTION',
)

def prediction_matrix(dev_pth, outpath, text_col='unprocessed_text', label_col='class', n_samples=None):
    """Score a labelled CSV with the global fastText ``model`` and save per-class scores.

    Each text is cleaned with ``preprocess_fasttext`` and passed to
    ``model.predict``. One output row is produced per input row, holding the
    tweet id, the preprocessed text, the true label, the top predicted label
    and one score column per class label returned by the model.

    Args:
        dev_pth: path of the input CSV; it must contain a ``tweetid`` column
            plus the columns named by ``text_col`` and ``label_col``.
        outpath: path of the CSV file the prediction matrix is written to.
        text_col: name of the column holding the raw text.
        label_col: name of the column holding the true class.
        n_samples: if given, only the first ``n_samples`` rows are scored;
            ``None`` (default) scores every row.

    Returns:
        The prediction DataFrame (score columns first, then ``text``,
        ``tweetid``, ``y_true`` and ``y_pred``). The file written to
        ``outpath`` holds the same data with the columns ordered as
        ``tweetid, text, y_true, y_pred`` followed by the class columns in
        sorted order.
    """
    df = pd.read_csv(dev_pth)
    if n_samples is not None:
        df = df.head(n_samples)
    print(df.head())

    # Report the number of rows actually kept, which can be smaller than the
    # requested n_samples when the file is short.
    print(f"read in {len(df)} samples from {dev_pth}")

    # fastText handles a single line at a time, so newlines/tabs are flattened
    # to spaces before the text is stringified and preprocessed.
    texts = (
        df[text_col]
        .replace(r'[\n\t]', ' ', regex=True)
        .map(str)
        .map(preprocess_fasttext)
    )

    preds_list = []
    class_labels = set()
    for tweetid, text, true_lab in zip(df['tweetid'], texts, df[label_col]):
        # k=-1 requests the score of every label the model knows, so the number
        # of classes is not hard-coded. Labels come back ordered by decreasing
        # probability, hence the first one is the top-1 prediction and no
        # second predict() call is needed.
        raw_labels, scores = model.predict(text, k=-1)
        pred_lbs = [lb.replace('__label__', '') for lb in raw_labels]
        class_labels.update(pred_lbs)

        predictions_dict = dict(zip(pred_lbs, scores))
        predictions_dict['text'] = text
        predictions_dict['tweetid'] = tweetid
        predictions_dict['y_true'] = true_lab
        # Guard against an empty prediction instead of raising IndexError.
        predictions_dict['y_pred'] = pred_lbs[0] if pred_lbs else None
        preds_list.append(predictions_dict)

    df_pred = pd.DataFrame(preds_list)
    print(df_pred.columns)

    # reindex (rather than plain column selection) keeps the export working
    # even when no rows were scored and the frame has no columns yet.
    out_cols = ['tweetid', 'text', 'y_true', 'y_pred'] + sorted(class_labels)
    df_pred.reindex(columns=out_cols).to_csv(outpath, index=False)
    return df_pred



In [22]:

# Score the validation split and write the result to 'prediction-matrix-fasttext.csv'.
df_pred = prediction_matrix(dev_pth=dev_pth, outpath='prediction-matrix-fasttext.csv')

    

   Unnamed: 0 class              tweetid  \
0        9293     m  1201409307167862784   
1        4651     m  1200007750383738885   
2        2275     c  1199244035006902272   
3        3740     m  1199782125609902084   
4        3759     m  1199783941764517889   

                                    unprocessed_text  
0         Can somebody tell me what morphine is for?  
1  1.2 mg .02 of my x 6 i feeeeeeeeeeel goooooooo...  
2  Oh hello crippling anxiety, let’s drive home f...  
3              Pop a adderall we gone fuck all night  
4  #fakeexercisefacts a xanax a day keeps the tra...  
read in 2635 samples from gs://sm4h-rxspace/task4/validation.csv
Index(['m', 'u', 'a', 'c', 'text', 'tweetid', 'y_true', 'y_pred'], dtype='object')


In [36]:
# Reload the saved prediction matrix from disk.
df_pred = pd.read_csv('prediction-matrix-fasttext.csv')

# Column-width display setting; presumably chosen so tweet texts are shown untruncated.
pd.set_option('display.max_colwidth', 0)

# Keep only the rows whose true label is 'a' (ABUSE) and display them.
abuse_subset_fasttext_model = df_pred.loc[df_pred['y_true'] == 'a']
abuse_subset_fasttext_model

Unnamed: 0,tweetid,text,y_true,y_pred,a,c,m,u
5,1198463912330645504,_u i lied about being claustrophobic for no reason and they gave me a clonazepam shot so it was awesome,a,c,0.007826,1.000010,0.000010,0.000010
11,1202352927446380544,anyone selling some morphine i need like <number> dozes,a,a,0.980886,0.000010,0.787941,0.000010
24,1200680073751408640,looking for some adderall . shoot me the link .,a,a,0.812877,0.000010,0.056662,0.000010
32,1199987793482653696,""" do not tell mikes dad i did drugs . he will lock me up . morphine yea hell really lock me up "" mikes dad who was a police officer",a,m,0.000010,0.067557,0.685959,0.001180
38,1199134018807181312,got my adderall fix and i feel great ! 😁 😄,a,c,0.012442,1.000010,0.000010,0.000010
...,...,...,...,...,...,...,...,...
2589,1201141807071072256,seroquel munchies are absolutely real i always thought it was a joke,a,m,0.000010,0.011697,0.808077,0.000921
2599,1200499901160968194,"how much xanax is too much ? i have gobbled down <number> percocets in a day . got a high tolerance for shit like this . i have heard someone say it ' s possible to smoke too much weed , but i never found a limit myself . good at drugs",a,a,0.981463,0.000010,0.021625,0.000010
2607,1199822537892519936,he took methadone by mistake and nearly died from it ? how bruh ? !,a,m,0.000010,0.000010,1.000010,0.000010
2619,1199047791454171138,"_u i ’ ve done oxy , hydrocodone , and codeine and can confirm this is accurate",a,c,0.000010,0.912446,0.020342,0.000010


In [39]:

# Abuse tweets that the model assigned to some other class ...
wrong_preds_abuse = abuse_subset_fasttext_model.loc[
    abuse_subset_fasttext_model['y_pred'].ne('a')
]
# ... and how those errors are distributed over the predicted classes.
wrong_preds_abuse['y_pred'].value_counts()

m    171
c    116
u    3  
Name: y_pred, dtype: int64

In [40]:
# Display the misclassified abuse tweets for manual inspection.
wrong_preds_abuse
        

Unnamed: 0,tweetid,text,y_true,y_pred,a,c,m,u
5,1198463912330645504,_u i lied about being claustrophobic for no reason and they gave me a clonazepam shot so it was awesome,a,c,0.007826,1.000010,0.000010,0.000010
32,1199987793482653696,""" do not tell mikes dad i did drugs . he will lock me up . morphine yea hell really lock me up "" mikes dad who was a police officer",a,m,0.000010,0.067557,0.685959,0.001180
38,1199134018807181312,got my adderall fix and i feel great ! 😁 😄,a,c,0.012442,1.000010,0.000010,0.000010
55,1202443453885489152,i was shunned into never doing coke so like what is morphine then i ’ m down,a,c,0.065615,0.348655,0.000010,0.048868
62,1201315490381082624,"_u mikael found himself dumbfounded and at loss for words . is that alright ? will she freak out ? i mean , she offered it herself . scratch that , i will just snort my line of morphine , per usual . mikael dragged his hand to hang up , but his fingers —",a,m,0.000010,0.000010,0.999579,0.006498
...,...,...,...,...,...,...,...,...
2551,1200742999656976386,can ’ t cope with danny off his head on morphine laid in bed watching 9 0 s music videos and,a,m,0.000010,0.000698,0.996527,0.001998
2589,1201141807071072256,seroquel munchies are absolutely real i always thought it was a joke,a,m,0.000010,0.011697,0.808077,0.000921
2607,1199822537892519936,he took methadone by mistake and nearly died from it ? how bruh ? !,a,m,0.000010,0.000010,1.000010,0.000010
2619,1199047791454171138,"_u i ’ ve done oxy , hydrocodone , and codeine and can confirm this is accurate",a,c,0.000010,0.912446,0.020342,0.000010
