In [1]:
import spacy
import pandas as pd
import pprint
from utilz import get_top_cat

dev_path = 'preds/prediction-matrix-fasttext.csv'

In [2]:
df = pd.read_csv(dev_path)
print(df.columns)

Index(['tweetid', 'text', 'y_true', 'y_pred', 'a', 'c', 'm', 'u'], dtype='object')


In [3]:
df.head()

Unnamed: 0,tweetid,text,y_true,y_pred,a,c,m,u
0,1201409307167862784,can somebody tell me what morphine is for ?,m,m,1e-05,1e-05,0.99495,1e-05
1,1200007750383738885,<number> mg . <number> of my x <number> i feee...,m,c,0.002193,0.085109,0.039649,1e-05
2,1199244035006902272,"oh hello crippling anxiety , let ’ s drive hom...",c,c,1e-05,0.977033,0.300756,0.001255
3,1199782125609902084,pop a adderall we gone fuck all night,m,a,0.999206,1e-05,0.132974,1e-05
4,1199783941764517889,fake exercise facts a xanax a day keeps the tr...,m,m,1e-05,0.939923,0.974053,1e-05


In [4]:
map2simple = {
    'ABUSE': 'a',
    'CONSUMPTION': 'c',
    'MENTION': 'm',
    'UNRELATED': 'u'
}

abbrev2fullname =  mapping={
    'a': 'ABUSE',
    'c': 'CONSUMPTION',
    'm': 'MENTION',
    'u': 'UNRELATED'
}

def swap_lab(val, mapping={
    'a': 'ABUSE',
    'c': 'CONSUMPTION',
    'm': 'MENTION',
    'u': 'UNRELATED'
}):
    
    return mapping.get(val)

def catify(doc, mapping):
    top_cat, _ = get_top_cat(doc)
    return mapping.get(top_cat)



In [5]:


def write_out_predictions_4_evalofficial(
    inputpath='preds/prediction-matrix-fasttext.csv',
    text_col='text',
    label_col = False,
    mapping = {
    'ABUSE': 'a',
    'CONSUMPTION': 'c',
    'MENTION': 'm',
    'UNRELATED': 'u'
},
    modelpath='saved-models/spacy-cnn',
    preds_eval_official_base = 'preds/spacy-cnn-eval-official',
    preds_matrix_path_base='preds/spacy-cnn-prediction-matrix',
    preds_matrx= True, n_samples=20):
    labels = list(mapping.keys())
    print(f'loading spacy model from {modelpath} to predict labels: {", ".join(labels)}')
    nlp = spacy.load(modelpath)
    df_in = pd.read_csv(inputpath)
    if not n_samples:
        n_samples = len(df_in)
    df_in = df_in.head(n_samples)
    outpath = f"{preds_eval_official_base}-{n_samples}.csv"
    outpathm = f"{preds_matrix_path_base}-{n_samples}.csv"
    if not label_col:
        print(f"input data:\n{df_in[['tweetid', text_col]].head(3)}")
        
    else:
        print(f"input data with predictions:\n{df_in[['tweetid', text_col, label_col]].head(3)}")

    
    tweetids = df_in['tweetid'].tolist()
    tweet_texts = df_in[text_col].tolist()
    
    docs = list(nlp.pipe(tweet_texts))
    print(f"predicting on {n_samples} samples...")
    print(f"gathering y_pred - the class with the the maximum score predicted by the model (how we get predicted class)")
    cats = [catify(doc, mapping=mapping) for doc in docs]
    tweet_preds = list(zip(tweetids, cats))
    
    df_out = pd.DataFrame(data=tweet_preds,
                          columns=['tweetid', 'Class'])
    
    print(f"\noutput data:\n{df_out.head(3)}")
    
    df_out.to_csv(outpath, index=False)
    print(f'\nwriting {n_samples} predictions to {outpath}')
    
    o = [{'tweetid': tweetid, 'y_pred': cat, 'text': text} for 
             tweetid, cat, text in list(zip(tweetids, cats, tweet_texts))]
    o2 = None
    if preds_matrx:
        if label_col:
            df_in[label_col] = df_in[label_col].map(str.strip)
            label_cols = df_in[label_col].tolist()
            pred_cats_dict_list2 = [{'tweetid': tweetidp[0], 'y_pred': tweetidp[1],
            'text': doc.text, 'y_true': gt} for tweetidp, doc, gt in list(zip(tweet_preds, docs, label_cols))]
            [pred_dic.update(doc.to_json()['cats']) for pred_dic, doc in list(
            zip(pred_cats_dict_list2, docs))]
    
            print(f'printing a group of predictions for labels + scores for {", ".join(labels)}')
    
            for i, dc in enumerate(pred_cats_dict_list2[:3]):
                pprint.pprint(dc)
            o2 = pred_cats_dict_list2
        
        else:
            pred_cats_dict_list = [{'tweetid': tweetidp[0],
                                    'y_pred': tweetidp[1],
            'text': doc.text,
            } for tweetidp, doc in list(zip(tweet_preds, docs))]
            [pred_dic.update(doc.to_json()['cats']) for pred_dic, doc in list(
                zip(pred_cats_dict_list, docs))]
            print(f'sample preds')
            for i, dc in enumerate(pred_cats_dict_list[:3]):
                pprint.pprint(dc)
            o2 = pred_cats_dict_list
            
 
        df_preds_matrix = pd.DataFrame(o2)
        df_preds_matrix[['tweetid',
 'text',
 'y_true',
 'y_pred',
 'ABUSE',
 'CONSUMPTION',
 'MENTION',
 'UNRELATED']].to_csv(outpathm, index=False)
    return o, o2
        
        
        
        
        
            
        
        
    
##df1, df2 = write_out_predictions_4_evalofficial(label_col='y_true')

In [6]:
all_off, all_matrix = write_out_predictions_4_evalofficial(n_samples=False, label_col='y_true')

loading spacy model from saved-models/spacy-cnn to predict labels: ABUSE, CONSUMPTION, MENTION, UNRELATED
input data with predictions:
               tweetid                                               text  \
0  1201409307167862784        can somebody tell me what morphine is for ?   
1  1200007750383738885  <number> mg . <number> of my x <number> i feee...   
2  1199244035006902272  oh hello crippling anxiety , let ’ s drive hom...   

  y_true  
0      m  
1      m  
2      c  
predicting on 2635 samples...
gathering y_pred - the class with the the maximum score predicted by the model (how we get predicted class)

output data:
               tweetid Class
0  1201409307167862784     m
1  1200007750383738885     m
2  1199244035006902272     c

writing 2635 predictions to preds/spacy-cnn-eval-official-2635.csv
printing a group of predictions for labels + scores for ABUSE, CONSUMPTION, MENTION, UNRELATED
{'ABUSE': 0.12959064543247223,
 'CONSUMPTION': 0.058977238833904266,
 'MENTION': 

In [7]:
matrix_preds_df = pd.DataFrame(all_matrix)
print(matrix_preds_df.columns)
matrix_preds_df

Index(['tweetid', 'y_pred', 'text', 'y_true', 'ABUSE', 'CONSUMPTION',
       'MENTION', 'UNRELATED'],
      dtype='object')


Unnamed: 0,tweetid,y_pred,text,y_true,ABUSE,CONSUMPTION,MENTION,UNRELATED
0,1201409307167862784,m,can somebody tell me what morphine is for ?,m,0.129591,0.058977,0.811110,0.000322
1,1200007750383738885,m,<number> mg . <number> of my x <number> i feee...,m,0.112575,0.111845,0.774122,0.001458
2,1199244035006902272,c,"oh hello crippling anxiety , let ’ s drive hom...",c,0.007263,0.633963,0.349767,0.009007
3,1199782125609902084,m,pop a adderall we gone fuck all night,m,0.091566,0.169169,0.736982,0.002283
4,1199783941764517889,c,fake exercise facts a xanax a day keeps the tr...,m,0.333943,0.338942,0.318412,0.008703
...,...,...,...,...,...,...,...,...
2630,1200980670782300160,m,_u that was not annie hall or diane keaton tha...,m,0.312474,0.139349,0.509596,0.038582
2631,1199509721868374022,c,_u suboxone for opiate dependent individuals d...,m,0.020803,0.749869,0.224500,0.004828
2632,1198691681119490050,m,small brain : love lil pump med brain : xanax ...,m,0.272467,0.338632,0.366688,0.022213
2633,1200884551108714497,c,_u do they have a physician ? many will give f...,m,0.105011,0.457535,0.428604,0.008850


In [8]:
# in this case all2 is none --> foor writing out just eval4official
# all1, all2 = write_out_predictions_4_evalofficial(n_samples=False, preds_matrx=False)
# all2

In [9]:
import sklearn.metrics as sklm

y_true = matrix_preds_df['y_true']
y_pred = matrix_preds_df['y_pred']
print(f"{sklm.classification_report(y_true=y_true, y_pred=y_pred)}")

              precision    recall  f1-score   support

           a       0.20      0.08      0.12       448
           c       0.24      0.20      0.22       730
           m       0.50      0.68      0.58      1353
           u       0.07      0.01      0.02       104

    accuracy                           0.42      2635
   macro avg       0.25      0.24      0.23      2635
weighted avg       0.36      0.42      0.38      2635



In [10]:
col_order_class_abbrv = ['tweetid', 'text', 'y_true', 'y_pred', 'a', 'c', 'm', 'u']
[abbrev2fullname.get(col, col) for col in col_order_class_abbrv]

['tweetid',
 'text',
 'y_true',
 'y_pred',
 'ABUSE',
 'CONSUMPTION',
 'MENTION',
 'UNRELATED']

In [11]:
import glob
dataorig_files = glob.glob('data-orig/*.csv')
dataorig_files

['data-orig/validation.csv', 'data-orig/train.csv']

In [12]:
full_df = pd.concat([pd.read_csv(f) for f in dataorig_files])


In [13]:
import numpy as np

num_rows,_ = full_df.shape
num_rows


13172

In [14]:
label_col = 'class'
text_col = 'unprocessed_text'
meta_cols = ['tweetid',]
full_df['y_true'] = full_df[label_col].map(str.strip)
one_hot = ['y_true',]

for col in one_hot:
    new_data = full_df.loc[:,col]
    
new_data = pd.get_dummies(new_data)

new_data['metadata'] = full_df.loc[:, 'tweetid']
new_data['text'] = full_df.loc[:, text_col]
new_data


Unnamed: 0,a,c,m,u,metadata,text
0,0,0,1,0,1201409307167862784,Can somebody tell me what morphine is for?
1,0,0,1,0,1200007750383738885,1.2 mg .02 of my x 6 i feeeeeeeeeeel goooooooo...
2,0,1,0,0,1199244035006902272,"Oh hello crippling anxiety, let’s drive home f..."
3,0,0,1,0,1199782125609902084,Pop a adderall we gone fuck all night
4,0,0,1,0,1199783941764517889,#fakeexercisefacts a xanax a day keeps the tra...
...,...,...,...,...,...,...
10532,1,0,0,0,1201785605576937472,"No OK for real I actually deeply, deeply regre..."
10533,0,0,1,0,1201906257344512001,_U _U _U _U ugh a consulting attending refused...
10534,0,1,0,0,1200352805925670917,i took adderall and no longer want to kms so w...
10535,0,0,1,0,1201881741499215877,Adderall but for Emotions


In [15]:
texts = full_df.sample(10)[text_col].tolist()
web = spacy.load("en_core_web_lg")

nlp = spacy.load("spacy-cnn-twitter-glove")
#nlp.add_pipe()

#docs = list(nlp.pipe(texts))



In [16]:
textcat = nlp.get_pipe("textcat")
web.add_pipe(textcat)

In [17]:
print(web.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x12a80d210>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x190031980>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x190031830>), ('textcat', <spacy.pipeline.pipes.TextCategorizer object at 0x150e01990>)]


In [18]:
docs= list(web.pipe(texts))
for i, doc in enumerate(docs):
    print(" ".join(token.text for token in doc))
    print("\n")
    for token in doc:

        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
    break
   # print(doc.to_json())
    
    
    

oxycontin is a great drug if you wanna be sad and constipated


oxycontin oxycontin PROPN NNP nsubj xxxx True False
is be AUX VBZ ROOT xx True True
a a DET DT det x True True
great great ADJ JJ amod xxxx True False
drug drug NOUN NN attr xxxx True False
if if SCONJ IN mark xx True True
you -PRON- PRON PRP nsubj xxx True True
wanna wanna VERB VBP advcl xxxx True False
be be AUX VB xcomp xx True True
sad sad ADJ JJ acomp xxx True False
and and CCONJ CC cc xxx True True
constipated constipated ADJ JJ conj xxxx True False


In [25]:
new_data['label'] = full_df.loc[:,'class']
new_data['label'] = new_data['label'].map(str.strip)

In [31]:
100 - 15
85 - 65
new_data[['label', 'unprocessed_text', 'tweetid']]

20

In [34]:
np.random.seed(13)
trn_rows = np.sort(np.random.choice(num_rows, size = int(num_rows * .85), replace = False))

#Select Validation rows
val_rows = np.setdiff1d(np.arange(num_rows), trn_rows)

#Split dataset


#Select Training rows
np.random.seed(0)
trn_rows = np.sort(np.random.choice(num_rows, size = int(num_rows * .7), replace = False))

#Select Validation rows
val_rows = np.setdiff1d(np.arange(num_rows), trn_rows)
new_data[trn_rows,:]

TypeError: '(array([    1,     4,     5, ..., 13168, 13170, 13171]), slice(None, None, None))' is an invalid key

In [None]:
#Split dataset
trn_data, val_data = data[trn_rows,1:], data[val_rows,1:]
trn_Y, val_Y = data[trn_rows,0], data[val_rows,0]