### text preprocessing foor fastText
* fastText for multiclass input data looks like a flat file delimited with ```__label__ + label_class + <\space> + preprocessed_text```

In [9]:
! pip install fasttext
! pip install pandas
! pip install gcsfs




In [10]:
# importing packgs and creating filespace
import gcsfs
import fasttext
import pandas as pd
import string

fs = gcsfs.GCSFileSystem(project='sm4h-rxspace')

In [11]:
from datetime import datetime

dt = datetime.now().strftime('%Y-%m-%d %H:%M')
print(f"starting at {dt}")

starting at 2020-03-25 09:20


In [12]:
# creating text_preprocessing with ekphrasis
import re
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={},
#     annotate={"hashtag", "allcaps", "elongated", "repeated",
#         'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

sentences = [
    "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
    "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
    "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/."
]

for s in sentences:
    print(type(s), s)
    print(" ".join(text_processor.pre_process_doc(s)))

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
<class 'str'> CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))
cant wait for the new season of twin peaks ＼(^o^)／ ! ! ! david lynch tv series <happy>
<class 'str'> I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/
i saw the new john doe movie and it suuuuucks ! ! ! waisted <money> . . . bad movies <annoyed>
<class 'str'> @SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/.
<user> : can not wait for the <date> sentiment talks ! yaaaaaay ! ! ! <laugh> <url>


In [53]:
def create_fasttext_label(val):
    val = str(val)
    val = val.strip()
    return '__label__' + val

def preprocess_fasttext(s, lower=True):
    tokens = text_processor.pre_process_doc(s)
    if lower:
        return ' '.join([t.lower() for t in tokens])

    return ' '.join(tokens)

In [63]:
import csv

def main(inpath, outpath, text_col='unprocessed_text', label_col='class'):
    
    df = pd.read_csv(inpath)
    n = len(df)
    print(f"read in {n} samples from {inpath}")
    
    df['label'] = df[label_col].map(create_fasttext_label)
    df['text'] = df[text_col].replace('\n', ' ', regex=True).replace('\t', ' ', regex=True)
    df['text'] = df['text'].map(str)
    df['text'] = df['text'].map(preprocess_fasttext)
    fasttext_df = df[['label', 'text']]
    fasttext_df.to_csv(f"{outpath}", index=False, sep=' ',
                       header=False, quoting=csv.QUOTE_NONE,
                      quotechar="", escapechar=" ")
    print(f"wrote out fasttext prepared text to {outpath}")
    
    
    
    

In [64]:

train_pth = "gs://sm4h-rxspace/task4/train.csv"
dev_pth = "gs://sm4h-rxspace/task4/validation.csv"



In [66]:
main(inpath=train_pth, outpath="fastText-0.9.1/data/tweets-fasttext.train")


read in 10537 samples from gs://sm4h-rxspace/task4/train.csv
wrote out fasttext prepared text to fastText-0.9.1/data/tweets-fasttext.train


In [67]:
main(inpath=dev_pth, outpath="fastText-0.9.1/data/tweets-fasttext.dev")

read in 2635 samples from gs://sm4h-rxspace/task4/validation.csv
wrote out fasttext prepared text to fastText-0.9.1/data/tweets-fasttext.dev


In [69]:

model = fasttext.train_supervised(input='fastText-0.9.1/data/tweets-fasttext.train',
                                  lr=0.5, epoch=25,
                                  wordNgrams=2,
                                  bucket=200000,
                                  dim=100,
                                  loss='ova')

In [71]:
model.save_model('fasttext_model_tweets.bin')

In [96]:

verbose_map = {
    'a': 'ABUSE',
    'm': 'MENTION',
    'u': 'UNRELATED',
    'c': 'CONSUMPTION'
              }

def predict_twitter(inpath, outpath, text_col='unprocessed_text', label_col='class', n_samples=10):
    
    df = pd.read_csv(inpath)
    if n_samples is None:
        n_samples = len(df)
        
    df = df.sample(n_samples)

    print(f"read in {n_samples} samples from {inpath}")
    
    df['label'] = df[label_col].map(create_fasttext_label)
    df['text'] = df[text_col].replace('\n', ' ', regex=True).replace('\t', ' ', regex=True)
    df['text'] = df['text'].map(str)
    df['text'] = df['text'].map(preprocess_fasttext)
    preds_list = []
    for i, row in df.iterrows():
        tweetid = row['tweetid']
        text = row['text']
        print(text)
        pred_lb, score = model.predict(text)
        pred = pred_lb[0].replace('__label__', '')
        print(f'pred class: {verbose_map.get(pred)}\npred score {round(score[0], 4)}')
        true_label = row[label_col]
        print(f'true class: {verbose_map.get(true_label)}')
        preds_list.append({'tweetid': tweetid,
                          'Class': pred})
        
    fasttext_df = pd.DataFrame(preds_list)
    fasttext_df.to_csv(f"{outpath}", index=False, quoting=csv.QUOTE_NONE,
                      quotechar="", escapechar=" ")
    print(f"wrote out fasttext prepared text to {outpath}")
        

    
    

In [97]:
predict_twitter(inpath=train_pth, outpath='preds-validation-fasttext-twitter-model-samples.csv', n_samples=20)

read in 20 samples from gs://sm4h-rxspace/task4/train.csv
<time> and i am already on tramadol 😞 🔫
pred class: CONSUMPTION
pred score 0.9996
true class: CONSUMPTION
day <number> of being sober from addy and xanax and feelin good 😇 gonna keep this up idc how difficult it gets
pred class: ABUSE
pred score 0.9962
true class: ABUSE
if it ’ s not too late can someone give xan to a couple of xanax so it can calm down a bit before tomorrow ?
pred class: MENTION
pred score 0.9922
true class: MENTION
_u oh c ' mon , that ' s knudepunkt but on valium and with business cards . . .
pred class: MENTION
pred score 1.0
true class: MENTION
life of a college kid : " i am about to take an adderall and drink a beer with it . " - _u
pred class: MENTION
pred score 0.9935
true class: MENTION
i changed for each one of my adderall high where i feel like mario after he ' s found a star .
pred class: ABUSE
pred score 1.0
true class: ABUSE
so done with this adderall 😒
pred class: CONSUMPTION
pred score 1.0
true c