In [1]:
! pip install gcsfs
! pip install jsonlines
! pip install pandas
! pip install ekphrasis



In [2]:
from datetime import datetime
import gcsfs

# Set up the file system to be able to read from buckets.
fs = gcsfs.GCSFileSystem(project='sm4h-rxspace')

# Record and show when this run started (minute resolution).
now = datetime.now()
dt = f'{now:%Y-%m-%d %H:%M}'
print(f'start time:\n{dt}')

start time:
2020-03-25 08:06


In [3]:
# import packages
import jsonlines
import pandas as pd

# NOTE(review): 0 is presumably meant to disable truncation of long text
# cells in rendered frames - confirm against the installed pandas version.
pd.options.display.max_colwidth = 0



In [4]:
def get_distribution(df, col='class'):
    """Summarise how often each value of ``col`` occurs in ``df``.

    Prints the number of rows in ``df`` and returns a data frame indexed by
    the distinct values of ``col`` with two columns: ``class counts`` (number
    of rows per value) and ``class %`` (share of all rows, in percent, rounded
    to two decimals).
    """
    counts = df[col].value_counts()
    total_rows = len(df)
    print(f"loaded {total_rows} samples\n")

    summary = counts.to_frame()
    summary.columns = ['class counts']
    # Percentages are relative to the total row count of the frame.
    summary['class %'] = (100 * summary['class counts'] / total_rows).round(2)
    return summary


In [5]:
# Bucket locations of the train and validation (dev) CSV files.
train_path = "gs://sm4h-rxspace/task4/train.csv"
dev_path = "gs://sm4h-rxspace/task4/validation.csv"
print(f'train path : {train_path}', f'dev path : {dev_path}', sep='\n')

train path : gs://sm4h-rxspace/task4/train.csv
dev path : gs://sm4h-rxspace/task4/validation.csv


In [7]:
# Read the training split and strip surrounding whitespace from the labels.
df_train_raw = pd.read_csv(train_path)
df_train_raw['class'] = df_train_raw['class'].apply(str.strip)

print(f'loaded train from {train_path}')
# Last expression of the cell: renders the label distribution table.
get_distribution(df_train_raw)

loaded train from gs://sm4h-rxspace/task4/train.csv
loaded 10537 samples



Unnamed: 0,class counts,class %
m,5488,52.08
c,2940,27.9
a,1685,15.99
u,424,4.02


In [8]:
# Read the validation (dev) split and strip surrounding whitespace from the labels.
df_val_raw = pd.read_csv(dev_path)
df_val_raw['class'] = df_val_raw['class'].apply(str.strip)

print(f'loaded dev from {dev_path}..')
# Last expression of the cell: renders the label distribution table.
get_distribution(df_val_raw)

loaded dev from gs://sm4h-rxspace/task4/validation.csv..
loaded 2635 samples



Unnamed: 0,class counts,class %
m,1353,51.35
c,730,27.7
a,448,17.0
u,104,3.95


In [10]:
# Preview the first rows, restricted to the id, raw text and label columns.
df_train_raw[['tweetid', 'unprocessed_text', 'class']].head()

Unnamed: 0,tweetid,unprocessed_text,class
0,1202189293432823810,"_U _U i even see a lot of readmits on those. risperdal consta, abilify maintena, haldol lai, all of them.",m
1,1200504615760023552,_U valium o clock,m
2,1201776570492489728,Stop Xanax 😂😂😂😂,m
3,1200528076159029248,_U tbh it’s the valium i’m on rn prob,c
4,1201420901633400832,_U i got mine pulled out about 6 years ago and the doctor prescribed me oxycodone but i never had pain. i just got high lol,a


In [12]:
# loading text preprocessing
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Shared ekphrasis pipeline used to normalise and tokenise the tweet text.
text_processor = TextPreProcessor(
    # Terms that will be normalized (replaced by a placeholder tag such as
    # <number>). 'url' is listed twice, as in the original configuration.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # Terms that will be annotated. Annotation is switched off here; the
    # previously used option set was: "hashtag", "allcaps", "elongated",
    # "repeated", 'emphasis', 'censored'.
    annotate={},
    # Fix HTML tokens.
    fix_html=True,

    # Corpus whose word statistics are used for word segmentation.
    segmenter="twitter",

    # Corpus whose word statistics are used for spell correction.
    corrector="twitter",

    # Perform word segmentation on hashtags.
    unpack_hashtags=True,
    # Unpack contractions (can't -> can not).
    unpack_contractions=True,
    # Spell correction for elongated words (disabled).
    spell_correct_elong=False,

    # Tokenizer: any callable taking a string and returning a list of tokens.
    # SocialTokenizer is used here with lower-casing enabled.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # Dictionaries used to replace tokens extracted from the text with other
    # expressions; more than one dictionary can be passed.
    dicts=[emoticons]
)

# A small sample of raw training tweets for eyeballing the pre-processing.
sentences = df_train_raw['unprocessed_text'].head(11).tolist()
sentences

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


['_U _U i even see a lot of readmits on those. risperdal consta, abilify maintena, haldol lai, all of them.',
 '_U valium o clock',
 'Stop Xanax 😂😂😂😂',
 '_U tbh it’s the valium i’m on rn prob',
 '_U i got mine pulled out about 6 years ago and the doctor prescribed me oxycodone but i never had pain. i just got high lol',
 "Today is my 1 year vegan anniversary, also over a year since I've used a microwave and 1 year being clean of Vyvanse after struggling being on it for 8 years🤘",
 'Hurricane #Irma upgraded back to a Category-5 storm; maximum sustained winds 160 mph.   Can someone get this chick a xanax or soemthin????',
 "_U I'm on 100mg of Pristiq as well as I'm on Tramadol which boosts serotonin levels, I'm a fucking monster mate",
 'last timethis haopened i spent two months taking morphine and spending50% of mydays bedbound so uh. hoping its just a onw off and not thatagain tbh!',
 '_U _U "innovation isn\'t as likely" is a massive understatement btw. The only popular consumer produc

In [13]:
# Print each sample tweet (with its Python type) followed by its processed form.
for s in sentences:
    print(type(s), s)
    processed_tokens = text_processor.pre_process_doc(s)
    print(" ".join(processed_tokens))

<class 'str'> _U _U i even see a lot of readmits on those. risperdal consta, abilify maintena, haldol lai, all of them.
_u _u i even see a lot of readmits on those . risperdal consta , abilify maintena , haldol lai , all of them .
<class 'str'> _U valium o clock
_u valium o clock
<class 'str'> Stop Xanax 😂😂😂😂
stop xanax 😂 😂 😂 😂
<class 'str'> _U tbh it’s the valium i’m on rn prob
_u tbh it ’ s the valium i ’ m on rn prob
<class 'str'> _U i got mine pulled out about 6 years ago and the doctor prescribed me oxycodone but i never had pain. i just got high lol
_u i got mine pulled out about <number> years ago and the doctor prescribed me oxycodone but i never had pain . i just got high lol
<class 'str'> Today is my 1 year vegan anniversary, also over a year since I've used a microwave and 1 year being clean of Vyvanse after struggling being on it for 8 years🤘
today is my <number> year vegan anniversary , also over a year since i have used a microwave and <number> year being clean of vyvanse

In [14]:


def preprocess_tweet_text(s):
    """Run ``s`` through the ekphrasis ``text_processor`` and re-join the tokens.

    Returns the tokens produced by ``text_processor.pre_process_doc`` joined
    with single spaces.
    """
    tokens = text_processor.pre_process_doc(s)
    return " ".join(tokens)

In [15]:
    
def write_df(df, out_path, text_col='text', label_col='class', metadata=None):
    """Write a data frame to a JSON-lines file, one record per row.

    Every record has three keys:

    * ``text``     - the ``text_col`` value passed through
      ``preprocess_tweet_text``;
    * ``label``    - the ``label_col`` value with surrounding whitespace
      stripped;
    * ``metadata`` - the row's value in the ``metadata`` column, or an empty
      string when no metadata column is given.

    Args:
        df: pandas DataFrame containing ``text_col``, ``label_col`` and, if
            given, the ``metadata`` column.
        out_path: path of the file to write (opened in ``'w'`` mode).
        text_col: name of the column holding the raw text.
        label_col: name of the column holding the (string) label.
        metadata: optional name of a column whose per-row value is stored
            under the ``metadata`` key (e.g. ``'tweetid'``).

    Prints the number of records written once the file is closed.
    """
    cnt = 0
    with jsonlines.open(out_path, 'w') as writer:
        for _, row in df.iterrows():
            if metadata is None:
                metadata_res = ''
            else:
                metadata_res = row[metadata]
                # numpy scalars (e.g. int64 ids from an all-numeric frame) are
                # not JSON-serializable; unwrap them to plain Python values.
                if hasattr(metadata_res, 'item'):
                    metadata_res = metadata_res.item()

            text = preprocess_tweet_text(row[text_col])
            # Strip white space around the label.
            label = row[label_col].strip()

            writer.write({
                'text': text,
                'label': label,
                # Store the row's value, not the name of the metadata column.
                'metadata': metadata_res,
            })
            cnt += 1
    print(f"wrote {cnt} lines to {out_path}")
    


    

In [17]:
# Export the training split to train.jsonl; `tweetid` is passed as the metadata column.
write_df(
    df_train_raw,
    out_path='train.jsonl',
    text_col='unprocessed_text',
    label_col='class',
    metadata='tweetid',
)


wrote 10537 lines to train.jsonl


In [18]:
# Export the validation split to validation.jsonl; `tweetid` is passed as the metadata column.
write_df(
    df_val_raw,
    out_path='validation.jsonl',
    text_col='unprocessed_text',
    label_col='class',
    metadata='tweetid',
)


wrote 2635 lines to validation.jsonl
