In [1]:
%matplotlib inline
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import string
import sklearn

from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS

import gensim
from gensim.models import word2vec

import spacy as sp
nlp = sp.load('en')

import nltk
nltk.download('vader_lexicon')
sentiment_analyzer = VS()

nltk.download("stopwords")
from nltk.corpus import stopwords
enStop = stopwords.words('english')
enStop_dict={e: 0 for e in enStop}

from nltk.stem.porter import *
stemmer = PorterStemmer()

# load oov dictionary for word correction
oov_dict={}
with open("OOV_Dictionary_V1.0.txt","r",encoding="utf-8") as f:
    for line in f:
        oov_dict[line.split()[0]]=line.strip().split()[1]

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/cwang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /home/cwang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Load Model (Muhammad, 2016)

In [6]:
# The pre-trained model can be downloaed from: https://crisisnlp.qcri.org/lrec2016/lrec2016.html
%time model = gensim.models.KeyedVectors.load_word2vec_format('crisisNLP_word_vector.bin', binary=True)

CPU times: user 57.2 s, sys: 3.79 s, total: 1min 1s
Wall time: 1min 1s


In [7]:
# """
# "Request-GoodsServices",
# "Request-SearchAndRescue",
# "CallToAction-MovePeople",
# "Report-EmergingThreats", 
# "Report-NewSubEvent", 
# "Report-ServiceAvailable"
# """
def load_actionable_verbs(important_list={'GoodsServices':0,'SearchAndRescue':0,'MovePeople':0,'EmergingThreats':0,'NewSubEvent':0,'ServiceAvailable':0}): 
    actionnable_verb_list_dict={}
    df_actionable=pd.read_csv("actionable_tweets.csv",sep="\t")
    for index,row in df_actionable.iterrows():
        tweet_text=row['text']
        category=row['categories']
        if category in important_list:
            if category not in actionnable_verb_list_dict:
                actionnable_verb_list_dict[category]=[]
            doc=nlp(tweet_text)
            verbs=[token.text for token in doc if token.pos_ == "VERB"]
            for v in verbs:
                if len(v)>=3 and v not in enStop_dict:
                    actionnable_verb_list_dict[category].append(v)
    return {e:sorted(nltk.FreqDist(actionnable_verb_list_dict[e]).items(), key=lambda kv: kv[1],reverse=True) for e in actionnable_verb_list_dict}

In [8]:
%time sorted_actionnable_verb_dict=load_actionable_verbs()

CPU times: user 17.8 s, sys: 412 ms, total: 18.2 s
Wall time: 18.2 s


In [9]:
sorted_actionnable_verb_dict2dict_top10={}
for event in sorted_actionnable_verb_dict:
    sorted_actionnable_verb_dict2dict_top10[event]={}
    for tup in sorted_actionnable_verb_dict[event][:10]:
        sorted_actionnable_verb_dict2dict_top10[event][stemmer.stem(tup[0])]=tup[1]
sorted_actionnable_verb_dict2dict_top10.keys()

dict_keys(['MovePeople', 'SearchAndRescue', 'EmergingThreats', 'ServiceAvailable', 'GoodsServices'])

In [10]:
print(sorted_actionnable_verb_dict['MovePeople'][:10])
print(sorted_actionnable_verb_dict['EmergingThreats'][:10])
sorted_actionnable_verb_dict['SearchAndRescue'][:10]

[('leave', 4), ('coming', 4), ('affected', 3), ('evacuated', 3), ('evacuating', 3), ('evacuate', 3), ('flash', 2), ('take', 2), ('highparkfire', 2), ('use', 2)]
[('hits', 22), ('bigwet', 20), ('reported', 18), ('BREAKING', 14), ('killed', 13), ('says', 13), ('nswfires', 12), ('strikes', 11), ('injured', 11), ('killing', 11)]


[('missing', 37),
 ('help', 17),
 ('needs', 14),
 ('trapped', 11),
 ('stranded', 9),
 ('NEED', 7),
 ('stuck', 6),
 ('send', 6),
 ('need', 5),
 ('Missing', 4)]

In [11]:
stopList = ["http", "https", "rt", "@", ":", "t.co", "co", "amp", "&amp;", "...", "\n", "\r"]
stopList.extend(string.punctuation)

local_tokenizer = TweetTokenizer()
def tokenizer_wrapper(text):
    return local_tokenizer.tokenize(text)

#define vectorizer using sklearn
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    tokenizer=tokenizer_wrapper,
    ngram_range=(1, 1),
    stop_words=stopList, #We do better when we keep stopwords
    use_idf=True,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=10000,
    min_df=4,
    max_df=0.501)

analyzer = vectorizer.build_analyzer()

def normalize(s):
        """
        Given a text, cleans and normalizes it. Feel free to add your own stuff.
        From: https://www.kaggle.com/mschumacher/using-fasttext-models-for-robust-embeddings
        """
        s = s.lower()
        # Replace numbers and symbols with language
        s = s.replace('&', ' and ')
        s = s.replace('@', ' at ')
        s = s.replace('0', 'zero')
        s = s.replace('1', 'one')
        s = s.replace('2', 'two')
        s = s.replace('3', 'three')
        s = s.replace('4', 'four')
        s = s.replace('5', 'five')
        s = s.replace('6', 'six')
        s = s.replace('7', 'seven')
        s = s.replace('8', 'eight')
        s = s.replace('9', 'nine')
        return s

def tokenize_tweet(string):
    string=string.lower()
    # Clean and Refine (remove URL, special characters)
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    string = re.sub(giant_url_regex, "", string)
    
    string = re.sub(r"\'s", "s", string)
    string = re.sub(r"\'ve", "ve", string)
    string = re.sub(r"\'t", "t", string)
    string = re.sub(r"\'re", "re", string)
    string = re.sub(r"\'d", "d", string)
    string = re.sub(r"\'ll", "ll", string)
    
    #Skip retweet signs, @ symbols, and special chars such as punctuations
    string = re.sub(r"[^A-Za-z]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    tokens=[]
    for w in string.split():
        # OOV Correction
        if w in oov_dict:
            w=oov_dict[w]
        # Short words skip and Stop word removal
        if len(w) >= 3 and w not in enStop_dict:
            tokens.append(w)
    return tokens

def vectorize(sentence):
    tokenized = tokenize_tweet(sentence)
    wvs = []
    for t in tokenized:
        if t in model:
            v = model[t]
            norm = np.linalg.norm(v)
            normed_v = v / norm
            wvs.append(normed_v) 
    m = np.array(wvs)
    normed_m = np.mean(m, axis=0)
    return normed_m

# construct X matrix
def to_matrix(raw_tweet_texts):
    X_matrix = np.zeros((len(raw_tweet_texts), 300))
    for index,s in enumerate(raw_tweet_texts):
        sv = vectorize(s)
        if not np.isnan(sv).any():
            X_matrix[index,:]=sv
        else:
            print(index,s)
    return X_matrix

# Data preperation

## Load Dataset without meta-info

In [21]:
# in order to run the cell, download three datasets from trecis2018-train,trecis2018-test, trescis2019a-test respectively.
# covert them to tsv file with format as shown in the output
data_dir="raw/"
trecis2018train=pd.read_csv(data_dir+"trecis2018-train.tsv",sep="\t")
trecis2018train['timestamp']='null'
idx=trecis2018train[trecis2018train['priority'].str.contains("Unknown")].index
trecis2018train.set_value(idx, 'priority', "Low")
trecis2018train['source']='2018train'
trecis2018test=pd.read_csv(data_dir+"trecis2018-test.tsv",sep="\t")
trecis2018test['source']='2018test'
trescis2019atest=pd.read_csv(data_dir+"trescis2019a-test.tsv",sep="\t")
trescis2019atest['source']='2019a'
print("Dataset size:\n trecis2018train\ttrecis2018test\ttrescis2019atest\n {}\t{}\t{}".format(len(trecis2018train),len(trecis2018test),len(trescis2019atest)))
print("------------")
print("Total:\n {}".format(len(trecis2018train)+len(trecis2018test)+len(trescis2019atest)))
trecis2018test.head()

Dataset size:
 trecis2018train	trecis2018test	trescis2019atest
 1337	17653	7098
------------
Total:
 26088


  """


Unnamed: 0,event_id,post_id,text,categories,priority,timestamp,source
0,albertaFloods2013,351734622884855808,#yycflood be sure when you are ready for re bu...,"ThirdPartyObservation,Advice",Low,11 Sep 2018 19:36:19 GMT,2018test
1,albertaFloods2013,351741648327294977,Happy Canada Day everyone! Let's show national...,"FirstPartyObservation,Advice",Low,11 Sep 2018 19:36:32 GMT,2018test
2,albertaFloods2013,351776825967513600,Unloading a truck filled with clean-up kits in...,"ThirdPartyObservation,ContinuingNews,Sentiment",Low,11 Sep 2018 19:36:49 GMT,2018test
3,albertaFloods2013,352043285876977664,Unofficial “anthem” of #Alberta floods by #Can...,"FirstPartyObservation,Advice",Low,11 Sep 2018 19:37:12 GMT,2018test
4,albertaFloods2013,352054501466849280,Get the Alberta Strong (Flood Montage) as hear...,"ThirdPartyObservation,Advice",Low,11 Sep 2018 19:37:23 GMT,2018test


In [23]:
# some changes of information types happened with the evolution of TREC-IS, so this is used for reformating information types
trecis2018train["categories"]=trecis2018train['categories'].apply(lambda x: x.replace("PastNews","ContextualInformation") if "PastNews" in x else x)
trecis2018train["categories"]=trecis2018train['categories'].apply(lambda x: x.replace("ContinuingNews","News") if "ContinuingNews" in x else x)
trecis2018train["categories"]=trecis2018train['categories'].apply(lambda x: x.replace("KnownAlready","OriginalEvent") if "KnownAlready" in x else x)
trecis2018train["categories"]=trecis2018train['categories'].apply(lambda x: x.replace("SignificantEventChange","NewSubEvent") if "SignificantEventChange" in x else x)
trecis2018train["categories"]=trecis2018train['categories'].apply(lambda x: x.replace("Unknown","Irrelevant") if "Unknown" in x else x)

trecis2018test["categories"]=trecis2018test['categories'].apply(lambda x: x.replace("PastNews","ContextualInformation") if "PastNews" in x else x)
trecis2018test["categories"]=trecis2018test['categories'].apply(lambda x: x.replace("ContinuingNews","News") if "ContinuingNews" in x else x)
trecis2018test["categories"]=trecis2018test['categories'].apply(lambda x: x.replace("KnownAlready","OriginalEvent") if "KnownAlready" in x else x)
trecis2018test["categories"]=trecis2018test['categories'].apply(lambda x: x.replace("SignificantEventChange","NewSubEvent") if "SignificantEventChange" in x else x)
trecis2018test["categories"]=trecis2018test['categories'].apply(lambda x: x.replace("Unknown","Irrelevant") if "Unknown" in x else x)

df_combined=pd.concat([trecis2018train,trecis2018test,trescis2019atest])
df_combined=df_combined.reset_index(drop=True)
df_combined["categories"]=df_combined['categories'].apply(lambda x: x.split(","))
print(df_combined.shape)
df_combined.tail()

(26088, 7)


Unnamed: 0,event_id,post_id,text,categories,priority,timestamp,source
26083,fireYMM2016D,727632069682044928,@JHauk84 @stephen_taylor Our prayers are with ...,"[Hashtags, Sentiment, Discussion]",Low,8 Apr 2019 16:50:15 GMT,2019a
26084,fireYMM2016D,727636108729720832,To think the lead headline at noon on @GlobalC...,"[Hashtags, Sentiment]",Low,8 Apr 2019 18:39:43 GMT,2019a
26085,fireYMM2016D,727635394670383105,"Just found out my friends in #ymmfire is safe,...","[Hashtags, Sentiment]",Low,8 Apr 2019 18:05:13 GMT,2019a
26086,fireYMM2016D,727629223616249858,RT @BreannaCTV: A lot of people are hanging ar...,"[MultimediaShare, Hashtags, News]",Low,5 Apr 2019 17:15:39 GMT,2019a
26087,fireYMM2016D,727629552009220097,RT @puravida_lisa: Terrifying - get out safe e...,"[MultimediaShare, Hashtags, OriginalEvent, Sen...",Low,5 Apr 2019 18:57:24 GMT,2019a


In [16]:
print(getTweetTextsbyIT('MovePeople')[1])
" ".join(tokenize_tweet(getTweetTextsbyIT('MovePeople')[1]))

Typhoon #RubyPH will make a landfall in Dolores, Eastern Samar by 8 pm. Head to higher ground. http://t.co/Fv0ehCiClE http://t.co/b7LrWD2n2P


'typhoon rubyph make landfall dolores eastern samar head higher ground'

In [14]:
def getTweetTextsbyIT(it="MovePeople"):
    return df_combined[df_combined['categories'].astype(str).str.contains(it)].reset_index(drop=True)['text']

In [15]:
test_tweet=getTweetTextsbyIT('MovePeople')[1]
test_tweet="I hate you"
print(test_tweet)
import spacy
nlp = spacy.load('en')

doc = nlp(test_tweet)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])
# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)
    
import nltk
nltk.download('vader_lexicon')
sentiment_analyzer = VS()

sentiment = sentiment_analyzer.polarity_scores("I like you")
sentiment

I hate you
Noun phrases: ['I', 'you']
Verbs: ['hate']


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/cwang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'compound': 0.3612, 'neg': 0.0, 'neu': 0.286, 'pos': 0.714}

In [17]:
sorted_actionnable_verb_dict2dict_top10['EmergingThreats']

{'bigwet': 20,
 'break': 14,
 'hit': 22,
 'injur': 11,
 'kill': 11,
 'nswfire': 12,
 'report': 18,
 'say': 13,
 'strike': 11}

## Feature extraction

In [18]:
"""
Features considered for boosting are as follows:
1. No. of hashtags (numeric)
2. No. of special verbs, such as, trapped, stuck, move etc (dataset statistical analysis)
3. Sentiment polarity (categorical,, -1, 0 or 1)
4. Tweet length (word_length, char_length, numeric)
5. URL count (numeric)
6. Digit count (int, numeric)
7. Retweet check (0 or 1)
8. caps ratio (float, numeric)
9. Special chars count (@, ! and ?, normalized float numeric)
10. : in first token check (0 or 1)
11. Named Entity count (numeric)
"""
def return_features(tweet_text):
    #1. No. of hashtags (numeric)
    hashtag_count=tweet_text.count("#")
    #3. Sentiment polarity (categorical,, -1, 0 or 1)
    sentiment = sentiment_analyzer.polarity_scores(tweet_text)
    sentiment_cpd=sentiment['compound']
    sentiment_neg=sentiment['neg']
    sentiment_neu=sentiment['neu']
    sentiment_pos=sentiment['pos']
  
    #2. No. of special verbs such as, trapped, stuck, move etc (dataset statistical analysis)
    #['MovePeople', 'EmergingThreats', 'GoodsServices', 'SearchAndRescue', 'ServiceAvailable']
    tokens=tokenize_tweet(tweet_text)
    is_action1=int(any(stemmer.stem(i) in sorted_actionnable_verb_dict2dict_top10['MovePeople'] for i in tokens))
    is_action2=int(any(stemmer.stem(i) in sorted_actionnable_verb_dict2dict_top10['EmergingThreats'] for i in tokens))
    is_action3=int(any(stemmer.stem(i) in sorted_actionnable_verb_dict2dict_top10['GoodsServices'] for i in tokens))
    is_action4=int(any(stemmer.stem(i) in sorted_actionnable_verb_dict2dict_top10['SearchAndRescue'] for i in tokens))
    is_action5=int(any(stemmer.stem(i) in sorted_actionnable_verb_dict2dict_top10['ServiceAvailable'] for i in tokens))
    
    #4. Tweet length (word_length, char_length, numeric)
    tokens=tweet_text.split(" ")
    word_length=len(tokens)
    char_length=len(tweet_text)
    
    #5. URL count (numeric)
    url_count=len(re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',tweet_text))
    #6. Digit count (int, numeric)
    digital_count=len( re.findall('[0-9]+', tweet_text))
    
    #7. Retweet check (0 or 1)
    is_retweet=int(tweet_text[:2]=="RT")
    # 8. caps ratio (float, numeric)
    caps_ratio=len("".join(re.findall('[A-Z]+', tweet_text)))/char_length
    
    #9. Special chars count
    at_count=tweet_text.count("@")
    exclaim_count=tweet_text.count("!")
    question_count=tweet_text.count("?")
    
    #10. : in first token check (0 or 1)
    colon_check=int(":" in tokens[0])
    #11. Named Entity count (numeric)
    doc = nlp(tweet_text)
    ner_count=len(doc.ents)
#     print(doc.ents)
    
    return_list=[hashtag_count,sentiment_cpd,sentiment_neg,sentiment_neu,sentiment_pos,is_action1,
                    is_action2,is_action3,is_action4,is_action5,word_length,char_length,
                    url_count,digital_count,is_retweet,caps_ratio,at_count,exclaim_count,question_count,
                    colon_check,ner_count]
    return return_list

def normalize_by_columns(crafted_features_matrix,columns_to_normalize=["hashtag_count","word_length","char_length","url_count","digital_count","at_count","exclaim_count","question_count","ner_count"]):
    crafted_features_matrix_=crafted_features_matrix.copy()
    to_normalize_features_matrix=crafted_features_matrix_[columns_to_normalize]
    from sklearn import preprocessing
    x = to_normalize_features_matrix.values 
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    normalized_features_matrix = pd.DataFrame(x_scaled,columns=columns_to_normalize)
    normalized_features_matrix.head()
    for column in columns_to_normalize:
        crafted_features_matrix_[column]=normalized_features_matrix[column]
    return crafted_features_matrix_

In [19]:
test_tweet=getTweetTextsbyIT('MovePeople')[1]
print("test for ",test_tweet)
feature_names=["hashtag_count","sentiment_cpd","sentiment_neg","sentiment_neu",
               "sentiment_pos","is_action1","is_action2","is_action3","is_action4",
              "is_action5","word_length","char_length","url_count","digital_count",
              "is_retweet","caps_ratio","at_count","exclaim_count","question_count",
              "colon_check","ner_count"]
import datetime
start_time=datetime.datetime.now()
features=return_features(test_tweet)

for idx,fature_name in enumerate(feature_names):
    print(fature_name,"====>",features[idx])


test for  Typhoon #RubyPH will make a landfall in Dolores, Eastern Samar by 8 pm. Head to higher ground. http://t.co/Fv0ehCiClE http://t.co/b7LrWD2n2P
hashtag_count ====> 1
sentiment_cpd ====> 0.0
sentiment_neg ====> 0.0
sentiment_neu ====> 1.0
sentiment_pos ====> 0.0
is_action1 ====> 0
is_action2 ====> 0
is_action3 ====> 0
is_action4 ====> 0
is_action5 ====> 0
word_length ====> 19
char_length ====> 140
url_count ====> 2
digital_count ====> 5
is_retweet ====> 0
caps_ratio ====> 0.11428571428571428
at_count ====> 0
exclaim_count ====> 0
question_count ====> 0
colon_check ====> 0
ner_count ====> 2


In [24]:
import os
if os.path.isfile("crafted_features_matrix_2018B.csv"):
    crafted_features_matrix=pd.read_csv("crafted_features_matrix.csv")
    categories = []
    for it in crafted_features_matrix['categories']:
        categories.append(it.split(","))
    crafted_features_matrix['categories']=categories
else:
    feature_names=["post_id","hashtag_count","sentiment_cpd","sentiment_neg","sentiment_neu",
               "sentiment_pos","is_action1","is_action2","is_action3","is_action4",
              "is_action5","word_length","char_length","url_count","digital_count",
              "is_retweet","caps_ratio","at_count","exclaim_count","question_count",
              "colon_check","ner_count","categories","priority"]
    crafted_features_matrix=pd.DataFrame(columns=feature_names)
    texts=df_combined['text']
    postids=df_combined['post_id']
    categories=df_combined['categories']
    priority=df_combined['priority']
    for index,tweet_text in enumerate(texts):
        if index%1000==0:
            print("Now is processing at index",index)
        features=return_features(tweet_text)
        features.insert(0,postids[index])
        features.append(",".join(categories[index]))
        features.append(priority[index])
        crafted_features_matrix.loc[len(crafted_features_matrix)]=features
    crafted_features_matrix.to_csv("crafted_features_matrix_2018B.csv",index=False)
    
new_crafted_features_matrix=normalize_by_columns(crafted_features_matrix)
print(new_crafted_features_matrix.shape)
new_crafted_features_matrix.head()


Now is processing at index 0
Now is processing at index 1000
Now is processing at index 2000
Now is processing at index 3000
Now is processing at index 4000
Now is processing at index 5000
Now is processing at index 6000
Now is processing at index 7000
Now is processing at index 8000
Now is processing at index 9000
Now is processing at index 10000
Now is processing at index 11000
Now is processing at index 12000
Now is processing at index 13000
Now is processing at index 14000
Now is processing at index 15000
Now is processing at index 16000
Now is processing at index 17000
Now is processing at index 18000
Now is processing at index 19000
Now is processing at index 20000
Now is processing at index 21000
Now is processing at index 22000
Now is processing at index 23000
Now is processing at index 24000
Now is processing at index 25000
Now is processing at index 26000
(26088, 24)




Unnamed: 0,post_id,hashtag_count,sentiment_cpd,sentiment_neg,sentiment_neu,sentiment_pos,is_action1,is_action2,is_action3,is_action4,...,digital_count,is_retweet,caps_ratio,at_count,exclaim_count,question_count,colon_check,ner_count,categories,priority
0,242997841134505985,0.388889,0.0,0.0,1.0,0.0,0,1,0,0,...,0.210526,0,0.136,0.0,0.0,0.0,0,0.307692,OriginalEvent,Low
1,243121539552256001,0.055556,0.0,0.0,1.0,0.0,0,0,0,0,...,0.368421,0,0.123711,0.0,0.0,0.0,0,0.307692,OriginalEvent,Low
2,243198739882332161,0.055556,0.0,0.0,1.0,0.0,0,0,0,0,...,0.157895,0,0.111111,0.0,0.0,0.0,1,0.230769,OriginalEvent,Low
3,243203924050448384,0.055556,0.0,0.0,1.0,0.0,0,0,0,0,...,0.157895,0,0.131148,0.0,0.0,0.0,0,0.230769,OriginalEvent,Low
4,243361298518249473,0.0,-0.3612,0.2,0.8,0.0,0,1,0,0,...,0.105263,0,0.104478,0.0,0.0,0.0,0,0.153846,OriginalEvent,Low


## ML models for combined features (crafted features + word2vec features)

In [69]:
word2vec_matrix=to_matrix(df_combined['text'])
word2vec_matrix.shape
combined_matrix=np.concatenate([word2vec_matrix, new_crafted_features_matrix.iloc[:,1:22].values], axis=1)

#X and y construction for priortiy classification
X_crafted_matrix=combined_matrix
print(X_crafted_matrix.shape)


437 #HighParkFire from I-25. http://t.co/2nXZFfIv
5751 Oh my. https://t.co/F0vFbYpfFO
5792 What?? https://t.co/gkwuoJP8rJ
6183 R.I.P. https://t.co/jyRqACDjoC
6233 From @meighanstone https://t.co/JpEY1CxTqt
6766 Pa howwwww https://t.co/eb3pSR57Va
6778 Where ? https://t.co/za0G2BIr0U
7505 Me too... https://t.co/zOUnWhPupr
7596 This! https://t.co/kTlSuCuvDB
8442 More on this at 6:20 am on @cbcsask https://t.co/RuXyssorVh
8481 This https://t.co/dpcfSr6bpd
8621 this. https://t.co/tRdssIupJ0
8702 What this https://t.co/qoBEYiLgNT
9275 this. https://t.co/5iVaLykcuc
10316 This. https://t.co/a2qLYC3pqB
10477 Me too! https://t.co/rJJssfKljg
10529 This this this https://t.co/8l2hvzNo5U
10702 Oh no :( https://t.co/DdW9qL7nSu
11007 Oh no:-( https://t.co/rYBXHlm2Q2
19055 #PrayforBoholandCebu #PrayForBohol #PrayForCarmenBohol ??
19216 #prayforBohol http://t.co/lJczvAi5R2
19363 RT @TrustJo: #PrayForBohol http://t.co/TOQjOQThkk
19458 #PrayForBohol http://t.co/xfyoTri7bs
19523 #PrayForBohol http://t.co/

### Priority Classification

In [85]:
X_crafted_matrix.shape

(26088, 321)

In [78]:
priortiy2label={"Low":0,"Unknown":0,"Medium":1,"High":2,"Critical":3}
label2priority={0: 'Low', 1: 'Medium', 2: 'High', 3: 'Critical'}
new_crafted_features_matrix_=new_crafted_features_matrix.replace({"priority":priortiy2label})
y_crafted_pri=new_crafted_features_matrix_['priority'].values
print(y_crafted_pri.shape)
print(y_crafted_pri)


y_crafted_pri=y_crafted_pri.astype('int')

from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(X_crafted_matrix, 
                                                                            y_crafted_pri, 
                                                                            test_size=0.0, 
                                                                            random_state=0)
from imblearn.over_sampling import SMOTE
oversampler=SMOTE('not majority')
features_train,labels_train=oversampler.fit_sample(features_train,labels_train)
print(features_train.shape)
print(len(labels_train.shape))

# By GridSearch and KFold validation, we get the highest accurancy with parameter C=xx
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import StratifiedKFold
# val_clf = LogisticRegression()
# C_params = [0.001, 0.01, 0.1, 1, 10,100]
# # gamma_params=[0.01,0.1,1,10,100]
# param_grid = dict(C=C_params)
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
# grid_search = GridSearchCV(val_clf, param_grid, n_jobs=-1, cv=kfold)
# grid_result = grid_search.fit(features_train, labels_train)
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# params = grid_result.cv_results_['params']
# for mean, param in zip(means, params):
#     print("%f  with: %r" % (mean, param))

# from sklearn.naive_bayes import GaussianNB
# gnb_pri_model = GaussianNB()
# %time gnb_pri_model.fit(features_train, labels_train)
# pri_model=gnb_pri_model

#poor on critical although hits a higher average precision and recall over GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
# lsvc_pri_model = LinearSVC(C=0.1)
lsvc_pri_model = LogisticRegression(C=100)
# lsvc_pri_model = SVC(C=0.1,kernel="linear")
%time lsvc_pri_model.fit(features_train, labels_train)
pri_model=lsvc_pri_model

# print('Accuracy of LinearSVC classifier on FULL set with oversampling for priority (Train set): {:.4f}'
#      .format(pri_model.score(features_train, labels_train)))
# print('Accuracy of LinearSVC classifier on FULL set with oversampling for priority (Test set): {:.4f}'
#      .format(pri_model.score(features_test, labels_test)))


(26088,)
[0 0 0 ... 0 0 0]
(79672, 321)
1
CPU times: user 1min 19s, sys: 359 ms, total: 1min 20s
Wall time: 1min 20s


In [79]:
# predicted_test = pri_model.predict(features_test)
# from sklearn.metrics import classification_report
# class_names=[label2priority[each] for each in range(0,4)]
# print(classification_report(labels_test, predicted_test,target_names=class_names))
# print("Evaluate on training set before SMOTE")
# from sklearn.metrics import classification_report
# class_names_=[label2priority[each] for each in range(0,4)]
# print(classification_report(y_crafted_pri, pri_model.predict(X_crafted_matrix),target_names=class_names_))

### Multi-Label classification for information type

In [95]:
def onehot_to_labels(onehot):
    labels=[]
    for index,each in enumerate(onehot):
        if each == 1:
            labels.append(list(multilabel_binarizer.classes_)[index])
    return labels

def labels_to_onehot(label_list):
    onehot=[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    for each in label_list:
        onehot[list(multilabel_binarizer.classes_).index(each)]=1
    return np.array(onehot)

def normalize_prob(raw_list=['0','1','2','3']):
    raw_list=[float(i) for i in raw_list]
    max_raw_score=np.max(raw_list)
    min_raw_score=np.min(raw_list)
    normalized=[]
    for each in raw_list:
        normalized.append(each/(max_raw_score-min_raw_score))
    return normalized

index2label={0: 'Advice',
 1: 'CleanUp',
 2: 'ContextualInformation',
 3: 'Discussion',
 4: 'Donations',
 5: 'EmergingThreats',
 6: 'Factoid',
 7: 'FirstPartyObservation',
 8: 'GoodsServices',
 9: 'Hashtags',
 10: 'InformationWanted',
 11: 'Irrelevant',
 12: 'Location',
 13: 'MovePeople',
 14: 'MultimediaShare',
 15: 'NewSubEvent',
 16: 'News',
 17: 'Official',
 18: 'OriginalEvent',
 19: 'SearchAndRescue',
 20: 'Sentiment',
 21: 'ServiceAvailable',
 22: 'ThirdPartyObservation',
 23: 'Volunteer',
 24: 'Weather'}

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MultiLabelBinarizer
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_combined['categories'])
X_matrix=X_crafted_matrix
# transform target variable
y_matrix = multilabel_binarizer.transform(df_combined['categories'])
print(df_combined['text'][2000])
print(onehot_to_labels(y_matrix[2000]))
y_matrix[0]

ITEMS NEEDED! RT@cbmyyc: @innfromthecold needs grocery store gift cards and bus passes! #yycflood
['Advice', 'Donations', 'ThirdPartyObservation']


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0])

In [97]:
X_crafted_matrix.shape

(26088, 321)

In [101]:
from sklearn.model_selection import train_test_split


features_train, features_test, labels_train, labels_test = train_test_split(X_matrix, y_matrix, test_size=0.0, random_state=9)

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric

# runtag="MuLaBnb"

from sklearn.linear_model import LogisticRegression
ml_m = LogisticRegression()

clf1 = OneVsRestClassifier(ml_m)
# fit model on train data
%time clf1.fit(features_train, labels_train)


# from xgboost import XGBClassifier
# ml_m = XGBClassifier()

# from sklearn.svm import LinearSVC
# ml_m = LinearSVC(C=1)

# from sklearn.naive_bayes import BernoulliNB
# ml_m = BernoulliNB()

# from sklearn.ensemble import RandomForestClassifier
# ml_m = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=4, min_samples_leaf=2)

from sklearn.naive_bayes import GaussianNB
ml_m = GaussianNB()

clf2 = OneVsRestClassifier(ml_m)
# fit model on train data
%time clf2.fit(features_train, labels_train)

# from sklearn.metrics import f1_score
# print(f1_score(labels_test, y_pred, average="micro"))

# print('Accuracy of BernoulliNB classifier on FULL set with oversampling for priority (Train set): {:.4f}'
#      .format(clf.score(features_train, labels_train)))
# print('Accuracy of BernoulliNB classifier on FULL set with oversampling for priority (Test set): {:.4f}'
#      .format(clf.score(features_test, labels_test)))


CPU times: user 1min 9s, sys: 1.14 s, total: 1min 10s
Wall time: 1min 10s
CPU times: user 7.26 s, sys: 3.13 s, total: 10.4 s
Wall time: 10.4 s


OneVsRestClassifier(estimator=GaussianNB(priors=None, var_smoothing=1e-09),
          n_jobs=None)

# 2019b Submission (on 2019btrecis-test set)


In [120]:
# construct the feature boosting matrix for 2019a test set
results=[]
import json
with open("test_dataset_2019b.json") as f:
    for line in f:
        results.append(json.loads(line.strip()))
df_test_2019b=pd.DataFrame(results)

X_test_2019b_word2vec_matrix=to_matrix(df_test_2019b['text'])
import os
if os.path.isfile("trecis2019b_fbtest_matrix.csv"):
    X_test_2019b_fb=pd.read_csv("trecis2019b_fbtest_matrix.csv")
else:
    feature_names=["hashtag_count","sentiment_cpd","sentiment_neg","sentiment_neu",
               "sentiment_pos","is_action1","is_action2","is_action3","is_action4",
              "is_action5","word_length","char_length","url_count","digital_count",
              "is_retweet","caps_ratio","at_count","exclaim_count","question_count",
              "colon_check","ner_count"]
    trecis2019b_fbtest_matrix=pd.DataFrame(columns=feature_names)
    texts=df_test_2019b['text']
    for index,tweet_text in enumerate(texts):
        if index%1000==0:
            print("Now is processing at index",index)
        features=return_features(tweet_text)
        trecis2019b_fbtest_matrix.loc[len(trecis2019b_fbtest_matrix)]=features
    trecis2019b_fbtest_matrix.to_csv("trecis2019b_fbtest_matrix.csv",index=False)
    
X_test_2019b_fb=normalize_by_columns(X_test_2019b_fb)
X_test_2019b=np.concatenate([X_test_2019b_word2vec_matrix, X_test_2019b_fb.iloc[:,0:21].values], axis=1)
print(X_test_2019b.shape)
X_test_2019b

11488 Not again🙆🏼‍♂️🙆🏼‍♂️‼️‼️ https://t.co/1ygBJKNgCi
12181 No, not again 🙆🏾‍♂️ https://t.co/SYwqwcn7Xj
(15000, 321)


array([[-0.02275683,  0.05240015,  0.02972024, ...,  0.        ,
         0.        ,  0.04761905],
       [ 0.01183188,  0.01825785, -0.01675739, ...,  0.        ,
         0.        ,  0.04761905],
       [-0.00803777,  0.00876423,  0.00747698, ...,  0.        ,
         0.        ,  0.04761905],
       ...,
       [-0.00566667,  0.00825524,  0.00452569, ...,  0.        ,
         0.        ,  0.0952381 ],
       [ 0.03183927,  0.01161456, -0.00593295, ...,  0.1       ,
         0.        ,  0.        ],
       [-0.00541297,  0.02367819, -0.03326358, ...,  0.        ,
         0.        ,  0.19047619]])

In [121]:
t = 0.51 # this is set emprically
predicted_test1_proba=clf1.predict_proba(X_test_2019b)
predicted_test1_proba=[normalize_prob(each) for each in predicted_test1_proba]
predicted_test1_proba=np.array(predicted_test1_proba)

predicted_test2_proba=clf2.predict_proba(X_test_2019b)
predicted_test2_proba=[normalize_prob(each) for each in predicted_test2_proba]
predicted_test2_proba=np.array(predicted_test2_proba)
combined_predict_proba=(predicted_test1_proba+predicted_test2_proba)/2
y_pred_prob_2019=combined_predict_proba
predicted_2019 = (y_pred_prob_2019 >= t).astype(int)

# prediction for priority
predicted_pri = pri_model.predict(X_test_2019b)
# predicted
predictd_labels_2019=multilabel_binarizer.inverse_transform(predicted_2019)
# predictd_labels_2019

In [122]:
all_predicted_categories=[]
all_predicted_categories = sum([list(tup) for tup in predictd_labels_2019],[])
len(set(all_predicted_categories))

all_predicted_categories_freq = nltk.FreqDist(all_predicted_categories)
# create dataframe
all_predicted_categories_df = pd.DataFrame({'Category': list(all_predicted_categories_freq.keys()), 
                              'Count': list(all_predicted_categories_freq.values())})
all_predicted_categories_df

Unnamed: 0,Category,Count
0,ThirdPartyObservation,3801
1,FirstPartyObservation,2887
2,ServiceAvailable,1491
3,SearchAndRescue,509
4,MovePeople,919
5,Official,4593
6,MultimediaShare,3988
7,Location,5521
8,InformationWanted,825
9,News,7653


In [123]:
df_test_2019b["event_id"].shape

(15000,)

In [4]:
runtag="baseline"
runlabel="UCD"+runtag

def normalize_priority(raw_priority=['0','1','2','3']):
    raw_priority=[float(i) for i in raw_priority]
    max_priority_score=np.max(raw_priority)
    min_priority_score=np.min(raw_priority)
    normalized=[]
    for each in raw_priority:
        normalized.append((each+1)/(max_priority_score-min_priority_score))
#         normalized.append(0.75*each/(max_priority_score-min_priority_score)+(0.25*max_priority_score-min_priority_score)/(max_priority_score-min_priority_score))
    return normalized
import math
short2longITs2019b = {'Advice': 'Other-Advice',
                      'CleanUp': 'Report-CleanUp',
                      'ContextualInformation': 'Other-ContextualInformation',
                      'Discussion': 'Other-Discussion',
                      'Donations': 'CallToAction-Donations',
                      'EmergingThreats': 'Report-EmergingThreats',
                      'Factoid': 'Report-Factoid',
                      'FirstPartyObservation': 'Report-FirstPartyObservation',
                      'GoodsServices': 'Request-GoodsServices',
                      'Hashtags': 'Report-Hashtags',
                      'InformationWanted': 'Request-InformationWanted',
                      'Location': 'Report-Location',
                      'MovePeople': 'CallToAction-MovePeople',
                      'MultimediaShare': 'Report-MultimediaShare',
                      'NewSubEvent': 'Report-NewSubEvent',
                      'News': 'Report-News',
                      'Official': 'Report-Official',
                      'OriginalEvent': 'Report-OriginalEvent',
                      'SearchAndRescue': 'Request-SearchAndRescue',
                      'Sentiment': 'Other-Sentiment',
                      'Irrelevant': 'Report-Irrelevant',
                      'ServiceAvailable': 'Report-ServiceAvailable',
                      'ThirdPartyObservation': 'Report-ThirdPartyObservation',
                      'Volunteer': 'CallToAction-Volunteer',
                      'Weather': 'Report-Weather'}

event2test = {"albertaWildfires2019": "TRECIS-CTIT-H-Test-029",
                          "cycloneKenneth2019": "TRECIS-CTIT-H-Test-030",
                          "philippinesEarthquake2019": "TRECIS-CTIT-H-Test-031",
                          "coloradoStemShooting2019": "TRECIS-CTIT-H-Test-032",
                          "southAfricaFloods2019": "TRECIS-CTIT-H-Test-033",
                          "sandiegoSynagogueShooting2019": "TRECIS-CTIT-H-Test-034"}

# A naive approach: based on the averaged priority weights computed by information types in training set
info_type_priority_weight_dict = {'Advice': 0.22862577231414025, 'CleanUp': 0.35653795010024114,
                                  'News': 0.3270722902848146, 'Discussion': 0.19307661376501264,
                                  'Donations': 0.3612767541359874, 'EmergingThreats': 0.8537984236364585,
                                  'Factoid': 0.48917308056907677, 'FirstPartyObservation': 0.1420197717109446,
                                  'GoodsServices': 0.6959346525995092, 'Hashtags': 0.2687451987058043,
                                  'InformationWanted': 0.6688596972384949, 'Irrelevant': 0.0010061402766415601,
                                  'OriginalEvent': 0.09192532381788204, 'MovePeople': 0.9253136744180216,
                                  'MultimediaShare': 0.3128463518966703, 'Official': 0.6785618345463643,
                                  'ContextualInformation': 0.1405539758609653, 'SearchAndRescue': 1.0010061402766415,
                                  'Sentiment': 0.039519645594885515, 'ServiceAvailable': 0.7669372401298534,
                                  'NewSubEvent': 0.9176682650090996, 'ThirdPartyObservation': 0.2703955630411556,
                                  'Location': 0.03711651480530716, 'Volunteer': 0.3839639462617982,
                                  'Weather': 0.33196659834790165}

# priority prediction
normalized_predicted_pri=normalize_priority(predicted_pri)

info_type_priority_weight_list=list(info_type_priority_weight_dict.values())
max_pri=np.max([float(i) for i in info_type_priority_weight_list])
min_pri=np.min([float(i) for i in info_type_priority_weight_list])

# Step 2: Convert the submission file to the standard format
# generate submission file
# import core.label2code as l2c
# index2label = l2c.index2label
str_pre = ""

import random
sub_dict={}
for index,each in enumerate(predictd_labels_2019):
    each=list(each)
    if len(each)==0:
        each=['Irrelevant']
    elif len(each)>1 and 'Irrelevant' in each:
        each.remove('Irrelevant')
    elif 'Unknown' in each:
        each.remove('Unknown')
        
    p=np.mean([float(info_type_priority_weight_dict[e]) for e in each])
#     p = float(info_type_priority_weight_dict[index2label[each]])
    # low=0.25, medium=0.5, high=0.75 and critical=1.0. 
    # equation for priority calculation where p is parameter =0.5 by default
    # p*predicted_score+(1-p)*statistic_weight
    parameter_p=0.5
    predicted_score=normalized_predicted_pri[index]
#   statistic_weight=normalize_priority([p])[0]
    statistic_weight=p/(max_pri-min_pri)
#     statistic_weight=0.75*p/(max_pri-min_pri)+(0.25*max_pri-min_pri)/(max_pri-min_pri)
    p_final=parameter_p*predicted_score+(1-parameter_p)*statistic_weight

    event_id=df_test_2019b.iloc[index]['event_id']
    test_id=event2test[event_id]
    post_id=df_test_2019b.iloc[index]['post_id']

#     labels=[short2longITs2019A[index2label[each]] for each in multi_label_predicted[index]]
    labels=[short2longITs2019b[e] for e in each]

    if math.isnan(p_final):
        p_final=0.0
    if p_final>1.0:
        p_final=1.0
        
    line_sub=test_id+"\tQ0\t"+str(post_id)+"\t"+ "#" + "\t" + str(p_final) +"\t"+str(labels)+"\t"+runlabel+ "\n"
#     str_pre +=line_sub
    if event_id not in sub_dict:
#         sub_dict[post_id]=[event_id]
        sub_dict[event_id]={line_sub:p_final}
    else:
        sub_dict[event_id][line_sub]=p_final
str_pre=""

for each in sub_dict:
    rank_inevent=1
    sub_inevent=sub_dict[each]
    sub_inevent_sorted=sorted(sub_inevent.items(), key=lambda kv: kv[1],reverse=True)
    for tup in sub_inevent_sorted:
        line_sub=tup[0].replace("#",str(rank_inevent))
        str_pre+=line_sub
        rank_inevent+=1
        
    
with open(runlabel+".txt", "w") as f:
    f.write(str_pre)

print("Submission for "+runlabel+" is done")

Submission for UCDbaseline is done
