In [33]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer, RegexpTokenizer

from nltk.corpus import stopwords
from gensim.models import Word2Vec

In [34]:
# Load the cleaned hurricane tweets from the project's data folder.
df = pd.read_csv(filepath_or_buffer='../data/clean_hurricane.csv')

In [35]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Date,Text,text_clean,lat,long
0,2020-08-31 23:15:25+00:00,A customer service rep told me Friday “there i...,a customer service rep told me friday there is...,25.525284,-80.60692
1,2020-08-31 22:39:25+00:00,Tomorrow at 7pm After Hours with Sabor Havana ...,tomorrow at pm after hour with sabor havana ci...,27.686273,-80.934588
2,2020-08-31 22:31:09+00:00,Hurricane Laura wallops areas with high mortga...,hurricane laura wallop area with high mortgage...,27.701712,-75.255859
3,2020-08-31 20:25:29+00:00,I never wish bad on anyone but I think we need...,i never wish bad on anyone but think we need a...,29.114762,-84.339632
4,2020-08-31 19:51:39+00:00,Wth is pricemart so full????? Is their a hurri...,wth is pricemart so full their a hurricane id...,28.506867,-89.67809


In [36]:
def word2vec_tokens(column, df):
    """Clean a text column and add a stop-word-filtered token column for Word2Vec.

    The frame is modified in place: ``df[column]`` is overwritten with the
    cleaned, lower-cased text, and a new ``{column}_tokens_word2vec`` column
    holding one list of tokens per row is added.

    Parameters
    ----------
    column : str
        Name of the text column to clean and tokenize.
    df : pandas.DataFrame
        Frame that contains ``column``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Rows whose text is missing (or not a string) get an empty token list
    rather than being stringified into a spurious ``'nan'`` token.
    """
    # instantiate tokenizer; a set gives constant-time stop-word lookups
    tt = TweetTokenizer()
    stop = set(stopwords.words('english'))

    # Preprocess the column for tokenizing. The result is assigned back
    # explicitly: calling Series.replace(..., inplace=True) on df[column]
    # operates on a temporary Series and is not guaranteed to update df.
    # Raw strings keep the regex escapes (\d, \W) from being treated as
    # invalid string escapes. After every run of non-word characters has been
    # collapsed to one space there is no whitespace run or newline left, so
    # no separate \s+ or \n pass is needed.
    cleaned = (
        df[column]
        .str.replace(r'\d+', ' ', regex=True)
        .str.replace(r'\W+', ' ', regex=True)
        .str.lower()
    )
    df[column] = cleaned

    def _tokens(text):
        # Missing / non-string entries contribute no tokens.
        if not isinstance(text, str):
            return []
        return [word for word in tt.tokenize(text) if word not in stop]

    df[f'{column}_tokens_word2vec'] = cleaned.apply(_tokens)

    return df

In [37]:
# Tokenize the cleaned tweet text; the returned frame is bound to `tw`.
tw = word2vec_tokens(df=df, column='text_clean')

In [38]:
#Training the Word2Vec model on the per-tweet token lists
#(min_count = 100 drops words that occur fewer than 100 times in the corpus)
#NOTE(review): no seed is fixed here, so the learned neighbours may differ between runs - confirm whether that matters downstream
model = Word2Vec(df['text_clean_tokens_word2vec'], min_count = 100)
#gensim 4 replaced KeyedVectors.vocab with key_to_index; fall back to .vocab on gensim 3
vocab_words = model.wv.key_to_index if hasattr(model.wv, 'key_to_index') else model.wv.vocab
#Looking up the embedding vector for 'hurricane' (raises KeyError if the word is not in the vocabulary)
vector = model.wv['hurricane']

In [39]:
#this code searches the word2vec model for words similar to each keyword and collects them to create a target column



def words_for_target(keyword_list, w2v_model=None, topn=10):
    """Expand a keyword list with each keyword's nearest Word2Vec neighbours.

    Parameters
    ----------
    keyword_list : list of str
        Seed keywords; each one is passed to ``wv.most_similar``.
    w2v_model : object, optional
        Trained model exposing ``wv.most_similar``. When omitted, the
        notebook-level ``model`` variable is used, as before.
    topn : int, optional
        Number of neighbours requested per keyword (10 is also the
        ``most_similar`` default, so existing calls behave the same).

    Returns
    -------
    list of str
        The neighbours of every keyword (in keyword order) followed by the
        keywords themselves, with duplicates removed and first-seen order kept.

    Raises
    ------
    KeyError
        Propagated from ``most_similar`` when a keyword is not in the model's
        vocabulary.
    """
    if w2v_model is None:
        # Fall back to the model trained earlier in the notebook.
        w2v_model = model

    # collect the similar words for every keyword (most_similar yields (word, score) pairs)
    relevant_words = []
    for keyword in keyword_list:
        similar = w2v_model.wv.most_similar(keyword, topn=topn)
        relevant_words.extend(word for word, _score in similar)

    # add the keywords themselves so the list is complete
    relevant_words.extend(keyword_list)

    # dict.fromkeys removes duplicates in one pass while preserving order
    return list(dict.fromkeys(relevant_words))

In [40]:
#keywords to search for in tweets
keyword_list = [  'warning', 'emergency', 'flood', 'storm', 'rain', 'fema' ]

target_words = words_for_target(keyword_list)

In [43]:
# this is juhee's code

def create_target(column, words=None):
    """Build a binary label per row: 1 if the row contains any target word.

    Parameters
    ----------
    column : iterable
        One entry per row; each entry is tested with ``word in entry``
        (the notebook passes lists of tokens).
    words : iterable of str, optional
        Words that mark a row as positive. When omitted, the notebook-level
        ``target_words`` list is used, as before.

    Returns
    -------
    list of int
        ``1`` for rows containing at least one of ``words``, otherwise ``0``,
        in the same order as ``column``.
    """
    if words is None:
        # Fall back to the word list built earlier in the notebook.
        words = target_words

    # any() stops at the first matching word, like the original break
    return [int(any(word in text for word in words)) for text in column]

In [44]:
# Label every tweet from its token list (1 = contains a target word).
df['label'] = create_target(column=df['text_clean_tokens_word2vec'])

In [45]:
# number of positively labelled tweets (noted by the author as about 50% of the data)
df['label'].sum()

19443

In [46]:
# Preview the frame with the new token and label columns.
df.head(n=5)

Unnamed: 0,Date,Text,text_clean,lat,long,text_clean_tokens_word2vec,label
0,2020-08-31 23:15:25+00:00,A customer service rep told me Friday “there i...,a customer service rep told me friday there is...,25.525284,-80.60692,"[customer, service, rep, told, friday, hurrica...",0
1,2020-08-31 22:39:25+00:00,Tomorrow at 7pm After Hours with Sabor Havana ...,tomorrow at pm after hour with sabor havana ci...,27.686273,-80.934588,"[tomorrow, pm, hour, sabor, havana, cigar, jos...",0
2,2020-08-31 22:31:09+00:00,Hurricane Laura wallops areas with high mortga...,hurricane laura wallop area with high mortgage...,27.701712,-75.255859,"[hurricane, laura, wallop, area, high, mortgag...",0
3,2020-08-31 20:25:29+00:00,I never wish bad on anyone but I think we need...,i never wish bad on anyone but think we need a...,29.114762,-84.339632,"[never, wish, bad, anyone, think, need, couple...",0
4,2020-08-31 19:51:39+00:00,Wth is pricemart so full????? Is their a hurri...,wth is pricemart so full their a hurricane idk...,28.506867,-89.67809,"[wth, pricemart, full, hurricane, idk]",0


In [47]:
## count sentences in each tweet (row)

# Iterate over the values of the raw 'Text' column directly instead of
# looking rows up with df['Text'][i]: that lookup is by index label, so it
# only works while the frame still has its default 0..n-1 index.
n_sentence = [len(sent_tokenize(text.lower())) for text in df['Text']]

# store it in the dataframe

df['n_sentence'] = n_sentence

In [49]:
## count tokens in each tweet (row)
# NOTE: the count is taken from the stop-word-filtered Word2Vec token lists,
# so it is the number of remaining tokens, not the raw word count.

# this tokenizer is not used by the count below; the assignment is kept as-is
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

# Iterate over the token lists directly rather than indexing by label with
# df[...][i], which only works on a default 0..n-1 index.
n_words = [len(word_tokens) for word_tokens in df['text_clean_tokens_word2vec']]

# store it in the dataframe

df['n_words'] = n_words

In [53]:
# Keep only the columns that go into the labelled output, in export order.
df = df.loc[:, ['Date', 'text_clean', 'label', 'n_sentence', 'n_words', 'lat', 'long']]

In [54]:
# Preview the trimmed frame before it is written out.
df.head(n=5)

Unnamed: 0,Date,text_clean,label,n_sentence,n_words,lat,long
0,2020-08-31 23:15:25+00:00,a customer service rep told me friday there is...,0,2,11,25.525284,-80.60692
1,2020-08-31 22:39:25+00:00,tomorrow at pm after hour with sabor havana ci...,0,2,19,27.686273,-80.934588
2,2020-08-31 22:31:09+00:00,hurricane laura wallop area with high mortgage...,0,2,13,27.701712,-75.255859
3,2020-08-31 20:25:29+00:00,i never wish bad on anyone but think we need a...,0,1,9,29.114762,-84.339632
4,2020-08-31 19:51:39+00:00,wth is pricemart so full their a hurricane idk...,0,2,5,28.506867,-89.67809


In [55]:
# Write the labelled tweets to disk without the index column.
df.to_csv(path_or_buf='../data/final_hurricane_labeled.csv', index=False)