In [55]:
from kaggle.api.kaggle_api_extended import KaggleApi #https://github.com/Kaggle/kaggle-api
import os
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from spacy.util import minibatch
import random

In [28]:
# Use Kaggle API to download train.csv, test.csv
api = KaggleApi()
api.authenticate()
api.competition_download_files('nlp-getting-started')

In [29]:
# Unzip files
os.system('unzip *.zip');

In [30]:
# Read in training set and test set
df_trn=pd.read_csv('train.csv', error_bad_lines=False, warn_bad_lines=False)
df_tst=pd.read_csv('test.csv', error_bad_lines=False, warn_bad_lines=False)

In [52]:
# hyperparameter placeholders
lemmatized=True
nostopword=True
batch_num=8
epoch_num=10

In [31]:
# Load the SpaCy model
nlp = spacy.blank("en")

In [32]:
# Data exploration
df_trn['text'].head(10)

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
5    #RockyFire Update => California Hwy. 20 closed...
6    #flood #disaster Heavy rain causes flash flood...
7    I'm on top of the hill and I can see a fire in...
8    There's an emergency evacuation happening now ...
9    I'm afraid that the tornado is coming to our a...
Name: text, dtype: object

In [33]:
# Create keywords list
keywords_list = ["earthquake", "fire", "flood", "emergency", "tornado", "rain", "disaster"]

# Create a list of tokens from keywords list
keywords_tokens_list = [nlp(keyword) for keyword in keywords_list]

# Create the PhraseMatcher object. The tokenizer is the first argument. Use attr = 'LOWER' to make consistent capitalization
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

# Add the item patterns to the matcher. 
matcher.add("KEYWORDS",           # Just a name for the set of rules we're matching to
           None,                  # Special actions to take on matched words
           *keywords_tokens_list  # (variable number of) objects of the phrases to match
          )

In [35]:
# Sample text from training set
sample_idx = 7
sample_txt = df_trn['text'].iloc[sample_idx]

# Create the tokenized version of sample_txt
sample_doc = nlp(sample_txt)

# Create a list of lemmatized, non-stop-word tokens for sample_doc
if lemmatized and nostopword:
    token_list = [token.lemma_ for token in sample_doc if (not token.is_stop)]
elif lemmatized:
    token_list = [token.lemma_ for token in sample_doc]
elif nostopword:
    token_list = [token for token in sample_doc if (not token.is_stop)]
else: 
    token_list = [token for token in sample_doc]

# Find matches in the doc
matches = matcher(sample_doc)

print(sample_txt)
print('-------------------------------------------------------------')
for match in matches:
   print(f"found keyword in token number {match[1]}: {sample_doc[match[1]:match[2]]}")

I'm on top of the hill and I can see a fire in the woods...
-------------------------------------------------------------
found keyword in token number 12: fire


In [36]:
# Initialize a matches dictionary {idx: matches_set}
matches_dict = {}

# Use above sample code with df.iterrows() which iterates over a DataFrame rows as (index, Series) pairs
for idx, series in df_trn.iterrows():

    # Text from training set
    txt = df_trn['text'].iloc[idx]

    # Create the tokenized version of txt
    doc = nlp(txt)

    # Create a list of lemmatized, non-stop-word tokens for doc
    if lemmatized and nostopword:
        token_list = [token.lemma_ for token in doc if (not token.is_stop)]
    elif lemmatized:
        token_list = [token.lemma_ for token in doc]
    elif nostopword:
        token_list = [token for token in doc if (not token.is_stop)]
    else: 
        token_list = [token for token in doc]

    # Find matches in the doc
    matches = matcher(doc)
    
    # Create a set of matches found in doc
    matches_set = {doc[start:end] for match_id, start, end in matches}
    
    # Append to matches dictionary
    if len(matches_set) > 0:
        matches_dict.update({idx: matches_set})

In [100]:
# Load the SpaCy model
nlp = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture (ngram “bag-of-words”) 
textcat = nlp.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True,
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

# Add labels to text classifier
textcat.add_label('real')
textcat.add_label('not')

1

In [101]:
# The model is looking for "real", "not" labels inside another dictionary with the key 'cats'
# e.g. if a text is "real", we need a dictionary {'real': True, 'not': False}
train_texts = df_trn['text'].values
train_labels = [{'cats': {'real': target == 1, 'not': target == 0}} for target in df_trn['target']]

# combine the texts and labels into a single list
train_data = list(zip(train_texts, train_labels))

In [102]:
# Create an optimizer using nlp.begin_training()
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
random.seed(1)
# loop through the training set epoch_num times, re-shuffling the training data at the begining of each iteration
for epoch in range(epoch_num):
    random.shuffle(train_data)
    # Create the batch generator with batch size = batch_num
    batches = minibatch(train_data, size=batch_num)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    # Print accumulated loss bug (HB: FIX THIS)
    print(losses)

{'textcat': 5.311279617191758}
{'textcat': 8.446476191224065}
{'textcat': 10.580094419201487}
{'textcat': 12.142444305991376}
{'textcat': 13.328999306524565}
{'textcat': 14.263951404558611}
{'textcat': 15.022631380922121}
{'textcat': 15.645888627128897}
{'textcat': 16.16052245742469}
{'textcat': 16.598223631826045}


In [103]:
# Define a function that predicts class for each text, given a model and a list of texts
def predict(model, texts): 
    # Use the model's tokenizer to tokenize each input text
    docs = [model.tokenizer(text) for text in texts]
    
    # Use textcat to get the scores for each doc
    textcat = model.get_pipe('textcat')
    scores, _ = textcat.predict(docs)
    
    # From the scores, find the class with the highest score/probability
    predicted_class = scores.argmax(axis=1)
    
    return predicted_class

In [111]:
# Make predictions
texts = df_tst['text'].values
predicted_class = predict(nlp, texts)

for p, t in zip(predicted_class, texts):
    print(f"{textcat.labels[p]}: {t} \n")

real: Just happened a terrible car crash 

real: Heard about #earthquake is different cities, stay safe everyone. 

real: there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all 

real: Apocalypse lighting. #Spokane #wildfires 

real: Typhoon Soudelor kills 28 in China and Taiwan 

real: We're shaking...It's an earthquake 

not: They'd probably still show more life than Arsenal did yesterday, eh? EH? 

not: Hey! How are you? 

not: What a nice hat? 

not: Fuck off! 

not: No I don't like cold! 

not: NOOOOOOOOO! Don't do that! 

not: No don't tell me that! 

not: What if?! 

not: Awesome! 

real: Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU 

not: @sunkxssedharry will you wear shorts for race ablaze ? 

real: #PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI 

not: Check these out: http://t.co/rOI2NSmEJJ http://