In [1]:
import pandas as pd

In [6]:
# Preview the first five rows of the training set to see which columns exist.
pd.read_csv('./train.csv').head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [32]:
def load_data(csv_file, split=0.9):
    """Read a labelled CSV and split it into training and validation parts.

    The rows are shuffled with a fixed seed (``random_state=7``), so the same
    file always produces the same split.

    Arguments
    ---------
    csv_file: Path to a CSV file with a ``text`` and a ``target`` column
    split: Fraction of the rows that goes into the training part

    Returns
    -------
    A tuple ``(train_texts, train_labels, val_texts, val_labels)``. The texts
    are slices of the ``text`` column values; every label is a dict of the
    form ``{"cats": {"Disaster": bool, "No Disaster": bool}}``.
    """
    frame = pd.read_csv(csv_file)

    # Deterministic shuffle of all rows
    shuffled = frame.sample(frac=1, random_state=7)

    all_texts = shuffled.text.values

    # Wrap every target in a "cats" dict; a truthy target means "Disaster"
    all_labels = []
    for target in shuffled.target.values:
        is_disaster = bool(target)
        all_labels.append(
            {"cats": {"Disaster": is_disaster, "No Disaster": not is_disaster}})

    # Size of the training part (the fraction is rounded down)
    cutoff = int(len(shuffled) * split)

    return (all_texts[:cutoff], all_labels[:cutoff],
            all_texts[cutoff:], all_labels[cutoff:])

# Load the training CSV and keep the default 90/10 train/validation split.
(train_texts,
 train_labels,
 val_texts,
 val_labels) = load_data('./train.csv')

In [67]:
import spacy

# Create an empty English model (no trained components yet)
# NOTE(review): create_pipe(...) followed by add_pipe(component) looks like the
# spaCy v2 API — confirm the spaCy version this notebook is pinned to.
nlp = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True,
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

# Add labels to text classifier
# NOTE(review): evaluate() treats predicted index 1 as "Disaster", which
# presumably relies on "No Disaster" being added first here — confirm before
# reordering these two calls.
textcat.add_label("No Disaster")
textcat.add_label("Disaster")

1

In [68]:
from spacy.util import minibatch
import random


def train(model, train_data, optimizer, batch_size=8, seed=1):
    """Run one pass over the training data, updating the model batch by batch.

    Arguments
    ---------
    model: Object with an ``update(texts, labels, sgd=..., losses=...)`` method
    train_data: List of tuples ``[(text0, label0), (text1, label1), ...]``.
        The list is shuffled IN PLACE, so the caller's list is reordered.
    optimizer: Passed to ``model.update`` as the ``sgd`` argument
    batch_size: Number of examples per update step (default 8)
    seed: Seed applied to Python's global ``random`` module right before the
        shuffle (default 1). Pass ``None`` to leave the global random state
        untouched and let the caller control reproducibility.

    Returns
    -------
    The ``losses`` dict that was handed to every ``model.update`` call.
    """
    losses = {}

    # Reseeding is optional: it makes the shuffle repeatable but also resets
    # the global random state for everything that runs afterwards.
    if seed is not None:
        random.seed(seed)
    random.shuffle(train_data)

    for batch in minibatch(train_data, size=batch_size):
        # Split the batch of (text, label) tuples into parallel sequences
        texts, labels = zip(*batch)

        # Update model with texts and labels
        model.update(texts, labels, sgd=optimizer, losses=losses)

    return losses

In [69]:
# Fix seed for reproducibility (both spaCy's helper and Python's random module)
spacy.util.fix_random_seed(1)
random.seed(1)

# Create the optimizer, pair every text with its label, and run one training pass.
# Note: train() shuffles train_data in place and calls random.seed itself.
optimizer = nlp.begin_training()
train_data = list(zip(train_texts, train_labels))
losses = train(nlp, train_data, optimizer)
# Show the loss recorded under the 'textcat' key for this pass
print(losses['textcat'])

4.905895377276465


In [70]:
def predict(model, texts):
    """Predict a class index for each text with the model's TextCategorizer.

    Arguments
    ---------
    model: Model exposing ``tokenizer(text)`` and ``get_pipe('textcat')``
    texts: Iterable of raw text strings

    Returns
    -------
    For each text, the index of the highest-scoring class (the result of
    ``scores.argmax(axis=1)`` on the textcat scores).
    NOTE(review): which label each index stands for presumably follows the
    order in which labels were added to the textcat — confirm.
    """
    # Tokenize with the tokenizer of the model that was passed in. The previous
    # version used the global `nlp` here, which ignored the `model` argument.
    docs = [model.tokenizer(text) for text in texts]

    # Use textcat to get the scores for each doc
    textcat = model.get_pipe('textcat')
    scores, _ = textcat.predict(docs)

    # From the scores, find the class with the highest score/probability
    return scores.argmax(axis=1)

In [71]:
# Smoke test: run one hand-written sentence through the whole pipeline and
# print the per-category scores stored in doc.cats.
# NOTE(review): the sample reads like a product review rather than a tweet
# about a disaster — presumably left over from another example; confirm.
text = "This tea cup was full of holes. Do not recommend."
doc = nlp(text)
print(doc.cats)

{'No Disaster': 0.6612555980682373, 'Disaster': 0.3387444317340851}


In [72]:
def evaluate(model, texts, labels):
    """ Returns the accuracy of a TextCategorizer model.

        Arguments
        ---------
        model: spaCy model with a TextCategorizer
        texts: Text samples, from load_data function
        labels: True labels, from load_data function; each item is a dict of
            the form {"cats": {"Disaster": bool, ...}}

        Returns
        -------
        The fraction of samples whose predicted class equals the true class.

        Raises
        ------
        ValueError: If `labels` is empty, or if the number of predictions
            differs from the number of labels.

    """
    # Accuracy over zero samples is undefined; fail with a clear message
    # instead of a ZeroDivisionError at the end.
    if len(labels) == 0:
        raise ValueError("Cannot compute accuracy: no labels were given")

    # Get predictions from textcat model (using the predict function)
    predicted_class = list(predict(model, texts))
    if len(predicted_class) != len(labels):
        raise ValueError(
            f"Got {len(predicted_class)} predictions for {len(labels)} labels")

    # From labels, get the true class as a list of integers:
    # 1 for 'Disaster', 0 for 'No Disaster'.
    # NOTE(review): this assumes predicted index 1 means 'Disaster', which
    # presumably depends on the order labels were added to the textcat — confirm.
    true_class = [1 if each['cats']['Disaster'] else 0 for each in labels]

    # Count the positions where prediction and truth agree
    n_correct = sum(
        1 for truth, guess in zip(true_class, predicted_class) if truth == guess)

    # The accuracy, number of correct predictions divided by all predictions
    return n_correct / len(true_class)

In [73]:
# Score the trained pipeline on the held-out validation split.
accuracy = evaluate(model=nlp, texts=val_texts, labels=val_labels)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7848


### Predict on Test Data

In [78]:
# Read the test set once (it was previously parsed twice, once per column)
# and pull out the texts and the ids they belong to.
test_df = pd.read_csv('./test.csv')
test_texts = test_df['text'].values
test_ids = test_df['id'].values

In [79]:
# Predict a class index for every text in the test set.
predictions = predict(model=nlp, texts=test_texts)

In [83]:
# Pair every test id with its predicted class and write the submission file
# (without the DataFrame index column).
submission = pd.DataFrame({'id': test_ids, 'target': predictions})
submission.to_csv('./submission.csv', index=False)