In [2]:
# Run this every time you open the notebook
%load_ext autoreload
%autoreload 2
from collections import Counter
import lib

# Load and inspect the data

In [3]:
# Read the labelled training tweets and the held-out test tweets from CSV.
tweets, test_tweets = lib.read_data(
    train_path="data/labeled-data-singlelabels-train.csv",
    test_path="data/labeled-data-singlelabels-test.csv",
)

# Print a table of the training tweets together with their category labels.
lib.show_tweets(tweets)


Unnamed: 0,Text,Category,Need or resource
0,we finally got huge tree off monday.. before house got anymore structural damage before this new storm..still the damage is done..,,
1,# sandy update 3 : @ kirancreates & i have no power or cell at union sq apart . camped out at roosevelt hotel lobby charging phones along w many,Energy,need
2,sandy relief efforts in full swing @ foodcoop @ park slope food coop http : //t.co/xn3qwuct,,
3,# foodtruck to the rescue # eastvillage # nyc # sandy # ignyc # below39thstreet @ gothamist @ ninth street espresso http : //t.co/hq0yynhr,,
4,"sandy takes ny out : ... ..and now the morning after , this is no lovable affair . @ nycurrent @ new york http : //t.co/tqqvbf80",,
5,"south fin grill will be serving hot food to those who are in need . soup , pasta , coffee & tea .",Food,resource
6,great organizational skills sorting donated goods . worked past weekend sorting clothes and food items donated by toledo area to fill 3 30 ' trucks . have worked with seniors and children . computer and office skills . food prep . 3/4 ton truck 4x4 . my husband is retired and ready to help as well .,Food,resource
7,i do n't have a car however i can help distribute water and clothing .,Water,resource
8,"i would like to help in any way that i can . cook hot foods , donate goods , give people rides , help people find new lodging . anything !",Food,resource
9,food prep yoga classes supply runs organization,Food,resource


# Naive Bayes classifier

To construct our Naive Bayes classifier, we first need to calculate two things:

### Prior probabilities of categories
We need to calculate $P(C_i)$ for each category $C_i \in \{\text{Energy}, \text{Food}, \text{Medical}, \text{Water}, \text{None}\}$. 

We estimate $P(C_i)$ by $\frac{\text{# tweets about }C_i}{\text{# tweets}}$

### Conditional probabilities of tokens
For each token (i.e. word) $x_j$ and each category $C_i$, we need to calculate $P(x_j|C_i)$.

We estimate $P(x_j|C_i) = \frac{P(x_j \text{ and } C_i)}{P(C_i)}$ by $\frac{\text{# tweets about }C_i \text{ containing }x_j}{\text{# tweets about }C_i}$

### Posterior probability of categories
Given a tweet which is a set of tokens $\{x_1,...,x_n\}$, the posterior probability of each category $C_i$ is

$P(C_i | x_1,...,x_n) \propto P(C_i) \times P(x_1|C_i) \times P(x_2|C_i) \times \dots \times P(x_n|C_i)$

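### Discriminator for choosing the output

We classify a tweet as the category with the largest posterior probability: $\hat{C} = \arg\max_{C_i} \; P(C_i) \times P(x_1|C_i) \times \dots \times P(x_n|C_i)$.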



In [5]:
def calc_probs(tweets, c):
    """
    Estimate the Naive Bayes parameters of one category from labelled tweets.

    Input:
        tweets: a list of tweets; each tweet must expose `.category` and `.tokenSet`
        c: a string representing a category; one of "Energy", "Food", "Medical", "Water", "None".
    Returns:
        prob_c: the prior probability of category c, estimated as
            (# tweets about c) / (# tweets). It is 0.0 when `tweets` is empty.
        token_probs: a Counter mapping each token to P(token|category c), estimated as
            (# tweets about c containing the token) / (# tweets about c).
            Tokens that never occur in a tweet about c are not stored, so
            looking them up in the Counter yields 0.
    Side effects:
        Prints the prior probability of category c.
    """
    # Select the tweets about category c once; both estimates are built from them.
    tweets_about_c = [t for t in tweets if t.category == c]
    num_tweets = len(tweets)
    num_tweets_about_c = len(tweets_about_c)

    # Prior: share of all tweets that are about c.
    # An empty training set would otherwise divide by zero.
    prob_c = float(num_tweets_about_c) / num_tweets if num_tweets else 0.0

    # For every token, count how many tweets about c contain it.
    # Iterating tokenSet visits each token once per tweet.
    token_counts = Counter(
        token for tweet in tweets_about_c for token in tweet.tokenSet
    )

    # Conditional probabilities: turn the counts into fractions of the tweets
    # about c. The loop body only runs when at least one tweet about c exists,
    # so the denominator is never zero here.
    token_probs = Counter()
    for token, count in token_counts.items():
        token_probs[token] = float(count) / num_tweets_about_c

    print("Class %s has prior probability %.2f" % (c, prob_c))
    return prob_c, token_probs


def get_posterior_prob(tweet, prob_c, token_probs, unseen_prob=0.001):
    """Calculate the posterior P(c|tweet).
    (Actually, calculate something proportional to it.)

    The result is prob_c multiplied by one factor per token in the tweet:
    P(token|c) when that is non-zero, otherwise `unseen_prob`.

    Inputs:
        tweet: a tweet; must expose `.tokenSet`
        prob_c: the prior probability of category c
        token_probs: a Counter (or any mapping) from each token to P(token|c);
            tokens missing from the mapping are treated as having probability 0
        unseen_prob: the factor used in place of a zero P(token|c), so that a
            single unseen token does not force the whole product to zero
            (defaults to 0.001)
    Return:
        A number proportional to the posterior P(c|tweet). It is not
        normalised, so it is only meaningful when compared across categories.
    """
    posterior = prob_c
    for token in tweet.tokenSet:
        # .get keeps this working for plain dicts as well as Counters.
        token_prob = token_probs.get(token, 0)
        if token_prob == 0:
            posterior *= unseen_prob
        else:
            posterior *= token_prob

    return posterior

def classify_nb(tweet):
    """Pick the most likely category for a tweet.

    Scores the tweet against each per-category model held in the notebook's
    global variables (prob_* and token_probs_*) using get_posterior_prob, and
    returns the label with the highest score. When scores tie, the first
    label in the order Food, Water, Energy, Medical, None wins.

    Input:
        tweet
    Output:
        string equal to most-likely category for this tweet
    """
    # (label, score) pairs, listed in tie-break priority order.
    scored_labels = (
        ('Food', get_posterior_prob(tweet, prob_food, token_probs_food)),
        ('Water', get_posterior_prob(tweet, prob_water, token_probs_water)),
        ('Energy', get_posterior_prob(tweet, prob_energy, token_probs_energy)),
        ('Medical', get_posterior_prob(tweet, prob_medical, token_probs_medical)),
        ('None', get_posterior_prob(tweet, prob_none, token_probs_none)),
    )

    # max() returns the first maximal item, which preserves the priority order.
    best_label, _ = max(scored_labels, key=lambda pair: pair[1])
    return best_label

In [7]:
# Fit one (prior, token-probability) model per category on the training tweets.
# classify_nb reads these names as globals; each calc_probs call also prints
# the category's prior.
(
    (prob_food, token_probs_food),
    (prob_water, token_probs_water),
    (prob_energy, token_probs_energy),
    (prob_medical, token_probs_medical),
    (prob_none, token_probs_none),
) = [
    calc_probs(tweets, label)
    for label in ("Food", "Water", "Energy", "Medical", "None")
]

Class Food has prior probability 0.47
Class Water has prior probability 0.09
Class Energy has prior probability 0.12
Class Medical has prior probability 0.04
Class None has prior probability 0.28


## Evaluate the Naive Bayes classifier

In [8]:
# Classify every test tweet, keeping each tweet paired with its predicted
# category, then show true and predicted labels side by side in a table.
predictions = [
    (test_tweet, classify_nb(test_tweet))
    for test_tweet in test_tweets
]
lib.show_predictions(predictions)



Unnamed: 0,Text,True category,Predicted category
0,"i have a lot of canned goods and some clothing , but i can also buy and bring things as needed . please let me know what you need most .",Food,Food
1,how the **** am i supposed to get @ meekmill new album when i ai n't got power ? **** outaaa here sandy !,Energy,Energy
2,frankenstorm wo n't stop the bean ! thx for staying open for the neighbors who need coffee and treat ! ( @ the bean ) http : //t.co/zw7oa0tq,Food,
3,deodorant toothpaste shampoo/conditioner baby shampoo kids toothbrush bar soap mouth wash q-tips painkillers,Medical,Medical
4,"clothes for baby kids woman men , food , non perishable food , tools , toys , paper , furniture , any products ,",Food,Food
5,"i have blankets , socks , non perishables , baby wipes , diapers",Food,Food
6,"i can bring some clothing , non perishables , hygiene products and some baby supplies",Food,Food
7,oyster creek power plant is on alert for flooding ... . it 's about 80 miles away . great . # sandy,,
8,nonperishable food hygiene products temporary shelter,Food,Food
9,soo it 's almost 2 am and people are still waiting on that line to get gas.. it 's the shortest line i 've seen though -__- # gas # sandy # nyc,Energy,


In [9]:
# Get the average F1 score on the test set.
# predictions is a list of (tweet, predicted label) pairs, one per test tweet.
predictions = [
    (test_tweet, classify_nb(test_tweet))
    for test_tweet in test_tweets
]
lib.evaluate(predictions)

Energy
Precision:  50.0
Recall:  60.0
F1:  54.54545454545455

Food
Precision:  83.56164383561644
Recall:  94.57364341085271
F1:  88.72727272727272

Medical
Precision:  85.71428571428571
Recall:  46.15384615384615
F1:  60.0

None
Precision:  82.85714285714286
Recall:  73.41772151898734
F1:  77.85234899328859

Water
Precision:  80.0
Recall:  40.0
F1:  53.333333333333336

Average F1:  66.89168191986984


In [24]:
# Get the average F1 score on the TRAINING set.
# Compare it with the average F1 for the test set above. What's the reason for the difference?

# A list of (tweet, predicted label) pairs, one per training tweet.
trainset_predictions = [
    (train_tweet, classify_nb(train_tweet))
    for train_tweet in tweets
]
lib.evaluate(trainset_predictions)

Energy
Precision:  95.77464788732394
Recall:  98.55072463768116
F1:  97.14285714285714

Food
Precision:  98.09885931558935
Recall:  97.72727272727273
F1:  97.91271347248579

Medical
Precision:  95.65217391304348
Recall:  100.0
F1:  97.77777777777777

None
Precision:  96.44012944983818
Recall:  96.44012944983818
F1:  96.44012944983818

Water
Precision:  100.0
Recall:  96.03960396039604
F1:  97.97979797979797

Average F1:  97.45065516455136


In [11]:
# Show the confusion matrix for `predictions` (whatever was last assigned to that name).
# NOTE(review): from the saved output, rows look like true categories and
# columns like predicted categories — confirm against lib.show_confusion_matrix.
lib.show_confusion_matrix(predictions)

Unnamed: 0,Energy,Food,Medical,None,Water
Energy,24,8,0,7,1
Food,2,122,0,4,1
Medical,3,4,6,0,0
,18,2,1,58,0
Water,1,10,0,1,8


In [23]:
# Repeat the experiment with lib's own learn_nb / classify_nb.
# NOTE: this rebinds `tweets`, `test_tweets` and `predictions`.
# read_data() is called without paths, so it uses whatever defaults lib defines
# (presumably the same files as above — verify).
tweets, test_tweets = lib.read_data()
prior_probs, token_probs = lib.learn_nb(tweets)

# A list of (tweet, predicted label) pairs, one per test tweet.
predictions = [
    (test_tweet, lib.classify_nb(test_tweet, prior_probs, token_probs))
    for test_tweet in test_tweets
]
lib.evaluate(predictions)

Energy
Precision:  62.16216216216216
Recall:  57.5
F1:  59.74025974025974

Food
Precision:  85.0
Recall:  92.24806201550388
F1:  88.47583643122677

Medical
Precision:  75.0
Recall:  46.15384615384615
F1:  57.14285714285714

None
Precision:  79.01234567901234
Recall:  81.0126582278481
F1:  80.00000000000001

Water
Precision:  73.33333333333333
Recall:  55.0
F1:  62.85714285714286

Average F1:  69.64321923429729


In [15]:
# Show the confusion matrix for `predictions` (whatever was last assigned to that name).
# NOTE(review): run this after the cell that assigns `predictions`, otherwise
# the matrix reflects an earlier assignment.
lib.show_confusion_matrix(predictions)

Unnamed: 0,Energy,Food,Medical,None,Water
Energy,23,10,0,6,1
Food,3,118,1,6,1
Medical,2,5,6,0,0
,16,2,1,60,0
Water,1,8,0,1,10


In [19]:
# Score lib's classifier on the training tweets themselves, for comparison
# with its test-set scores.
predictions_train = [
    (train_tweet, lib.classify_nb(train_tweet, prior_probs, token_probs))
    for train_tweet in tweets
]
lib.evaluate(predictions_train)

Energy
Precision:  95.83333333333333
Recall:  100.0
F1:  97.87234042553192

Food
Precision:  99.23518164435947
Recall:  98.29545454545455
F1:  98.76308277830638

Medical
Precision:  100.0
Recall:  100.0
F1:  100.0

None
Precision:  98.38709677419355
Recall:  98.70550161812298
F1:  98.54604200323104

Water
Precision:  100.0
Recall:  98.01980198019803
F1:  99.0

Average F1:  98.83629304141387
