## Notebook Imports

In [1]:
import pandas as pd
import numpy as np

In [70]:
TRAINING_DATA_FILE = 'KAGGLE/train-data.txt'
TEST_DATA_FILE = 'KAGGLE/test-data.txt'

DISASTER_TWEET_PROB_FILE = 'KAGGLE/prob-disaster.txt'
NON_DISASTER_TWEET_PROB_FILE = 'KAGGLE/prob-nondisaster.txt'
TOKEN_ALL_PROB_FILE = 'KAGGLE/prob-all-tokens.txt'

TEST_FEATURE_MATRIX = 'KAGGLE/test-features.txt'
TEST_TARGET_FILE = 'KAGGLE/test-target.txt'

VOCAB_SIZE = 250

## Read and Load Features from .txt Files into NumPy Array

In [71]:
sparse_train_data = np.loadtxt(TRAINING_DATA_FILE, delimiter=' ', dtype=int)
sparse_test_data = np.loadtxt(TEST_DATA_FILE, delimiter=' ', dtype=int)

In [72]:
sparse_train_data[:5]

array([[  1,   2,   1,   1],
       [  1, 140,   1,   1],
       [  1, 186,   1,   1],
       [  2,  32,   1,   1],
       [  2, 222,   1,   1]])

In [73]:
sparse_train_data[-5:]

array([[7611,   26,    1,    1],
       [7611,   52,    1,    1],
       [7611,   88,    1,    1],
       [7611,  110,    1,    1],
       [7611,  165,    1,    1]])

In [74]:
print('Nr of rows in training file', sparse_train_data.shape[0])
print('Nr of rows in test file', sparse_test_data.shape[0])

Nr of rows in training file 17398
Nr of rows in test file 7388


In [75]:
print('Nr of tweets in training file', np.unique(sparse_train_data[:, 0]).size)

Nr of tweets in training file 5130


In [76]:
print('Nr of tweets in test file', np.unique(sparse_test_data[:, 0]).size)

Nr of tweets in test file 2205


### Create an Empty DataFrame

In [77]:
column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
column_names[:10]

['DOC_ID', 'CATEGORY', 0, 1, 2, 3, 4, 5, 6, 7]

In [78]:
len(column_names)

252

In [79]:
index_names = np.unique(sparse_train_data[:, 0])
index_names

array([   1,    2,    3, ..., 7609, 7610, 7611])

In [80]:
full_train_data = pd.DataFrame(index=index_names, columns=column_names)
full_train_data.fillna(value=0, inplace=True)

In [81]:
full_train_data

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,240,241,242,243,244,245,246,247,248,249
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7607,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
sparse_train_data[10:13]

array([[  4, 101,   1,   1],
       [  4, 127,   1,   1],
       [  4, 139,   1,   1]])

## Create a Full Matrix from a Sparse Matrix

In [83]:
def make_full_matrix(sparse_matrix, nr_words, doc_idx=0, word_idx=1, cat_idx=2, freq_idx=3):
    """
    Form a full matrix from a sparse matrix. Return a pandas dataframe. 
    Keyword arguments:
    sparse_matrix -- numpy array
    nr_words -- size of the vocabulary. Total number of tokens. 
    doc_idx -- position of the document id in the sparse matrix. Default: 1st column
    word_idx -- position of the word id in the sparse matrix. Default: 2nd column
    cat_idx -- position of the label (spam is 1, nonspam is 0). Default: 3rd column
    freq_idx -- position of occurrence of word in sparse matrix. Default: 4th column
    """
    column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
    doc_id_names = np.unique(sparse_matrix[:, 0])
    full_matrix = pd.DataFrame(index=doc_id_names, columns=column_names)
    full_matrix.fillna(value=0, inplace=True)
    
    for i in range(sparse_matrix.shape[0]):
        doc_nr = sparse_matrix[i][doc_idx]
        word_id = sparse_matrix[i][word_idx]
        label = sparse_matrix[i][cat_idx]
        occurrence = sparse_matrix[i][freq_idx]
        
        full_matrix.at[doc_nr, 'DOC_ID'] = doc_nr
        full_matrix.at[doc_nr, 'CATEGORY'] = label
        full_matrix.at[doc_nr, word_id] = occurrence
    
    full_matrix.set_index('DOC_ID', inplace=True)
    return full_matrix
    

In [84]:
%%time
full_train_data = make_full_matrix(sparse_train_data, VOCAB_SIZE)

Wall time: 693 ms


In [85]:
full_train_data.head()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,240,241,242,243,244,245,246,247,248,249
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Training the Naive Bayes Model
### Calculating the Probability that the tweet is a disaster tweet

In [86]:
full_train_data.CATEGORY.size

5130

In [87]:
full_train_data.CATEGORY.sum()

2259

In [88]:
prob_disaster_tweet = full_train_data.CATEGORY.sum() / full_train_data.CATEGORY.size
print('Probability of being a disaster tweet is', prob_disaster_tweet)

Probability of being a disaster tweet is 0.44035087719298244


### Total Number of Words / Tokens

In [89]:
full_train_features = full_train_data.loc[:, full_train_data.columns != 'CATEGORY']
full_train_features.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
tweet_lengths = full_train_features.sum(axis=1)
tweet_lengths.shape

(5130,)

In [91]:
tweet_lengths[:5]

DOC_ID
1    3
2    2
3    4
4    5
5    4
dtype: int64

In [92]:
total_wc = tweet_lengths.sum()
total_wc

18535

## Number of Tokens in Disaster & NonDisaster Tweets

**Next:** We create a subset of the tweet_lengths series that only contains the disaster tweets. We call the subset disaster_tweet_lengths. Then we count the total number of words that occur in disaster tweets. 

We then do same for the non-disaster tweets. We create a subset called non_disaster_tweet_lengths. Then, we count the total number of words that occur in the non-disaster tweets. 

In [93]:
disaster_tweet_lengths = tweet_lengths[full_train_data.CATEGORY == 1]
disaster_tweet_lengths.shape

(2259,)

In [94]:
non_disaster_tweet_lengths = tweet_lengths[full_train_data.CATEGORY == 0]
non_disaster_tweet_lengths.shape

(2871,)

In [95]:
disaster_wc = disaster_tweet_lengths.sum()
disaster_wc

9170

In [96]:
tweet_lengths.shape[0] - disaster_tweet_lengths.shape[0] - non_disaster_tweet_lengths.shape[0]

0

In [97]:
nondisaster_wc = non_disaster_tweet_lengths.sum()
nondisaster_wc

9365

In [98]:
nondisaster_wc + disaster_wc - total_wc

0

### Summing the Tokens Occuring in Disaster Tweets

In [99]:
train_disater_tokens = full_train_features.loc[full_train_data.CATEGORY == 1]
train_disater_tokens.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
train_disater_tokens.tail()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7606,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7607,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7609,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7610,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
summed_disaster_tokens = train_disater_tokens.sum(axis=0) + 1
summed_disaster_tokens.shape

(250,)

In [102]:
summed_disaster_tokens.tail()

245    10
246     9
247    14
248    22
249    37
dtype: int64

### Summing the Tokens Occuring in the Non-disaster tweets

Next we repeat this process for the non disaster tweets. Sum the tokens that occur in the 
non-disaster tweets and store the values in a variable called summed_nondisaster_tokens. 

In [103]:
train_nondisaster_tokens = full_train_features.loc[full_train_data.CATEGORY == 0]
summed_nondisaster_tokens = train_nondisaster_tokens.sum(axis=0) + 1

In [104]:
summed_nondisaster_tokens.shape

(250,)

In [105]:
summed_nondisaster_tokens.tail()

245    24
246    21
247    22
248    16
249     1
dtype: int64

In [106]:
train_nondisaster_tokens[249].sum() + 1

1

### P(Token | Disaster) - Probability that a Token Occurs given the Tweet is Disastrous

In [107]:
prob_tokens_disaster = summed_disaster_tokens / (disaster_wc + VOCAB_SIZE)
prob_tokens_disaster[:5]

0    0.187686
1    0.007856
2    0.019533
3    0.007113
4    0.013907
dtype: float64

In [108]:
prob_tokens_disaster.sum()

0.9999999999999999

### P(Token | Non_Disater) - Probability that a Token Occurs given the Tweet is Non-Disastrous

In [109]:
prob_tokens_nondisaster = summed_nondisaster_tokens / (nondisaster_wc + VOCAB_SIZE)
prob_tokens_nondisaster[:5]

0    0.160478
1    0.021841
2    0.007176
3    0.016017
4    0.003640
dtype: float64

In [110]:
prob_tokens_nondisaster.sum()

1.0

### P(Token) - Probability that Token Occurs 

In [111]:
prob_tokens_all = full_train_features.sum(axis=0) / total_wc
prob_tokens_all

0      0.178527
1      0.015214
2      0.013542
3      0.011815
4      0.008848
         ...   
245    0.001726
246    0.001511
247    0.001834
248    0.001942
249    0.001942
Length: 250, dtype: float64

In [112]:
prob_tokens_all.sum()

1.0

## Save the Trained Model

In [113]:
np.savetxt(DISASTER_TWEET_PROB_FILE, prob_tokens_disaster)
np.savetxt(NON_DISASTER_TWEET_PROB_FILE, prob_tokens_nondisaster)
np.savetxt(TOKEN_ALL_PROB_FILE, prob_tokens_all)

## Prepare Test Data

In [114]:
sparse_test_data.shape

(7388, 4)

In [115]:
%%time
full_test_data = make_full_matrix(sparse_test_data, nr_words=VOCAB_SIZE)

Wall time: 314 ms


In [116]:
X_test = full_test_data.loc[:, full_test_data.columns != 'CATEGORY']
y_test = full_test_data.CATEGORY

In [117]:
np.savetxt(TEST_TARGET_FILE, y_test)
np.savetxt(TEST_FEATURE_MATRIX, X_test)