I did not find the gold labels for the test set, so the only way to check your performance on the test set is to make a submission on the [Kaggle website](https://www.kaggle.com/c/nlp-getting-started/submit). 
I made a function to create a submission file from a prediction vector, *download_predictions_to_file()*, feel free to use it. Alternatively, you can just check your performance on a subset of the training set. 

In [1]:
# some libraries
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB

In [2]:
import numpy as np
import pandas as pd 

data_url = 'https://www.math.unipd.it/~dasan/disaster/'
train_csv = pd.read_csv(data_url + 'train.csv', sep=",") 
test_csv = pd.read_csv(data_url + 'test.csv', sep=",") 


In [None]:
def download_predictions_to_file(id_array, predictions_array):
  # This function, given a numpy array with the ids of the test tweets and a numpy array with the corresponding predictions, 
  # creates and let's you download a prediction file suitable to be uploaded to the Kaggle website

  id_array = id_array.reshape(len(id_array),1) # making sure the arrays are of the size requested by hstack(), i.e. (x,1) instead of (x,) 
  predictions_array = predictions_array.reshape(len(predictions_array),1)

  from google.colab import files
  result = pd.DataFrame(np.hstack((id_array, predictions_array)), columns=['id','target'])
  result.to_csv('prediction_file.csv', index=False)
  files.download('prediction_file.csv')


In [3]:
#let's take a look at the data
print(train_csv.head())

Y_train = train_csv["target"].values 
X_train = train_csv.values[:, 1:4] # extracting the columns keyword, location and text

#let's print one example to check we picked the right columns
print(X_train[0])

#X_test_id = test_csv["id"].values # these are needed to build output prediction files if you want to submit your predictions to the Kaggle website (see format of the submission)

#print(Y_train.shape, X_test_id.shape, X_train.shape)

# create a dummy pessimistic predictor that always says DISASTER (class 1) 
#test_predictions = np.ones((len(X_test_id),1), dtype=int)

#download_predictions_to_file(X_test_id, test_predictions) # download the prediction file


   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
[nan nan
 'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all']


In [4]:
x_tweets = train_csv['text']
x_tweets.head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [5]:
tknzr = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
tweet_tokens = []
for sent in x_tweets:
    #print(tweet_tokenizer.tokenize(sent))
    tweet_tokens.append(tknzr.tokenize(sent))
print(tweet_tokens[:5])

[['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', '#earthquake', 'may', 'allah', 'forgive', 'us', 'all'], ['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada'], ['all', 'residents', 'asked', 'to', "'", 'shelter', 'in', 'place', "'", 'are', 'being', 'notified', 'by', 'officers', '.', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected'], ['13,000', 'people', 'receive', '#wildfires', 'evacuation', 'orders', 'in', 'california'], ['just', 'got', 'sent', 'this', 'photo', 'from', 'ruby', '#alaska', 'as', 'smoke', 'from', '#wildfires', 'pours', 'into', 'a', 'school']]


In [6]:
import nltk
import string
import re
from nltk.corpus import stopwords
english_stopwords = stopwords.words("english")
# print(english_stopwords)

In [7]:
# clean out stopwords, # signs, punctuation and numbers
clean_t2 = [[word if (word.startswith('#') == False) else re.sub(r'#', '', word) for word in tweet] for tweet in tweet_tokens]
clean_t3 = [[word for word in tweet if (word not in string.punctuation) and (word.isalpha() == True) and (word not in english_stopwords)] for tweet in clean_t2]

In [12]:
clean_t3[:2]

[['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']]

In [18]:
x_tweets = np.array(clean_t3)
x_tweets[:5]

array([list(['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us']),
       list(['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']),
       list(['residents', 'asked', 'shelter', 'place', 'notified', 'officers', 'evacuation', 'shelter', 'place', 'orders', 'expected']),
       list(['people', 'receive', 'wildfires', 'evacuation', 'orders', 'california']),
       list(['got', 'sent', 'photo', 'ruby', 'alaska', 'smoke', 'wildfires', 'pours', 'school'])],
      dtype=object)

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_tweets, Y_train, test_size=0.15, random_state=42)

In [20]:
X_train[:5]

array([list(['ûï', 'palestinian', 'homes', 'demolished', 'israel', 'since', 'w', 'israeli', 'home', 'demolished']),
       list(['rightways', 'building', 'structural', 'integrity', 'failure', 'inspections', 'damages', 'defects', 'testing', 'repair', 'via']),
       list(['london', 'life', 'photos', 'beautiful', 'britain', 'arts']),
       list(['civil', 'war', 'general', 'battle', 'bull', 'run', 'hero', 'colonel', 'new', 'hampshire', 'letter', 'signed']),
       list(['impossible', 'ww', 'like', 'survive', 'day', 'without', 'meat', 'wew'])],
      dtype=object)

In [27]:
x_tr = pd.DataFrame(X_train)
x_te = pd.DataFrame(X_test)

['ûï',
 'palestinian',
 'homes',
 'demolished',
 'israel',
 'since',
 'w',
 'israeli',
 'home',
 'demolished']

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer
tf_computer = TfidfVectorizer(use_idf=False, ngram_range=(1,1), stop_words='english')

#x_feat[2].apply(lambda x: tknzr_3.tokenize(x))
train_features = tf_computer.fit_transform(x_tr)
test_features = tf_computer.transform(x_te)

print("vocabulary size:", len(tf_computer.vocabulary_))

AttributeError: 'int' object has no attribute 'lower'

## Look at the information in the columns

In [None]:
# Check out the location column - is it useful for our purposes? 
# potentially remove this column from the features

#for i in range(len(x_df[1].unique())):
#    print(x_df[1].unique()[i])

# it also has 5080 non-missing values, 3341 of them unique
x_df = pd.DataFrame(X_train, columns = ['kw', 'loc', 'txt'])
x_df.head()

In [None]:
# Check out the keyword column - possibly a heavy factor for the classifier
x_df['kw'].unique()[:10]

In [None]:
x_df.describe()

Initially thought USA being the most frequent value for the location column might be something to look into, but the number of mentions of USA is only 104

In [None]:
sum(x_df['loc']=='USA')

In [None]:
# figure out what this nan value is
np.isnan(X_train[0][0])

In [None]:
x_df.isna().sum()

In [None]:
x_df_2 = pd.DataFrame(X_train)
x_feat = x_df_2.drop(labels = 1, axis = 'columns')
x_feat.head()

### Messy old code

In [None]:
#tweets = X_train[:,2]
#tweets[:2]

In [None]:
#tknzr = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
#tweet_tokens = []
#for sent in tweets:
#    #print(tweet_tokenizer.tokenize(sent))
#    tweet_tokens.append(tknzr.tokenize(sent))

In [None]:
##### for tokenizing the keywords ##### MIGHT BE USEFUL IF WE CHOOSE KEYWORD AS A FEATURE
# for i, value in enumerate(x_feat[0][:]):
#    if type(value) != float:
#        x_feat[0][i]=tknzr_3.tokenize(value)

In [None]:
#x_feat.head(6)

In [None]:
#x_feat_copy = x_feat.copy()

In [None]:
# remove hashtags but keep key words 
#for i, row in enumerate(x_feat[2]):
#    for w, text in enumerate(row):
#        #if text.startswith('#'):
#        #    x_feat[2][i][w] = text.strip('#')
#        if text.startswith('#'):
#            x_feat[2][i][w] = re.sub(r'#', '', text)
#print(x_feat[2][:10])
#for i, row in enumerate(x_feat[2]):
#    for w, text in enumerate(row):
#        if text in string.punctuation:
#            del x_feat[2][i][w]
#for i, row in enumerate(x_feat[2]):
#    for w, text in enumerate(row):
#        if text.isalpha() == False:
#            del x_feat[2][i][w]
#for i, row in enumerate(x_feat[2]):
#    for w, text in enumerate(row):
#        if text in english_stopwords:
#            del x_feat[2][i][w]
#print(x_feat[2][:10])

#clean_t1 = [re.sub(r'#', '', word) if word.startswith['#'] else word for word in tweet for tweet in tweet_tokens]
#clean_t = [word for word in tweet for tweet in tweet_tokens 
#           if (word not in string.punctuation) and (word.isalpha() == True) and (word not in english_stopwords)]
#for i, tweet in enumerate(tweet_tokens):
#    for word in tweet:
#        if word.startswith('#'):
#            clean_tw.append(re.sub(r'#', '', word))
#        elif (word not in string.punctuation) and (word.isalpha() == True) and (word not in english_stopwords):
#            clean_tw.append(word)

In [None]:
# remove punctuation and english stopwords
# doesn't work so well - skips some of the tokens
# and cleans them only if run 5-6 times
# check is in the next chunk
#for i, row in enumerate(x_feat[2]):
#    for w, text in enumerate(row):
#        if text.startswith('#'):
#            x_feat[2][i][w] = re.sub(r'#', '', text)
#        elif (text in string.punctuation) or (text.isalpha() == False) or (text in english_stopwords):
#            #del x_feat[2][i][w]
#            x_feat[2][i].remove(text)

In [None]:
# Test if the unwanted tokens are still present
#for i, row in enumerate(x_tweet):
#    for w, text in enumerate(row):
#        if text in string.punctuation or text.isalpha() == False or text in english_stopwords:
#            print(text)

### Check balance

In [None]:
# is the number of disaster vs not disaster tweets balanced? 
print('disaster tweets = {}, not disaster = {}'.format(sum(Y_train==1), sum(Y_train == 0)))

In [None]:
# Create the CountVectorizer DataFrame: count_df
#count_df = pd.DataFrame(count_train.A, columns = count_vectorizer.get_feature_names())


In [None]:
# Print the head of count_df
#print(count_df.head())


In [None]:
X_tr = x_feat[2].values

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# from sklearn.naive_bayes import MultinomialNB
#nb_classifier = MultinomialNB()
#nb_classifier.fit()

In [None]:
# TfidfVectorizer
tf_computer = TfidfVectorizer(use_idf=False, ngram_range=(1,1), stop_words='english')

#x_feat[2].apply(lambda x: tknzr_3.tokenize(x))
#train_features = X_tr.apply(lambda x: tf_computer.fit_transform(x))
train_features = tf_computer.fit_transform(x_tweet)
#test_features = tf_computer.transform(X_test)

print("vocabulary size:", len(tf_computer.vocabulary_))

In [None]:
train_features

In [None]:
# try the vectorization again