In [92]:
%matplotlib inline
import re
import string
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import sys
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns

In [93]:
# Load the raw training corpus and the "disaster" validation use case.
# Both files are decoded as ISO-8859-1 and their text columns are read with
# pandas' nullable "string" dtype.
dataset_unprocessed = pd.read_csv(
    '../Datasets/combined_dataset_all.csv',
    encoding="ISO-8859-1",
    dtype=dict(split="string", message="string", genre="string"),
)
val_yes_disater_unprocessed = pd.read_csv(
    '../Datasets/ValidationUseCaseDisaster.csv',
    encoding="ISO-8859-1",
    dtype=dict(message="string", date="string", location="string"),
)

In [94]:
# Recode every value 2 in the raw frame as 0 so the labels become binary.
# NOTE(review): presumably 2 is an alternative encoding of "no" - confirm
# against the dataset documentation.
dataset = dataset_unprocessed.replace(to_replace=2, value=0)

In [95]:
# Map the textual labels of the disaster validation set onto the numeric
# encoding used for training: 'off-topic' -> 0, 'on-topic' -> 1.
val_yes_disater = val_yes_disater_unprocessed.replace({'off-topic': 0, 'on-topic': 1})

In [96]:
# val_yes_disater = val_yes_disater_unprocessed.drop(columns = ['related'])

In [97]:
# Display up to the first 20 rows of the relabelled validation frame as a
# visual sanity check of the label replacement above.
val_yes_disater.head(20)

Unnamed: 0,message,related,date,location
0,I've got enough candles to supply a Mexican fa...,0,28/10/2012,"New York City, New York"
1,Sandy be soooo mad that she be shattering our ...,1,28/10/2012,"New York City, New York"
2,@ibexgirl thankfully Hurricane Waugh played it...,0,28/10/2012,"New York City, New York"
3,@taos you never got that magnificent case of B...,0,28/10/2012,"New York City, New York"
4,"I'm at Mad River Bar &amp; Grille (New York, N...",0,28/10/2012,"New York City, New York"
5,Neighborly duties. @Cory_Kennedy arrives to th...,1,28/10/2012,"New York City, New York"
6,And that's it until the spring.,0,28/10/2012,"New York City, New York"
7,I don't know how I'm getting back to Jersey si...,1,28/10/2012,"New York City, New York"
8,@NaeemPeena We were asked to get off the plane...,0,28/10/2012,"New York City, New York"
9,@jaytee_96 you must be crazy! &amp; omg you tw...,0,28/10/2012,"New York City, New York"


In [98]:
# (rows, columns) of the validation frame before any predictions are added.
val_yes_disater.shape

(10008, 4)

In [99]:
# Keep only the text ('message') and its label ('related'); every other column
# is dropped before model training.
# `columns=` is used instead of a positional axis argument: DataFrame.drop
# accepts only `labels` positionally in pandas 2.x, so `drop(labels, 1)` raises
# a TypeError there. A new frame is bound instead of mutating in place.
dataset = dataset.drop(columns=dataset.columns.difference(['message', 'related']))

In [100]:
# Show 5 randomly chosen rows of the reduced frame. No random_state is passed,
# so the rows shown differ from run to run.
dataset.sample(5)

Unnamed: 0,message,related
1311,"Thank you for the answer, because I think we a...",1
12371,The Swedish Committee for Afghanistan (SCA) wi...,1
18614,"Omg, whats with Windows Vista today. Tried ins...",0
37543,"In some areas, floodwaters reached as high as ...",1
7756,These early rains were not nearly enough to ea...,1


In [101]:
# Peek at the message stored under index label 0 before any cleaning is applied.
dataset.loc[0, 'message']

'Weather update - a cold front from Cuba that could pass over Haiti'

In [102]:
# function to clean the data
# Everything below is built once when the cell runs instead of once per message.

# Contraction expansions, applied in order on lower-cased text.
# "'scuse" has to run before the generic "'s" rule; in the opposite order the
# "'s" rule strips the leading "'s" and "'scuse" can never match.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"what's", "what is "),
        (r"'scuse", " excuse "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
    )
)
# Raw-string pattern: the non-raw '\W' literal is an invalid escape sequence.
_NON_WORD_CHARS = re.compile(r"\W")
# \W does not match "_", so underscores are removed through this table.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
_PORTER_STEMMER = PorterStemmer()


def clean_text(text, stem=False):
    """Normalise one message into a space-separated string of content words.

    Steps: lower-case, expand common English contractions, replace every
    non-word character with a space, strip remaining punctuation from each
    token, keep purely alphabetic tokens, and drop NLTK English stop words.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (None, pd.NA, NaN, ...) is treated
        as a missing message and yields "" instead of raising.
    stem : bool, default False
        When True, each kept word is reduced to its Porter stem. The default
        keeps words unstemmed, which is what this function has always
        returned: the earlier implementation computed the stems and then
        discarded them.
        NOTE(review): if stemmed features were the intent, call with
        stem=True for both the training and the validation messages.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    text = _NON_WORD_CHARS.sub(' ', text)

    # str.split() already collapses whitespace runs and ignores leading or
    # trailing blanks, so no separate squeeze/strip pass is needed.
    tokens = (token.translate(_PUNCTUATION_TABLE) for token in text.split())
    words = [
        token for token in tokens
        if token.isalpha() and token not in _ENGLISH_STOP_WORDS
    ]
    if stem:
        words = [_PORTER_STEMMER.stem(word) for word in words]
    return " ".join(words)

In [103]:
# Run the text normaliser over every training message.
dataset['message'] = dataset['message'].map(clean_text)

In [104]:
# Run the same text normaliser over every validation message so both sets
# share one vocabulary treatment.
val_yes_disater['message'] = val_yes_disater['message'].map(clean_text)

In [105]:
# Target columns the classifier is fitted against (a single label here).
categories = ['related']

# Shuffled 90/10 split with a fixed seed so the partition is repeatable.
train, test = train_test_split(dataset, test_size=0.10, shuffle=True, random_state=42)

# Features/labels for fitting, plus the validation messages to score later.
X_train, y_train = train['message'], train['related']
X_validate_yes = val_yes_disater['message']
# y_validate_yes = val_yes_disater.related

# Empty accumulators that later cells append to.
predictionsArray = np.empty(0)
targetValuesArray = np.empty(0)

print(X_train.shape, y_train.shape, sep="\n")

(34436,)
(34436,)


In [106]:
# adding all the VALIDATION YES data to an array used for classification report
# for column in val_yes_disater[['related']]:
#     columnSeriesObj = val_yes_disater[column]
#     targetValuesArray = np.append(targetValuesArray, columnSeriesObj.values)

# print(targetValuesArray)

In [107]:
# Pipeline: TF-IDF text features feeding a logistic-regression classifier.
# (The NB_ prefix is kept for compatibility with later cells; the estimator
# used here is LogisticRegression, not naive Bayes.)
NB_pipeline = Pipeline([
    # scikit-learn's parameter validation accepts 'english', a list or None
    # for `stop_words`; passing the module-level set raises
    # InvalidParameterError on recent releases, so it is converted to a list.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', LogisticRegression()),
])

# Start from an empty accumulator so re-running this cell does not stack a
# second copy of the predictions (which would no longer match the number of
# validation rows when assigned as a column later).
predictionsArray = np.array([])

for category in categories:
    print('... Processing {}'.format(category))
    # Fit on the training messages against the current label column.
    NB_pipeline.fit(X_train, train[category])
    # Score the validation messages; swap in another message Series here to
    # evaluate a different set.
    prediction = NB_pipeline.predict(X_validate_yes)
    predictionsArray = np.append(predictionsArray, prediction)
    print("Done! Predictions Complete")
#     print('Test accuracy is {}'.format('%.2f' % accuracy_score(val_yes_disater[category], prediction)))

... Processing related
Done! Predictions Complete


In [108]:
# Report how many predictions were collected (one per validation message).
print("Predictions", predictionsArray.shape, " ", sep="\n")

Predictions
(10008,)
 


In [109]:
# Decide whether enough tweets were classified as disaster-related to suggest
# that a disaster is occurring.
# Minimum number of positive predictions required to raise the alert.
DISASTER_TWEET_THRESHOLD = 3000

# Vectorised count of positive (== 1) predictions. Unlike an explicit
# `for prediction in ...` loop, this does not rebind the global `prediction`
# array produced by the model cell to a single scalar.
disasterRelatedTweets = int(np.count_nonzero(predictionsArray == 1))

if disasterRelatedTweets >= DISASTER_TWEET_THRESHOLD:
    print("A disaster is currently happening! ")
    print(disasterRelatedTweets)

A disaster is currently happening! 
6581


In [110]:
#  IF No, return "no disasters"
#  IF Yes : see below

In [111]:
# Attach a copy of the model output to the validation frame, one prediction
# per row, under the 'model_prediction' column.
val_yes_disater['model_prediction'] = predictionsArray.copy()

In [112]:
# (rows, columns) again: the column count should have grown by one now that
# 'model_prediction' has been added.
val_yes_disater.shape

(10008, 5)

In [113]:
# Keep only the rows the model flagged as disaster-related, i.e. those whose
# prediction is greater than 0.
is_disaster_related = val_yes_disater['model_prediction'] > 0
val_yes_disater_reduced = val_yes_disater[is_disaster_related]

In [114]:
# (rows, columns) of the filtered frame; the row count is the number of
# positively classified tweets.
val_yes_disater_reduced.shape

(6581, 5)

In [141]:
# Build two index-aligned arrays: the locations, and the number of
# disaster-related tweets for each of them.
#
# Both arrays are derived from ONE value_counts() result. Taking the locations
# from .unique() (first-appearance order) and the counts from .value_counts()
# (descending-frequency order) would pair entries by position even though the
# two orders are unrelated, and their lengths could differ because
# value_counts() drops missing locations while unique() keeps them.
location_counts = val_yes_disater_reduced['location'].value_counts()
ResultsLocationArray = location_counts.index.to_numpy(dtype=object)
ResultsLocationRatiosArray = location_counts.to_numpy(dtype=int)

# Accumulator for the combined result, filled by a later cell.
ResultsArray = np.array([])

In [142]:
# Print every location first, then every per-location tweet count, one value
# per line.
for result in [*ResultsLocationArray, *ResultsLocationRatiosArray]:
    print(result)

New York City, New York
Bergen County, New Jersey
Atlantic City, New Jersey
Essex, Massachusetts
Monmouth County, New Jersey
Middlesex County, New York
Cape May, New Jersey
1936
1332
1316
960
350
347
340


In [143]:
# Interleave the two arrays into ONE flat array of location, count,
# location, count, ...
# Each pair is prepended, so the pairs end up in the reverse of their input
# order. The result is 1-D (not 2-D), and because text and numbers are mixed
# NumPy stores the counts as strings.
# NOTE(review): the flat, reversed layout is preserved because a downstream
# consumer may rely on it - confirm before changing.
if len(ResultsLocationArray) != len(ResultsLocationRatiosArray):
    raise ValueError("location and count arrays must have the same length")

# Reset first so re-running this cell does not stack duplicate pairs.
ResultsArray = np.array([])
for location, tweet_count in zip(ResultsLocationArray, ResultsLocationRatiosArray):
    ResultsArray = np.append([location, tweet_count], ResultsArray)

In [144]:
# 'ResultsArray' is the payload handed to Lambda 1, which forwards it to the
# web application; here its entries are simply printed one per line.
for result in ResultsArray.flat:
    print(result)

Cape May, New Jersey
340
Middlesex County, New York
347
Monmouth County, New Jersey
350
Essex, Massachusetts
960
Atlantic City, New Jersey
1316
Bergen County, New Jersey
1332
New York City, New York
1936
