In [30]:
import json
import re
from pprint import pprint

import nltk
import plotly
import pandas as pd

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, word_tokenize

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib

from flask import Flask, render_template, request, jsonify, redirect
from flask_sqlalchemy import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect


class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer producing one boolean feature per text.

    The feature is True when at least one sentence of the text starts with a
    token tagged 'VB' or 'VBP' (or equal to 'RT'), and False otherwise.
    """

    def starting_verb(self, text):
        """Return True if any sentence in ``text`` opens with a verb.

        Args:
            text: the message to inspect (a string).

        Returns:
            bool: True as soon as one sentence's first token is tagged
            'VB'/'VBP' or equals 'RT'; False if no sentence qualifies.
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            if not pos_tags:
                # A sentence that yields no tokens has no first word;
                # indexing pos_tags[0] would raise IndexError, so skip it.
                continue
            first_word, first_tag = pos_tags[0]
            # NOTE(review): tokenize() in this file lower-cases every token,
            # so the 'RT' comparison presumably never matches - confirm
            # against the training code before changing it.
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every item of ``X``.

        Args:
            X: iterable of texts; it is wrapped in a ``pd.Series``.

        Returns:
            pd.DataFrame: a single-column frame of the per-text results.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)


def tokenize(text):
    """Normalise a message into a list of cleaned tokens.

    Every URL matched by the pattern below is replaced with the literal
    "urlplaceholder"; the text is then split with ``word_tokenize`` and each
    token is lemmatized, lower-cased and stripped of surrounding whitespace.

    Args:
        text: the raw message (a string).

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    # Raw string: the pattern contains `\(` and `\)`, which are invalid
    # escape sequences in a normal string literal (the regex is unchanged).
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # replace each detected webpage address with a placeholder
    for url in re.findall(url_regex, text):
        text = text.replace(url, "urlplaceholder")

    # tokenize, then lemmatize, turn into lowercase and strip spaces
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]


# Connect to the SQLite file under ./PY (echo=True turns on SQLAlchemy's
# statement logging) and bind an ORM session to that engine.
engine = create_engine("sqlite:///./PY/DisasterResponse.db", echo=True)
session = Session(bind=engine)

In [32]:
# Rebind `engine` to the database file in the working directory and read the
# whole `disaster_response` table into a DataFrame.
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table(table_name='disaster_response', con=engine)

In [45]:
# Number of messages per genre, as a plain {genre: count} dict.
cans = df["genre"].value_counts().to_dict()

# `pprint` is imported in the notebook's top import cell.
pprint(cans)

{'direct': 10747, 'news': 13036, 'social': 2394}


In [51]:
# Look up the message count recorded for the "news" genre.
cans["news"]

13036

In [40]:
# Non-null `message` count for each genre (Series indexed by genre).
genre_counts = df.groupby('genre')['message'].count()

In [53]:

# Column labels from position 4 onwards, and for each of those columns the
# number of rows whose value is non-zero.
category_names = df.columns[4:]
category_boolean = df.iloc[:, 4:].ne(0).sum().to_numpy()

# Show the labels, a separator line, then the counts.
print(category_names, "-----------------------", category_boolean, sep="\n")

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')
-----------------------
[20062  4464   117 10840  2080  1310   724   471   859     0  1669  2917
  2308   404   603   298   874  1192  3441  1705  1199  1331   532   159
   283   120   309  1151  7286  2149  2440   282  2452   528  1376  5063]


In [61]:
# One-row frame: a column per category holding its non-zero count.
facts = pd.DataFrame(category_boolean.reshape(1, -1), columns=category_names)
facts

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,20062,4464,117,10840,2080,1310,724,471,859,0,...,309,1151,7286,2149,2440,282,2452,528,1376,5063


In [66]:
# Print every category name followed by its count.
for thing in facts:
    print(thing)
    # `facts` has a single row; take that scalar explicitly, because calling
    # int() on a one-element Series is deprecated in recent pandas.
    print(int(facts[thing].iloc[0]))

related
20062
request
4464
offer
117
aid_related
10840
medical_help
2080
medical_products
1310
search_and_rescue
724
security
471
military
859
child_alone
0
water
1669
food
2917
shelter
2308
clothing
404
money
603
missing_people
298
refugees
874
death
1192
other_aid
3441
infrastructure_related
1705
transport
1199
buildings
1331
electricity
532
tools
159
hospitals
283
shops
120
aid_centers
309
other_infrastructure
1151
weather_related
7286
floods
2149
storm
2440
fire
282
earthquake
2452
cold
528
other_weather
1376
direct_report
5063
