# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/docs/reference/api/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
!pip install pydot

Collecting pydot
  Downloading https://files.pythonhosted.org/packages/33/d1/b1479a770f66d962f545c2101630ce1d5592d90cb4f083d38862e93d16d2/pydot-1.4.1-py2.py3-none-any.whl
Installing collected packages: pydot
Successfully installed pydot-1.4.1


In [2]:
!pip install pyvis

Collecting pyvis
  Downloading https://files.pythonhosted.org/packages/7d/7e/df88acbe771afb1fe69b64516d2a56be46befab3e04cfd4258dc6063f96a/pyvis-0.1.7.0-py3-none-any.whl
Installing collected packages: pyvis
Successfully installed pyvis-0.1.7.0


In [126]:
# basic data libraries
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine

# scikit-learn modules for pipelining, transformation, model fitting and classification
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report

# nltk-modules for text processing, tokenizing and lemmatizing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Download the NLTK resources used for tokenizing, stop-word removal and lemmatizing
nltk.download(["punkt", "stopwords", "wordnet"])

# pickle for python object serialization and storing
import pickle
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pydot
import networkx
import pyvis

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
import sklearn.metrics

In [2]:
# Show the installed pandas version; DataFrame.explode, used further down,
# requires pandas 0.25 or newer.
pd.__version__

'1.0.0'

In [292]:
# Read the "messages" table from the local SQLite database into a DataFrame.
engine = create_engine('sqlite:///crisisresponse.db')
df = pd.read_sql_table('messages', engine)

# Features: the raw message text.
X = df["message"]
# Targets: the columns at positions 4..39, selected by position.
Y = df.iloc[:, 4:40]

In [293]:
from functools import lru_cache

# Every character that is not a letter or a digit (underscore included).
# Raw string: "\W" in a normal string literal is an invalid escape sequence.
_NON_ALNUM_RE = re.compile(r"[\W_]")


@lru_cache(maxsize=None)
def _tokenize_resources():
    """Build the English stop-word set and the lemmatizer once and reuse them."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def tokenize(text):
    """
    Normalize a text and split it into lemmatized, stop-word-free tokens.

    Processing steps: replace every non-alphanumeric character (and "_")
    with a space, lower-case the text, split it into word tokens, drop
    English stop words and lemmatize what remains.

    Input arguments:
        text: Single string with input text
              Example: 'For today:= this is, a advanced _ example #- String!'

    Output:
        output: List of processed tokens (empty list for empty input)
                Example: ['today', 'advanced', 'example', 'string']

    """
    # strip punctuation first, then lower-case so stop-word matching works
    text = _NON_ALNUM_RE.sub(" ", text).lower()

    # the stop-word list and lemmatizer are cached: this function is applied
    # to every message, and re-reading the corpus on each call is wasted work
    stop_words, lemmatizer = _tokenize_resources()

    return [lemmatizer.lemmatize(w) for w in word_tokenize(text) if w not in stop_words]

In [334]:
# Tokenize every message and store the token lists as a new column on df.
df["tok_message"] = df["message"].apply(tokenize)

# Token-level frame: one row per token of each message flagged as related,
# without the text/id columns and with a fresh running "index" column.
df_tok = (
    df.explode("tok_message")
    .drop(columns=["message", "original", "id"])
    .loc[lambda frame: frame["related"] == 1]
    .drop(columns=["related"])
    .reset_index(drop=True)
    .reset_index()
)

In [335]:
# (rows, columns) of the token-level frame.
df_tok.shape

(312195, 38)

In [336]:
# Indicator columns to unpivot, selected by position (columns 2..35 of df_tok).
# NOTE(review): df_tok has 38 columns, so this slice stops before the last two.
# One of those is presumably tok_message; confirm the other is not a category
# column that should be included.
var_columns = df_tok.columns[2:36]

# Long format: one row per (token row, category), keeping only the pairs
# where the category indicator is set.
df_tok_melt = (
    df_tok
    .melt(id_vars=['index', 'tok_message'], value_vars=var_columns)
    .loc[lambda frame: frame["value"] == 1]
)

In [339]:
# (rows, columns) of the long-format token/category frame after filtering.
df_tok_melt.shape

(1230331, 4)

In [295]:
from matplotlib import cm
from matplotlib import colors
from scipy import interpolate

In [296]:
# Linear map from the range 0..15 onto [0, 1]; the graph cell feeds category
# node ids through it to pick a position on the colormap.
cat_interpol = interpolate.interp1d(x=[0, 15], y=[0, 1])


In [301]:
# `import pyvis` only binds the package name; the Network class has to be
# imported explicitly or this cell fails with NameError on a fresh kernel.
from pyvis.network import Network

# How many of the most frequent categories / words become nodes.
# NOTE: cat_interpol is defined on 0..15, so raising N_TOP_CATEGORIES above 15
# requires widening that interpolation range as well.
N_TOP_CATEGORIES = 15
N_TOP_WORDS = 100

g = Network("1000px", "1500px", notebook=True)
g.hrepulsion(central_gravity=6.55, spring_length=620, node_distance=465, damping=1)
# To tune the layout interactively, enable: g.show_buttons(filter_=['physics'])

# Category nodes get ids 1..N_TOP_CATEGORIES (value_counts is already sorted
# by descending frequency).
cat_dict = {}
node_index = 0
for cat, value in df_tok_melt["variable"].value_counts().head(N_TOP_CATEGORIES).items():
    node_index += 1
    cat_dict[cat] = node_index
    g.add_node(node_index, value=value*10000, title="Category {}: {}".format(cat, value), label=cat)

# Word nodes continue the id numbering after the category nodes.
word_dict = {}
for word, value in df_tok_melt["tok_message"].value_counts().head(N_TOP_WORDS).items():
    node_index += 1
    word_dict[word] = node_index
    g.add_node(node_index, value=value*1000, title="Word: {}: {}".format(word, value), color="red", label=word)

# Single-row frame whose columns are (word, category) pairs holding how often
# the word occurs in rows of that category.
word_per_cat = pd.DataFrame(df_tok_melt.groupby("tok_message")["variable"].value_counts()).T

# The edge colour depends only on the category, so compute it once per category
# instead of once per edge.
rainbow = cm.get_cmap("rainbow")
cat_colors = {
    cat: colors.to_hex(rainbow(cat_interpol(cat_node)))
    for cat, cat_node in cat_dict.items()
}

# Never populated in this cell; kept because it is a notebook-level name.
edge_dict = {}
for word, word_node in word_dict.items():
    # Select the single row by position so the lookup does not depend on the
    # label pandas assigns to the count row.
    counts_by_cat = word_per_cat[word].iloc[0]
    for cat, count in counts_by_cat.items():
        # Only the top categories have nodes; skip the rest explicitly instead
        # of hiding every possible error behind a bare `except`.
        if cat not in cat_dict:
            continue
        g.add_edge(cat_dict[cat], word_node, width=int(count / 50), color=cat_colors[cat])

g.save_graph("graph_disaster_response.html")



In [313]:
# Labels of all df columns from position 5 onward.
cols = df.columns[5:]
# Long format of the message-level data: one row per (message, column in
# var_columns), with that column's entry in "value".
df_melt = pd.melt(df, id_vars=['message'], value_vars=var_columns)


In [325]:
# Preview the first rows of the long-format message/category frame.
df_melt.head()

Unnamed: 0,message,variable,value
0,Weather update - a cold front from Cuba that c...,request,0
1,Is the Hurricane over or is it not over,request,0
2,Looking for someone but no name,request,0
3,UN reports Leogane 80-90 destroyed. Only Hospi...,request,1
4,"says: west side of Haiti, rest of the country ...",request,0


In [320]:
# Column-wise concat aligns the two inputs on their row index, so the result
# has as many rows as the longer input (df_melt) and one column more than df;
# it does not attach one category label per original message.
pd.concat([df,df_melt["variable"]], axis=1).shape

(884952, 42)

In [329]:
# (rows, columns) of the message-level frame.
df.shape

(26028, 41)

In [326]:
# (rows, columns) of the long-format frame: one row per message and melted column.
df_melt.shape

(884952, 3)

In [345]:
# Number of rows with the indicator set (value == 1) per category, most
# frequent category first.
bla = df_melt.loc[df_melt["value"] == 1, "variable"].value_counts()

In [346]:
# Category names, ordered by descending count.
bla.index

Index(['aid_related', 'weather_related', 'request', 'other_aid', 'food',
       'earthquake', 'storm', 'shelter', 'floods', 'medical_help',
       'infrastructure_related', 'water', 'other_weather', 'buildings',
       'medical_products', 'transport', 'death', 'other_infrastructure',
       'refugees', 'military', 'search_and_rescue', 'money', 'electricity',
       'cold', 'security', 'clothing', 'aid_centers', 'missing_people',
       'hospitals', 'fire', 'tools', 'shops', 'offer'],
      dtype='object')

In [342]:
# Number of non-null messages per genre.
df.groupby('genre')['message'].count()

genre
direct    10634
news      13036
social     2358
Name: message, dtype: int64