# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
!pip install pydot

Collecting pydot
  Downloading https://files.pythonhosted.org/packages/33/d1/b1479a770f66d962f545c2101630ce1d5592d90cb4f083d38862e93d16d2/pydot-1.4.1-py2.py3-none-any.whl
Installing collected packages: pydot
Successfully installed pydot-1.4.1


In [2]:
!pip install pyvis

Collecting pyvis
  Downloading https://files.pythonhosted.org/packages/7d/7e/df88acbe771afb1fe69b64516d2a56be46befab3e04cfd4258dc6063f96a/pyvis-0.1.7.0-py3-none-any.whl
Installing collected packages: pyvis
Successfully installed pyvis-0.1.7.0


In [126]:
# basic data libraries
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine

# scikit-learn modules for pipelining, transformation, model fitting and classification
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report

# nltk-modules for text processing, tokenizing and lemmatizing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Download relevant ntlk packages
nltk.download(["punkt", "stopwords", "wordnet"])

# pickle for python object serialization and storing
import pickle
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pydot
import networkx
import pyvis

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
import sklearn.metrics

In [2]:
pd.__version__

'1.0.0'

In [3]:
# load data from database

engine = create_engine('sqlite:///crisisresponse.db')
df = pd.read_sql_table('messages', engine)
X = df.loc[:,"message"]
Y = df.iloc[:,4:40]

In [4]:
def tokenize(text):
    """
    Tokenize, lemmatize, lower and remove punctuation of input text.

    Input arguments:
        text: Single string with input text 
              Example: 'For today:= this is, a advanced _ example #- String!'
              
    Output:
        output: List of processed string
                Example: ['today', 'advanced', 'example', 'string']
        
    """
    # set text to lower case and remove punctuation
    text = re.sub("[\W_]", " ", text)
    text= text.lower()

    # tokenize words 
    tokens = word_tokenize(text)
    
    # lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # init and remove stopwords
    stop_words = set(stopwords.words('english'))
    output = [lemmatizer.lemmatize(w) for w in tokens if not w in stop_words]

    return output

In [5]:
df["tok_message"] = df["message"].apply(tokenize)

In [8]:
df_tok = df.explode("tok_message").drop(columns=["message","original","id"])

In [11]:
df_tok.shape

(379133, 38)

In [12]:
df_tok = df_tok[df_tok["related"] == 1]

In [None]:
df_tok.drop(columns=["related"], inplace=True)

In [27]:
df_tok = df_tok.reset_index().drop(columns=["index"]).reset_index()

In [28]:
var_columns = df_tok.columns[2:36]

In [36]:
df_tok_melt = df_tok.melt(id_vars = ['index', 'tok_message'], value_vars=var_columns)

In [39]:
df_tok_melt = df_tok_melt[df_tok_melt["value"] == 1]

In [40]:
df_tok_melt.head()

Unnamed: 0,index,tok_message,variable,value
12,12,un,request,1
13,13,report,request,1
14,14,leogane,request,1
15,15,80,request,1
16,16,90,request,1


In [275]:
word_per_cat.fillna(0,inplace=True)

In [180]:
from matplotlib import cm
from matplotlib import colors
from scipy import interpolate

In [286]:
cat_interpol = interpolate.interp1d([0, 15], [0,1])


In [291]:
g = Network("1000px", "1500px", notebook=True)
g.hrepulsion(central_gravity=6.55, spring_length=620, node_distance=465, damping=1)
#g.add_nodes([1,2,3], value=[10, 100, 400], title=["I am node 1", "node 2 here", "and im node 3"], label=["NODE 1", "NODE 2", "NODE 3"], color=["#00ff1e", "#162347", "#dd4b39"])
cat_dict = {}
node_index = 0
for cat, value in df_tok_melt["variable"].value_counts().sort_values(ascending=False)[:15].iteritems():
    node_index += 1
    cat_dict[cat] = node_index
    g.add_node(cat_dict[cat], value=value*10000, title="Category {}: {}".format(cat,str(value)), label=cat )

    
word_dict = {}
for word, value in df_tok_melt["tok_message"].value_counts().sort_values(ascending=False)[:100].iteritems():
    node_index += 1
    word_dict[word] = node_index
    g.add_node(word_dict[word], value=value*1000, title="Word: {}: {}".format(word,str(value)), color="red",label=word)

word_per_cat = pd.DataFrame(df_tok_melt.groupby("tok_message")["variable"].value_counts()).T
    
edge_dict = {}
for word in word_dict.keys():
    for cat in word_per_cat[word].columns:
        try:
            g.add_edge(cat_dict[cat], word_dict[word], width= int(word_per_cat[word][cat]["variable"]/50),color = colors.to_hex(cm.get_cmap("rainbow")(cat_interpol(cat_dict[cat]))))
        except:
            pass
        #
#g.show_buttons(filter_=['physics'])

g.show("graph_test_disaster_response.html")



In [205]:
cm_col = []
for word in word_dict.keys():
    for cat in word_per_cat[word].columns:
        cm_col.append(colors.to_hex(cm.get_cmap("rainbow")(cat_interpol(cat_dict[cat]))))
        cm_col.append(cat_dict[cat])

In [214]:
cat_interpol(32)

array(247.03125)

In [221]:
cm.get_cmap("rainbow")(np.ceil(cat_interpol(0.4)))

(0.5, 0.0, 1.0, 1.0)

In [206]:
cm_col

['#8000ff',
 1,
 '#ff0000',
 12,
 '#ff0000',
 4,
 '#ff0000',
 2,
 '#ff0000',
 7,
 '#ff0000',
 8,
 '#ff0000',
 5,
 '#ff0000',
 13,
 '#ff0000',
 9,
 '#ff0000',
 3,
 '#ff0000',
 11,
 '#ff0000',
 6,
 '#ff0000',
 18,
 '#ff0000',
 16,
 '#ff0000',
 17,
 '#ff0000',
 14,
 '#ff0000',
 19,
 '#ff0000',
 10,
 '#ff0000',
 15,
 '#ff0000',
 23,
 '#ff0000',
 21,
 '#ff0000',
 25,
 '#ff0000',
 27,
 '#ff0000',
 20,
 '#ff0000',
 29,
 '#ff0000',
 22,
 '#ff0000',
 24,
 '#ff0000',
 26,
 '#ff0000',
 30,
 '#ff0000',
 31,
 '#ff0000',
 28,
 '#ff0000',
 32,
 '#ff0000',
 33,
 '#8000ff',
 1,
 '#ff0000',
 2,
 '#ff0000',
 7,
 '#ff0000',
 4,
 '#ff0000',
 15,
 '#ff0000',
 8,
 '#ff0000',
 3,
 '#ff0000',
 5,
 '#ff0000',
 9,
 '#ff0000',
 6,
 '#ff0000',
 10,
 '#ff0000',
 12,
 '#ff0000',
 14,
 '#ff0000',
 11,
 '#ff0000',
 19,
 '#ff0000',
 13,
 '#ff0000',
 16,
 '#ff0000',
 17,
 '#ff0000',
 18,
 '#ff0000',
 21,
 '#ff0000',
 20,
 '#ff0000',
 24,
 '#ff0000',
 30,
 '#ff0000',
 22,
 '#ff0000',
 25,
 '#ff0000',
 23,
 '#ff0000',
 26

In [None]:
word_dict

In [79]:
df_tok_melt["tok_message"].value_counts()[:300]

water       12777
people      12189
food        11437
help         9008
need         8706
            ...  
homeless      738
others        734
ministry      731
clothing      730
wfp           730
Name: tok_message, Length: 300, dtype: int64

In [170]:
word_per_cat[list(word_dict.keys())[20]]["aid_related"]

variable    509
Name: aid_related, dtype: int64

In [None]:
it_items[:100]