In [None]:
import sys
import os
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt
import csv
import sqlite3
import nltk
from nltk.stem import WordNetLemmatizer as wnl

# Put the ./data directory on the import path so the `process_data` module
# imported below can be resolved (path is relative to the kernel's working dir).
sys.path.append('./data')

# Project ETL helpers. DB_TABLE_NAME is the table the sanity-check cell at the
# end of this notebook reads back from the SQLite file.
from process_data import load_data, load_categories_data, save_data, DB_TABLE_NAME

nltk.download('words'); # English word corpus for filtering data
# WordNet data; the only WordNet consumer imported in this notebook is
# WordNetLemmatizer (aliased as `wnl`).
nltk.download('wordnet')

# Project Overview
## ETL Pipeline: process_data.py
In a Python script, process_data.py, write a data cleaning pipeline that:
- Loads the messages and categories datasets
- Merges the two datasets
- Cleans the data
- Stores it in a SQLite database

## ML pipeline: train_classifier.py
In a Python script, train_classifier.py, write a machine learning pipeline that:
- Loads data from the SQLite database
- Splits the dataset into training and test sets
- Builds a text processing and machine learning pipeline
- Trains and tunes a model using GridSearchCV
- Outputs results on the test set
- Exports the final model as a pickle file

## Flask Web App
We are providing much of the Flask web app for you, but feel free to add extra features depending on your knowledge of Flask, HTML, CSS and JavaScript. For this part, you'll need to:
- Modify file paths for database and model as needed
- Add data visualizations using Plotly in the web app. One example is provided for you.

## GitHub and Code Quality
Your project will also be graded based on the following:
- Use of Git and GitHub
- Strong documentation
- Clean and modular code
- Follow the [RUBRIC](https://learn.udacity.com/nanodegrees/nd025/parts/cd0018/lessons/e692c8ed-b713-464b-95ac-72d93a35b4fc/concepts/e692c8ed-b713-464b-95ac-72d93a35b4fc-project-rubric) when you work on your project to ensure you meet all of the necessary criteria for developing the pipelines and web app.

In [None]:
# Locations of the two raw CSV inputs and of the SQLite output file,
# all resolved relative to DATA_DIR.
DATA_DIR = './data'
categories_filepath, messages_filepath, database_filename = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        'disaster_categories.csv',
        'disaster_messages.csv',
        'project_data.sqlite3',
    )
)

In [None]:
# Load the messages and categories CSVs through the project's ETL helper.
# NOTE(review): later cells read `df.message_raw`, so load_data is expected to
# return a DataFrame exposing that column — confirm in process_data.py.
df = load_data(messages_filepath=messages_filepath, categories_filepath=categories_filepath)

In [None]:
# def clean_data(df):
#     pass

In [None]:
# Vocabulary used to decide whether a token is English.
# The filtering loop lower-cases every token before looking it up, so the
# vocabulary is lower-cased as well; otherwise corpus entries that are stored
# only in capitalised form (e.g. proper nouns) could never match.
english_words = {word.lower() for word in nltk.corpus.words.words()}

In [None]:
# Minimum share of a segment's distinct alphabetic tokens that must be found in
# `english_words` for the segment to be treated as English.
tolerance = 0.50

# message index -> segments that failed the English test (kept for manual review)
inspect = {}

# message index -> sorted distinct English tokens found in the message
cleaned_english_tokens = {}
# message index -> raw text segments judged to be English
likely_english_messages = {}


def _tokenize_segment(text):
    """Return ``(alphabetic_tokens, english_tokens)`` for ``text``.

    ``text`` is lower-cased and split with ``nltk.wordpunct_tokenize``; tokens
    that are not purely alphabetic are dropped. ``english_tokens`` is the
    subset of those tokens present in the module-level ``english_words`` set.
    """
    alpha_tokens = [tok for tok in nltk.wordpunct_tokenize(text.lower()) if tok.isalpha()]
    return alpha_tokens, [tok for tok in alpha_tokens if tok in english_words]


# Walk every raw message and work out which part(s) of it are likely English.
for k, v in df.message_raw.items():
    english_tokens_in_this_message = []
    english_part_of_this_message = []

    if '"' in v:
        # A quotation mark means the string holds both a 'message' and an
        # 'original'; split on the quotes and classify each piece separately.
        segments = v.split('"')
        for segment_raw in segments:
            if not segment_raw:
                continue

            all_words_in_segment, english_words_in_segment = _tokenize_segment(segment_raw)
            if not all_words_in_segment:
                # Nothing but punctuation/digits: cannot be classified.
                continue

            # Compare distinct tokens so repeated words do not skew the ratio.
            ratio_of_english_words_in_segment = (
                len(set(english_words_in_segment)) / len(set(all_words_in_segment))
            )
            if ratio_of_english_words_in_segment > tolerance:
                english_part_of_this_message.append(segment_raw)
                english_tokens_in_this_message += english_words_in_segment
            else:
                inspect.setdefault(k, []).append(segment_raw)
    elif ',' in v:
        # No quotation mark but there IS a comma: the first part of the CSV
        # string should be the English message. `partition` splits on the first
        # comma only, so messages with several commas no longer raise ValueError
        # on unpacking.
        first_part, _, other_part_of_this_message = v.partition(',')
        all_words_in_segment, english_words_in_segment = _tokenize_segment(first_part)
        english_part_of_this_message = [first_part]
        english_tokens_in_this_message = english_words_in_segment
        # NOTE(review): the first part is trusted without the `tolerance` check,
        # following the assumption stated above — confirm this is intended.
    else:
        print('This should not happen')
        continue

    # Record the result once per message, and only when English tokens were found.
    # Sorting makes the stored token list reproducible between runs.
    if english_tokens_in_this_message:
        cleaned_english_tokens[k] = sorted(set(english_tokens_in_this_message))
        likely_english_messages[k] = english_part_of_this_message
        

In [None]:
# Scratch cell: IPython help lookup for the lemmatize method. `wnl` is the
# WordNetLemmatizer class itself (import alias), not an instance.
wnl.lemmatize?

In [None]:
# Share of alphabetic tokens that are English in the last segment the filtering
# loop tokenized. The token list is named `all_words_in_segment`; the previous
# reference to `segment` was undefined. Guarded so a token-less segment shows 0.0
# instead of raising ZeroDivisionError.
len(english_words_in_segment) / len(all_words_in_segment) if all_words_in_segment else 0.0

In [None]:
# Distinct tokens of the last tokenized segment that were NOT recognised as
# English (`all_words_in_segment` is the token list left behind by the loop;
# `segment` was never defined).
set(all_words_in_segment) - set(english_words_in_segment)

In [None]:
# Distinct alphabetic tokens of the last segment tokenized by the filtering loop
# (`segment` was never defined; the loop's token list is `all_words_in_segment`).
set(all_words_in_segment)

In [None]:
# Persist the frame through the project's helper.
# NOTE(review): presumably writes `df` to the DB_TABLE_NAME table of the SQLite
# file, since the next cell reads that table back — confirm in process_data.py.
save_data(df, database_filename)

In [None]:
# Sanity check: read the saved table back from the SQLite file and preview it.
conn = sqlite3.connect(database_filename)
try:
    # The table name comes from the project constant DB_TABLE_NAME; SQL
    # identifiers cannot be bound as query parameters, hence the f-string.
    # pandas runs the query on the connection directly, so no cursor is needed.
    test = pd.read_sql(f'SELECT * from {DB_TABLE_NAME}', con=conn)
finally:
    # Always release the database file, even if the query fails.
    conn.close()

test.head()

# CRISP-DM Flow

## Business Understanding

## Data Understanding

## Data Preparation

## Data Modeling

## Result Evaluation

## Deployment