In [None]:
import sys
import os
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt
import csv

# Project Overview
## ETL Pipeline: process_data.py
In a Python script, process_data.py, write a data cleaning pipeline that:
- Loads the messages and categories datasets
- Merges the two datasets
- Cleans the data
- Stores it in a SQLite database

## ML Pipeline: train_classifier.py
In a Python script, train_classifier.py, write a machine learning pipeline that:
- Loads data from the SQLite database
- Splits the dataset into training and test sets
- Builds a text processing and machine learning pipeline
- Trains and tunes a model using GridSearchCV
- Outputs results on the test set
- Exports the final model as a pickle file

## Flask Web App
We are providing much of the Flask web app for you, but feel free to add extra features depending on your knowledge of Flask, HTML, CSS, and JavaScript. For this part, you'll need to:
- Modify file paths for database and model as needed
- Add data visualizations using Plotly in the web app. One example is provided for you.

## GitHub and Code Quality
Your project will also be graded based on the following:
- Use of Git and GitHub
- Strong documentation
- Clean and modular code
- Follow the [RUBRIC](https://learn.udacity.com/nanodegrees/nd025/parts/cd0018/lessons/e692c8ed-b713-464b-95ac-72d93a35b4fc/concepts/e692c8ed-b713-464b-95ac-72d93a35b4fc-project-rubric) when you work on your project to ensure you meet all of the necessary criteria for developing the pipelines and web app.

In [None]:
# Directory (relative path) that holds the raw input CSV files.
DATA_DIR = './data'

# Full paths to the two input files, both located directly under DATA_DIR.
categories_filepath, messages_filepath = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('disaster_categories.csv', 'disaster_messages.csv')
)

In [None]:
def load_categories_data(categories_filepath, skip_rows=1):
    """Parse the CSV file containing category information.

    Every data line is expected to have the form
    ``<identifier>,<name>-<value>;<name>-<value>;...``. Blank lines and empty
    items (e.g. caused by a trailing ``;``) are ignored.

    Args:
        categories_filepath (str): Path to CSV file containing category info.
        skip_rows (int, optional): Number of leading rows to skip when parsing
            (e.g. for the header). Defaults to 1.

    Returns:
        pandas.DataFrame: One row per identifier (the index, kept as the raw
        string before the first comma) and one column per category name. Cell
        values are the raw strings found after the last ``-`` of each item.
        If an identifier occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank data line contains no comma, or a non-empty
            item contains no ``-``.
    """
    data = {}
    print(f'Parsing {categories_filepath}')
    # Explicit encoding so parsing does not depend on the platform's locale.
    with open(categories_filepath, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for i, line in enumerate(f):
            if i < skip_rows:  # i is 0-based: skips the first `skip_rows` lines
                continue
            if not line.strip():  # tolerate blank / trailing empty lines
                continue

            # Split only on the first comma: everything after it is the
            # ';'-separated list of category items.
            identifier, keyvalue = line.split(',', 1)

            this_line_info = {}  # holder for just this line's info
            for item in keyvalue.split(';'):
                item = item.strip()
                if not item:
                    continue
                # The value is whatever follows the last '-', so a category
                # name containing a hyphen still parses.
                k, v = item.rsplit('-', 1)
                this_line_info[k] = v
            data[identifier] = this_line_info

    # The dict is keyed by identifier, so transpose to get one row per id.
    data = pd.DataFrame(data).T
    print('... finished!')
    return data

In [None]:
# Parse the categories CSV into a DataFrame using load_categories_data.
categories_data = load_categories_data(categories_filepath)

In [None]:
# Bare expression as the last statement of a notebook cell: renders
# categories_data as the cell output for visual inspection.
categories_data

In [None]:
skip_rows = 1

# Compiled once, outside the loop, since they are applied to every line.
id_pattern = re.compile(r'^[0-9]+,')
genre_pattern = re.compile(r",[a-zA-Z]+$")

data = {}
print(f'Parsing {messages_filepath}')
# Explicit encoding so parsing does not depend on the platform's locale.
with open(messages_filepath, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i < skip_rows:  # i is 0-based: skips the first `skip_rows` lines
            continue

        this_line_info = {}  # holder for just this line's info (not filled in yet)

        # Each line has a vaguely consistent format, something like
        #   id, message (in English), original (message in original language), genre
        # - id is always at the beginning, always a number, and always followed by a comma.
        # - genre is always at the end, and always preceded by a comma.
        # - The rest is harder, because the original is sometimes blank (between
        #   two commas, followed by the genre).
        # - Sometimes the message is surrounded by quotes, sometimes not. It LOOKS
        #   like it gets quoted if it contains a comma.
        # So we can try a regex looking for pairs of quotes with a comma in the
        # middle, and we can also look for two consecutive commas followed by the
        # genre to isolate instances where the original is blank.

        # The identifier is always a number at the beginning followed by a comma;
        # the genre (last segment) is a single word at the end preceded by a comma.
        id_match = id_pattern.search(line)
        genre_match = genre_pattern.search(line)
        if id_match is None or genre_match is None:
            # Not a complete "id,...,genre" line (presumably a fragment of a
            # quoted message that spans several physical lines -- TODO confirm).
            # Skip it rather than crash on `None.group()`.
            continue

        identifier = id_match.group()  # .replace(',', '')
        genre = genre_match.group()

        # Take what lies between the two matches by position. Splitting on the
        # matched *text* would cut at the wrong place whenever the same text
        # (e.g. "2," or ",direct") also occurs inside the message itself.
        remaining = line[id_match.end():genre_match.start()]
            
            


In [None]:
# Bare expression as the last statement of a notebook cell: shows the
# `remaining` text (id and genre stripped) left by the most recent iteration
# of the parsing loop.
remaining

In [None]:
# str.split() treats its argument as a literal separator, never as a regex, so
# the pattern has to go through re.split() to actually cut off the leading
# "<digits>," identifier. maxsplit=1 keeps the rest of the line in one piece.
re.split(r"^[0-9]+,", line, maxsplit=1)

# CRISP-DM Flow

## Business Understanding

## Data Understanding

## Data Preparation

## Data Modeling

## Result Evaluation

## Deployment