In [1]:
import sys
import pandas as pd
from sqlalchemy import create_engine


In [9]:
def load_data(messages_filepath, categories_filepath):
    """Read the messages and categories CSV files and merge them.

    Input:
        messages_filepath: File path of messages data (CSV)
        categories_filepath: File path of categories data (CSV)
    Output:
        df: Merged dataset from messages and categories. The merge uses the
            pandas defaults, i.e. an inner join on the columns the two
            files have in common.
    """
    # Read both files in the same order as the arguments: messages, then categories.
    source_paths = (messages_filepath, categories_filepath)
    messages_frame, categories_frame = [pd.read_csv(path) for path in source_paths]

    # Default merge: no explicit key, pandas joins on the shared column names.
    return messages_frame.merge(categories_frame)



In [10]:
# Load and merge the two raw CSV files that sit next to the notebook.
df = load_data(
    messages_filepath='messages.csv',
    categories_filepath='categories.csv',
)


In [12]:
def clean_data(df):
    '''
    Expand the semicolon-delimited ``categories`` column into one numeric
    column per category, drop the columns that are no longer needed and
    remove duplicate rows.

    Input:
        df: Merged dataset from messages and categories. Must contain a
            ``categories`` column whose values look like
            ``"related-1;request-0;..."``. The frame passed in is left
            unmodified.
    Output:
        df: Cleaned dataset. The ``categories`` column (and ``original``,
            when present) is removed; one column per category is appended,
            named after the text before the trailing ``-<digit>`` and
            holding that trailing digit as a number. Fully duplicated rows
            are dropped.
    Raises:
        KeyError: if ``df`` has no ``categories`` column.
    '''
    # Split every row at once: one column per "name-value" token. The result
    # keeps df's index, so it lines up with df when concatenated below.
    categories = df["categories"].str.split(";", expand=True)

    if not categories.empty:
        # Use the first row *by position* to derive the column names, so a
        # frame whose index does not contain the label 0 still works.
        # Dropping the last two characters removes the "-<digit>" suffix.
        category_names = categories.iloc[0].str[:-2].tolist()

        # The value of each category is the last character of its token.
        # NOTE: values are converted as-is and are not clamped to 0/1.
        categories = categories.apply(
            lambda tokens: pd.to_numeric(tokens.str[-1])
        )
        categories.columns = category_names

    # Replace the raw string column with the expanded numeric columns.
    cleaned = pd.concat([df.drop(columns=["categories"]), categories], axis=1)

    # 'original' is not kept in the cleaned output; tolerate inputs that do
    # not carry it.
    cleaned = cleaned.drop(columns=["original"], errors="ignore")

    return cleaned.drop_duplicates()
    
    
   

In [38]:
def save_data(df, database_filename, table_name='df'):
    '''
    Save df into a table of a SQLite database.

    Input:
        df: cleaned dataset
        database_filename: database name, e.g. DisasterMessages.db
        table_name: name of the table to write (defaults to 'df')
    Output:
        A SQLite database file containing the table. An existing table of
        the same name is replaced, so the function can be re-run safely.
    '''
    engine = create_engine('sqlite:///' + str(database_filename))

    # Without if_exists='replace' pandas raises ValueError when the table is
    # already there, which breaks every run after the first one.
    df.to_sql(table_name, engine, index=False, if_exists='replace')


In [39]:
# save_data expects the cleaned dataset, so run the merged frame through
# clean_data first instead of persisting the raw merge result.
save_data(clean_data(df), 'DisasterResponse.db')

In [40]:
def main():
    """Run the load -> clean -> save pipeline from command-line arguments.

    Expects exactly three arguments after the program name: the messages
    CSV path, the categories CSV path and the database path. With any other
    argument count a usage message is printed and nothing else happens.
    """
    cli_args = sys.argv[1:]

    # Guard clause: wrong number of arguments -> show usage and stop.
    if len(cli_args) != 3:
        usage = (
            'Please provide the filepaths of the messages and categories '
            'datasets as the first and second argument respectively, as '
            'well as the filepath of the database to save the cleaned data '
            'to as the third argument. \n\nExample: python process_data.py '
            'disaster_messages.csv disaster_categories.csv '
            'DisasterResponse.db'
        )
        print(usage)
        return

    messages_filepath, categories_filepath, database_filepath = cli_args

    loading_banner = 'Loading data...\n    MESSAGES: {}\n    CATEGORIES: {}'
    print(loading_banner.format(messages_filepath, categories_filepath))
    merged = load_data(messages_filepath, categories_filepath)

    print('Cleaning data...')
    cleaned = clean_data(merged)

    saving_banner = 'Saving data...\n    DATABASE: {}'
    print(saving_banner.format(database_filepath))
    save_data(cleaned, database_filepath)

    print('Cleaned data saved to database!')


# Entry point: call main() only when this module is the one being executed
# (its __name__ is '__main__'), not when it is imported by other code.
if __name__ == '__main__':
    main()

Please provide the filepaths of the messages and categories datasets as the first and second argument respectively, as well as the filepath of the database to save the cleaned data to as the third argument. 

Example: python process_data.py disaster_messages.csv disaster_categories.csv DisasterResponse.db
