# ETL Pipeline Preparation
Follow the instructions below to help you create your ETL pipeline.
### 1. Import libraries and load datasets.
- Import Python libraries
- Load `messages.csv` into a dataframe and inspect the first few lines.
- Load `categories.csv` into a dataframe and inspect the first few lines.

In [22]:
# import libraries
import pandas as pd

In [23]:
# load messages dataset
messages = pd.read_csv('messages.csv')
messages.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [3]:
messages.shape
messages['id'].nunique()

26180

In [96]:
messages['id'].value_counts().sort_values(ascending = False).head(5)

24779    3
23229    2
17569    2
15169    2
862      2
Name: id, dtype: int64

In [4]:
messages.loc[messages.duplicated()]

Unnamed: 0,id,message,original,genre
163,202,?? port au prince ?? and food. they need gover...,p bay pap la syen ak manje. Yo bezwen ed gouve...,direct
656,804,elle est vraiment malade et a besoin d'aide. u...,she is really sick she need your help. please ...,direct
710,862,What is the address of the radio station? I as...,Ki adres radyo a? Paske m bezwen al depoze dos...,direct
1408,1652,"please we need water, food and tents, we have ...","p jwen dlo, manje, tant pou nou demi nou gen 1...",direct
2131,2446,How much money did TV Latino American collect ...,Konbyen kob tv latino america ranmase miami,direct
2793,3217,"Sir or madam, I'm hungry, I cannot stand becau...",Mesye ou madam mwen grangou mwen pa menm kakan...,direct
2825,3250,I thought you gave my number and this was seri...,Mwen tepanse le nou tebay nimewo a se yon afse...,direct
3374,3882,"I am in Carefour Feuilles, Dkayet neighborhood...",Mwen nan kafou fy zon dekayt nou beswen dlo po...,direct
3852,4412,"Good evening staff of responsibles, please giv...",Bonswa staf responsab svp banm yon ti d paske ...,direct
4341,4956,"What we saw yesterday is a sign, you need to k...",a nou w la yo se sign yoy ou bezwen kon sa Jez...,direct


In [5]:
messages.drop_duplicates(inplace = True)

In [6]:
messages.duplicated().sum()

0

In [7]:
messages.shape

(26180, 4)

In [24]:
# load categories dataset
categories = pd.read_csv('categories.csv')
categories.head()

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medi...
1,7,related-1;request-0;offer-0;aid_related-1;medi...
2,8,related-1;request-0;offer-0;aid_related-0;medi...
3,9,related-1;request-1;offer-0;aid_related-1;medi...
4,12,related-1;request-0;offer-0;aid_related-0;medi...


In [9]:
categories.shape

(26248, 2)

In [10]:
categories[categories.duplicated()]

Unnamed: 0,id,categories
656,804,related-2;request-0;offer-0;aid_related-0;medi...
2131,2446,related-1;request-0;offer-0;aid_related-0;medi...
2793,3217,related-1;request-1;offer-0;aid_related-1;medi...
3852,4412,related-1;request-1;offer-0;aid_related-1;medi...
4630,5265,related-2;request-0;offer-0;aid_related-0;medi...
4893,5573,related-0;request-0;offer-0;aid_related-0;medi...
4951,5643,related-1;request-1;offer-0;aid_related-1;medi...
5558,6327,related-2;request-0;offer-0;aid_related-0;medi...
5772,6573,related-1;request-1;offer-0;aid_related-1;medi...
6983,7892,related-1;request-0;offer-0;aid_related-0;medi...


In [11]:
categories.drop_duplicates(inplace = True)

In [12]:
categories.duplicated().sum()

0

In [13]:
messages[messages.id == 17553]

Unnamed: 0,id,message,original,genre
14836,17553,The last locust infestation in the Central Asi...,,news


In [107]:
categories[categories.id == 17553]['categories']

14836    related-1;request-1;offer-0;aid_related-1;medi...
14837    related-1;request-0;offer-0;aid_related-0;medi...
Name: categories, dtype: object

In [14]:
categories.id.value_counts().sort_values(ascending = False).head()

17553    2
25512    2
862      2
24347    2
6492     2
Name: id, dtype: int64

In [15]:
for i in range(5):

    strn = categories.iloc[i, 1]
    print(strn, '\n')

related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0 

related-1;request-0;offer-0;aid_related-1;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-1;floods-0;storm-1;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0 

related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security

In [110]:
categories.loc[categories['id'].duplicated(), :]
categories[categories['id'] == 202].categories

162    related-1;request-1;offer-0;aid_related-1;medi...
163    related-1;request-1;offer-0;aid_related-1;medi...
Name: categories, dtype: object

In [111]:
categories.iloc[162, 1] == categories.iloc[163, 1]

False

In [112]:
print(categories.iloc[162, 1], '\n')
print(categories.iloc[162, 1])

related-1;request-1;offer-0;aid_related-1;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-1;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0 

related-1;request-1;offer-0;aid_related-1;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-1;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0


In [113]:
print(categories.iloc[162, 0]) == print(categories.iloc[163, 0])

202
202


True

In [114]:
categories.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26216 entries, 0 to 26247
Data columns (total 2 columns):
id            26216 non-null int64
categories    26216 non-null object
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


In [115]:
categories.shape

(26216, 2)

In [16]:
id_cat = categories['id'].unique().tolist()
id_mes = messages['id'].unique().tolist()

In [17]:
in_cat = [i for i in id_cat if i not in id_mes]

In [18]:
in_cat

[]

In [19]:
categories[categories.id.isin(in_cat)]

Unnamed: 0,id,categories


### 2. Merge datasets.
- Merge the messages and categories datasets using the common id
- Assign this combined dataset to `df`, which will be cleaned in the following steps

In [25]:
# merge datasets
df = pd.merge(left = messages, right = categories, how = 'left', on = 'id')
df.head()

Unnamed: 0,id,message,original,genre,categories
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,related-1;request-0;offer-0;aid_related-0;medi...
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,related-1;request-0;offer-0;aid_related-1;medi...
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,related-1;request-0;offer-0;aid_related-0;medi...
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,related-1;request-0;offer-0;aid_related-0;medi...


In [26]:
df.shape

(26386, 5)

### 3. Split `categories` into separate category columns.
- Split the values in the `categories` column on the `;` character so that each value becomes a separate column. You'll find [this method](https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.Series.str.split.html) very helpful! Make sure to set `expand=True`.
- Use the first row of categories dataframe to create column names for the categories data.
- Rename columns of `categories` with new column names.

In [27]:
# create a dataframe of the 36 individual category columns
categories = categories['categories'].str.split(';', expand = True)
categories.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
1,related-1,request-0,offer-0,aid_related-1,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-1,floods-0,storm-1,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
2,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
3,related-1,request-1,offer-0,aid_related-1,medical_help-0,medical_products-1,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
4,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0


In [28]:
# select the first row of the categories dataframe
row = categories.iloc[0, :]

# use this row to extract a list of new column names for categories.
# one way is to apply a lambda function that takes everything 
# up to the second to last character of each string with slicing
category_colnames = row.apply(lambda x: x[:-2])
print(category_colnames)

0                    related
1                    request
2                      offer
3                aid_related
4               medical_help
5           medical_products
6          search_and_rescue
7                   security
8                   military
9                child_alone
10                     water
11                      food
12                   shelter
13                  clothing
14                     money
15            missing_people
16                  refugees
17                     death
18                 other_aid
19    infrastructure_related
20                 transport
21                 buildings
22               electricity
23                     tools
24                 hospitals
25                     shops
26               aid_centers
27      other_infrastructure
28           weather_related
29                    floods
30                     storm
31                      fire
32                earthquake
33                      cold
34            

In [29]:
# rename the columns of `categories`
categories.columns = category_colnames
categories.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
1,related-1,request-0,offer-0,aid_related-1,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-1,floods-0,storm-1,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
2,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
3,related-1,request-1,offer-0,aid_related-1,medical_help-0,medical_products-1,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
4,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0


In [33]:
func = lambda x: x[-1:]
for col in categories.columns:
    categories[col] = categories[col].apply(func)
    categories[col] = categories[col].astype('int')

In [34]:
categories.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26248 entries, 0 to 26247
Data columns (total 36 columns):
related                   26248 non-null int64
request                   26248 non-null int64
offer                     26248 non-null int64
aid_related               26248 non-null int64
medical_help              26248 non-null int64
medical_products          26248 non-null int64
search_and_rescue         26248 non-null int64
security                  26248 non-null int64
military                  26248 non-null int64
child_alone               26248 non-null int64
water                     26248 non-null int64
food                      26248 non-null int64
shelter                   26248 non-null int64
clothing                  26248 non-null int64
money                     26248 non-null int64
missing_people            26248 non-null int64
refugees                  26248 non-null int64
death                     26248 non-null int64
other_aid                 26248 non-null int6

In [124]:
import numpy as np
arr = pd.DataFrame(0, index = range(len(categories)), columns = category_colnames)

for col in categories.columns.tolist():
    new = categories[col].apply(lambda row: row[-1:])
    arr[col] = new
    
arr.head()     # bunu gec

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
categories.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 4. Convert category values to just numbers 0 or 1.
- Iterate through the category columns in df to keep only the last character of each string (the 1 or 0). For example, `related-0` becomes `0`, `related-1` becomes `1`. Convert the string to a numeric value.
- You can perform [normal string actions on Pandas Series](https://pandas.pydata.org/pandas-docs/stable/text.html#indexing-with-str), like indexing, by including `.str` after the Series. You may need to first convert the Series to be of type string, which you can do with `astype(str)`.

In [146]:
for column in categories:
    # set each value to be the last character of the string
    categories[column] = categories[column].apply(lambda row: row[-1:])
    
    # convert column from string to numeric
    categories[column] = categories[column].astype('int')
categories.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 5. Replace `categories` column in `df` with new category columns.
- Drop the categories column from the df dataframe since it is no longer needed.
- Concatenate df and categories data frames.

In [36]:
# drop the original categories column from `df`
df.drop('categories', axis = 1, inplace = True)

df.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [37]:
# concatenate the original dataframe with the new `categories` dataframe
df = pd.concat([df, categories], axis = 1)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 6. Remove duplicates.
- Check how many duplicates are in this dataset.
- Drop the duplicates.
- Confirm duplicates were removed.

In [38]:
# check number of duplicates
df.duplicated().sum()

41

In [39]:
# drop duplicates
df.drop_duplicates(inplace = True)

In [40]:
# check number of duplicates
df.duplicated().sum()

0

In [41]:
df.shape

(26345, 40)

### 7. Save the clean dataset into an sqlite database.
You can do this with pandas [`to_sql` method](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html) combined with the SQLAlchemy library. Remember to import SQLAlchemy's `create_engine` in the first cell of this notebook to use it below.

In [153]:
from sqlalchemy import create_engine

In [154]:
engine = create_engine('sqlite:///InsertDatabaseName.db')
df.to_sql('clean', engine, index=False)

### 8. Use this notebook to complete `etl_pipeline.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database based on new datasets specified by the user. Alternatively, you can complete `etl_pipeline.py` in the classroom on the `Project Workspace IDE` coming later.

In [45]:
#final code

def load_clean_data(message_filepath, categories_filepath):
    '''Load and clean the data
    
    Args:
        message_filepath (str) : path to messages file
        categories_filepath (str): path to categories file
        
    Returns:
        clean_df : dataframe with columns id, message, original, genre
        and categories
    '''
    
    # load data from filepath
    messages = pd.read_csv(message_filepath)
    categories = pd.read_csv(categories_filepath)
    
    # merge messages and categories dataframes
    df = pd.merge(left = messages, right = categories, how = 'left', on = 'id')
    
    #split categories dataframe into different category columns
    categories = categories['categories'].str.split(';', expand = True)
    
    #extracting category column names
    row = categories.iloc[0, :]
    func = lambda x: x[:-2]
    category_column_names = row.apply(func)
    
    #getting the last number from values and converting it to type 'int'
    get_last_char = lambda x: x[-1:]
    for col in categories.columns:
        categories[col] = categories[col].apply(get_last_char)
        categories[col] = categories[col].astype('int')
      
    #assign category_column_names as the column names of 'categories'
    categories.columns = category_column_names
    
    #dropping 'categories' column from df
    df.drop('categories', axis = 1, inplace = True)
    
    #concatenating df and categories dataframe
    clean_df = pd.concat([df, categories], axis = 1)
    
    #drop duplicates
    clean_df.drop_duplicates(inplace = True)
    
    return clean_df

In [47]:
df = load_clean_data('messages.csv', 'categories.csv')

In [48]:
df.shape

(26345, 40)

In [52]:
def save_data(df, database_name):
    '''Save the dataframe df into sqlite database 
    The data is saved with a file name 'clean_data' in the
    database    
    
    Args:
        df : clean data to be saved
        database_name (str): name of the database
        
    Returns:
        None
    '''
    
    from sqlalchemy import create_engine
    
    dbase_path = 'sqlite:///' + database_name
    
    engine = create_engine(dbase_path)
    df.to_sql('clean', engine, index=False)

In [53]:
save_data(df, 'InsertDatabaseName.db')

In [None]:
import argparse
parser = argparse.ArgumentParser(description = 'Getting filepaths and database  name')
parser.add_argument('fpath_messages', help = 'Filepath to disaster messages file')
parser.add_argument('fpath_categories', help = 'Filepath to categories file')
parser.add_argument('database_name', help = 'Database name')

args = parser.parse_args()

message_filepath = args.fpath_messages
categories_filepath = args.fpath_categories
dbase_name = args.database_name

if __name__ == '__main__':
    
    clean_df = load_clean_data(message_filepath, categories_filepath)
    save_data(clean_df, dbase_name)

In [None]:
import sys


def load_data(database_filepath):
    pass


def tokenize(text):
    pass


def build_model():
    pass


def evaluate_model(model, X_test, Y_test, category_names):
    pass


def save_model(model, model_filepath):
    pass


def main():
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
        
        print('Building model...')
        model = build_model()
        
        print('Training model...')
        model.fit(X_train, Y_train)
        
        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


if __name__ == '__main__':
    main()

In [1]:
import pickle
def save_model(model, model_filepath):
    
    with open(model_filepath, 'wb') as file:
        pickle.dump(model, file, protocol=pickle.HIGHEST_PROTOCOL)
        
        

In [2]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

save_model(model, 'model.pkl')