In [2]:
import pandas as pd 
import numpy as np 
import matplotlib as plt 
import sqlite3

# Pipeline

Data pipeline is a generic term for transfering data from one or more sources to a destination.

## ETL

A ETL pipeline is a procedure that `extract` data from a source or multiple sources, `transform` the data according with the project specifications and `load` the data to its destination. 

> Extract data -> Tansform data -> Load data into database -> Create an ETL pipeline

## ELT 

A ELT pipeline differs from ETL in order of processes. In ELT the load are done previously than transformation.




### Extracting data

In [62]:
# Extracting corporate messaging data as dataframe
df = pd.read_csv(r'data\corporate_messaging.csv')

### Wrangling

In [75]:
# Checking df
df.head(2)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,category,category:confidence,category_gold,id,screenname,text
0,662822308,False,finalized,3,2/18/15 4:31,Information,1.0,,4.36528e+17,Barclays,Barclays CEO stresses the importance of regula...
1,662822309,False,finalized,3,2/18/15 13:55,Information,1.0,,3.86013e+17,Barclays,Barclays announces result of Rights Issue http...


In [74]:
# Checling initial shape
rows, columns = df.shape
print('df has {} rows and {} columns'.format(rows, columns))

df has 3118 rows and 11 columns


In [70]:
# Checking nulls
df.isnull().sum()

_unit_id                  0
_golden                   0
_unit_state               0
_trusted_judgments        0
_last_judgment_at       307
category                  0
category:confidence       0
category_gold          2811
id                        0
screenname                0
text                      0
dtype: int64

In [56]:
# Checking columns
df.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'category', 'category:confidence', 'category_gold',
       'id', 'screenname', 'text'],
      dtype='object')

In [57]:
# Checking text columns values
df.loc[:,'text']

0       Barclays CEO stresses the importance of regula...
1       Barclays announces result of Rights Issue http...
2       Barclays publishes its prospectus for its å£5....
3       Barclays Group Finance Director Chris Lucas is...
4       Barclays announces that Irene McDermott Brown ...
                              ...                        
3113    Wishing you a happy and healthy Thanksgiving. ...
3114    WouldnÌ¢‰âÂ‰ã¢t it be great if your oven recog...
3115    Yesterday, these #HealthyKids lit up Broadway ...
3116    Yo-Jelly, Danone new brand in South Africa : t...
3117    Z Bhutta: Problems with food&amp;land systems ...
Name: text, Length: 3118, dtype: object

In [58]:
# Checking category confidence values
df.loc[:,'category:confidence'].value_counts()

1.0000    2430
0.6614      35
0.6643      33
0.6747      32
0.6775      29
          ... 
0.8547       1
0.6641       1
0.8578       1
0.9089       1
0.8245       1
Name: category:confidence, Length: 194, dtype: int64

In [61]:
# Checking categories
df.loc[:,'category'].value_counts()

Information    2129
Action          724
Dialogue        226
Exclude          39
Name: category, dtype: int64

In [79]:
# Selecting only with messages that have confidence equal to 1 and the category is different from 'Exclude'
df = df[(df.loc[:,'category:confidence']==1) & (df.loc[:,'category'] != 'Exclude')]

In [80]:
# Checking new shape
rows, columns = df.shape
print('df now has {} rows and {} columns'.format(rows, columns))

df now has 2403 rows and 11 columns


In [81]:
# Spliting data for test
X = df.text.values
y = df.category.values

In [86]:
# Dectect url using regex
url = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Tokenization
def tokenize(text):
    """ 
    DESCRIPTION
    1. Dectect all urls in each message, replace all of them for a string.
    2. Tokenize: delimit all words in each message.capitalize.
    3. Lemmatize: group together the inflected forms of a word, so they can be analysed as a            single item, identified by the word's lemma.

    INPUT
    text (string): a list of disaster messages

    OUTPUT
    clean_tokens (string): a tokenized list from input 
    """




SyntaxError: unexpected EOF while parsing (<ipython-input-86-68975d878ba8>, line 3)