In [141]:
import nltk
nltk.download(['punkt', 'wordnet'])

import re
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laura\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laura\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


# Pipeline

A data pipeline is a generic term for transferring data from one or more sources to a destination.

## ETL

An ETL pipeline is a procedure that `extracts` data from one or more sources, `transforms` the data according to the project specifications, and `loads` the data into its destination.

> Extract data -> Transform data -> Load data into database -> Create an ETL pipeline

## ELT 

An ELT pipeline differs from ETL in the order of its steps: in ELT, the data is loaded before it is transformed.




### Extracting data

In [62]:
# Extract the corporate messaging data into a DataFrame.
# A forward-slash path resolves on Windows, Linux and macOS, whereas a
# backslash separator is only understood on Windows.
df = pd.read_csv('data/corporate_messaging.csv')

### Wrangling

In [75]:
# Preview the first two records of df
df.iloc[:2]

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,category,category:confidence,category_gold,id,screenname,text
0,662822308,False,finalized,3,2/18/15 4:31,Information,1.0,,4.36528e+17,Barclays,Barclays CEO stresses the importance of regula...
1,662822309,False,finalized,3,2/18/15 13:55,Information,1.0,,3.86013e+17,Barclays,Barclays announces result of Rights Issue http...


In [74]:
# Record and report the dimensions of the dataframe as loaded
rows, columns = len(df.index), len(df.columns)
print(f'df has {rows} rows and {columns} columns')

df has 3118 rows and 11 columns


In [91]:
# Fraction of missing values per column.
# Taking the mean of the boolean mask divides by the frame's current length,
# so the result cannot be skewed by a stale `rows` value set in another cell.
df.isnull().mean()

_unit_id               0.000000
_golden                0.000000
_unit_state            0.000000
_trusted_judgments     0.000000
_last_judgment_at      0.088223
category               0.000000
category:confidence    0.000000
category_gold          0.911777
id                     0.000000
screenname             0.000000
text                   0.000000
dtype: float64

In [56]:
# List the column labels of df
df.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'category', 'category:confidence', 'category_gold',
       'id', 'screenname', 'text'],
      dtype='object')

In [102]:
# Inspect the message bodies stored in the 'text' column
df['text']

0       Barclays CEO stresses the importance of regula...
1       Barclays announces result of Rights Issue http...
2       Barclays publishes its prospectus for its å£5....
3       Barclays Group Finance Director Chris Lucas is...
4       Barclays announces that Irene McDermott Brown ...
                              ...                        
3107    We're grateful for 2x honors @ChamberBCLC Citi...
3108    WeÌ¢‰âÂ‰ã¢re the 1. to sign up to a European i...
3109    WeÌ¢‰âÂ‰ã¢re working hard to do all we can to ...
3115    Yesterday, these #HealthyKids lit up Broadway ...
3117    Z Bhutta: Problems with food&amp;land systems ...
Name: text, Length: 2403, dtype: object

In [58]:
# Frequency of each value in the 'category:confidence' column
df['category:confidence'].value_counts()

1.0000    2430
0.6614      35
0.6643      33
0.6747      32
0.6775      29
          ... 
0.8547       1
0.6641       1
0.8578       1
0.9089       1
0.8245       1
Name: category:confidence, Length: 194, dtype: int64

In [61]:
# Frequency of each label in the 'category' column
df['category'].value_counts()

Information    2129
Action          724
Dialogue        226
Exclude          39
Name: category, dtype: int64

In [79]:
# Keep only the messages whose confidence equals 1 and whose category is not 'Exclude'
df = df.loc[df['category:confidence'].eq(1) & df['category'].ne('Exclude')]

In [80]:
# Record and report the dimensions after filtering
rows, columns = len(df.index), len(df.columns)
print(f'df now has {rows} rows and {columns} columns')

df now has 2403 rows and 11 columns


In [156]:
# Tokenization Function
def tokenize(text):
    """
    DESCRIPTION
    1. Detect every URL in the message and replace it with the token 'urlplaceholder'.
    2. Tokenize: split the message into word tokens with word_tokenize.
    3. Lemmatize: pass each token through WordNetLemmatizer.lemmatize so that
       inflected forms of a word can be analysed as a single item.

    INPUT
    text (string): a single message.

    OUTPUT
    clean_tokens (list of strings): the lemmatized tokens, in their original order.
    """

    # 1.
    # Raw string: '\(' and '\)' are regex escapes, not valid string-literal escapes.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # re.sub rewrites each match exactly once. Replacing the matches one by one
    # with str.replace corrupts a longer URL whenever a shorter URL found earlier
    # is contained in it (e.g. 'http://a.com' followed by 'http://a.com/b').
    text = re.sub(url_pattern, 'urlplaceholder', text)

    # 2.
    tokens = word_tokenize(text)

    # 3.
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return clean_tokens


### Train and Test Split

In [173]:
# Separate the message texts (features) from their category labels (target)
X = df.text.values
y = df.category.values

# Hold out a test set. A fixed random_state makes the split identical on every
# run, so downstream results can be reproduced after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)



In [174]:
def display_results(y_test, y_pred):
    """
    Print the label set, the confusion matrix and the accuracy of predictions.

    INPUT
    y_test (array-like): true labels.
    y_pred (array-like): predicted labels, aligned element-by-element with y_test.

    OUTPUT
    None. The three summaries are written to stdout.
    """
    # Coerce to arrays so the element-wise comparison below also works for lists.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Use every label present in either array. Taking labels from the predictions
    # alone would leave out of the matrix any true class that was never predicted.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [1]:
import nltk
nltk.download(['punkt', 'wordnet'])

import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Raw string so that the regex escapes '\(' and '\)' reach the regex engine
# verbatim instead of being treated as (invalid) string-literal escapes.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


def load_data(filepath='data/corporate_messaging.csv'):
    """
    Load the corporate messaging CSV and return the messages with their labels.

    Only rows whose 'category:confidence' equals 1 and whose 'category' is not
    'Exclude' are kept.

    INPUT
    filepath (string): location of the CSV file. The default uses a forward
        slash, which resolves on Windows as well as on Linux and macOS.

    OUTPUT
    X (numpy array): values of the 'text' column for the kept rows.
    y (numpy array): values of the 'category' column for the kept rows.
    """
    df = pd.read_csv(filepath)
    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    df = df[keep]
    X = df.text.values
    y = df.category.values
    return X, y


def tokenize(text):
    """
    Turn a message into a list of normalized, lemmatized tokens.

    Every URL matched by the module-level `url_regex` is replaced with the token
    'urlplaceholder'. The text is then split with word_tokenize, and each token
    is lemmatized, lower-cased and stripped of surrounding whitespace.

    INPUT
    text (string): a single message.

    OUTPUT
    clean_tokens (list of strings): the normalized tokens, in their original order.
    """
    # re.sub rewrites each match exactly once. Replacing the matches one by one
    # with str.replace corrupts a longer URL whenever a shorter URL found earlier
    # is contained in it (e.g. 'http://a.com' followed by 'http://a.com/b').
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    clean_tokens = [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]

    return clean_tokens


def display_results(y_test, y_pred):
    """
    Print the label set, the confusion matrix and the accuracy of predictions.

    INPUT
    y_test (array-like): true labels.
    y_pred (array-like): predicted labels, aligned element-by-element with y_test.

    OUTPUT
    None. The three summaries are written to stdout.
    """
    # Coerce to arrays so the element-wise comparison below also works for lists.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Use every label present in either array. Taking labels from the predictions
    # alone would leave out of the matrix any true class that was never predicted.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main(random_state=42):
    """
    Train a random-forest classifier on the messaging data and print its results.

    The data returned by load_data is split into train and test sets. The
    training messages are converted to token counts (CountVectorizer using
    `tokenize`) and then to TF-IDF features, a RandomForestClassifier is fitted
    on them, and the predictions on the test set are passed to display_results.

    INPUT
    random_state (int or None): seed forwarded to train_test_split and
        RandomForestClassifier so that the printed results can be reproduced.
        Pass None for an unseeded run.

    OUTPUT
    None. Results are printed by display_results.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier(random_state=random_state)

    # train classifier: the text transformers are fitted on the training split only
    X_train_counts = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)
    clf.fit(X_train_tfidf, y_train)

    # predict on test data: reuse the fitted transformers (transform, not fit_transform)
    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)
    y_pred = clf.predict(X_test_tfidf)

    # display results
    display_results(y_test, y_pred)


main()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laura\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 96   1  31]
 [  0  27   8]
 [  4   1 433]]
Accuracy: 0.9251247920133111
