In [1]:
## NLP library
import re
import string
import nltk
from nltk.corpus import stopwords

## ML Library
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import pickle

import orchest

In [2]:
# Retrieve the inputs handed to this notebook step through orchest.
data = orchest.get_inputs()
# The "data" entry must unpack into exactly two objects: the train and test sets
# (presumably pandas DataFrames, given the `.text` column access used below).
train, test = data["data"]

In [3]:
# Preview the first rows of the training text column as received, before cleaning.
train.text.head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

## Text Cleaning

In [4]:
def text_processing(data):
    """Return a copy of ``data`` whose ``text`` column has been normalised.

    Every entry of the ``text`` column goes through these steps, in order:

    1. lower-casing;
    2. removal of ``[...]`` spans;
    3. removal of ``<...>`` tags;
    4. removal of ``http://`` / ``https://`` / ``www.`` links;
    5. removal of every character in ``string.punctuation``;
    6. removal of newline characters;
    7. removal of any word that contains a digit.

    Parameters
    ----------
    data : DataFrame-like
        Object exposing ``.copy()`` and a ``text`` column of strings.

    Returns
    -------
    DataFrame-like
        A new object with the cleaned ``text`` column. The object passed in is
        left untouched, so the cell calling this function can be re-run safely
        and frames displayed earlier in the notebook stay accurate.
    """
    # Raw strings keep regex escapes such as \[ \S \w \d from being parsed as
    # (invalid) Python string escapes. Compiling once avoids a pattern lookup
    # for every row. Order matters: punctuation is stripped before the digit
    # rule, so "13,000" collapses to "13000" and is then dropped as one word.
    patterns = [
        re.compile(r'\[.*?\]'),                                # square brackets
        re.compile(r'<.*?>+'),                                 # angle-bracket tags
        re.compile(r'https?://\S+|www\.\S+'),                  # hyperlinks
        re.compile('[%s]' % re.escape(string.punctuation)),    # punctuation
        # NOTE(review): newlines are deleted, not replaced by a space, so the
        # words on either side get fused. Kept as-is to preserve existing output.
        re.compile(r'\n'),
        re.compile(r'\w*\d\w*'),                               # words containing numbers
    ]

    def clean(text):
        """Apply every cleaning step to a single string."""
        text = text.lower()
        for pattern in patterns:
            text = pattern.sub('', text)
        return text

    cleaned = data.copy()
    cleaned["text"] = cleaned["text"].apply(clean)
    return cleaned

In [5]:
# Run the cleaning routine over both splits (train first, then test)
train, test = text_processing(train), text_processing(test)
# Preview the cleaned training text
train.text.head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

## Tokenization
Tokenization is the process of splitting a string of text into a list of tokens. A token is a unit of a larger piece of text: a word is a token of a sentence, and a sentence is a token of a paragraph.

In [6]:
# Tokenizer that extracts every run of word characters (\w+) as one token
token = nltk.tokenize.RegexpTokenizer(r'\w+')
# Replace each cleaned string with its list of tokens, for both splits
train.text = train.text.apply(token.tokenize)
test.text = test.text.apply(token.tokenize)
# Preview the tokenized training text
display(train.text.head())

0    [our, deeds, are, the, reason, of, this, earth...
1        [forest, fire, near, la, ronge, sask, canada]
2    [all, residents, asked, to, shelter, in, place...
3    [people, receive, wildfires, evacuation, order...
4    [just, got, sent, this, photo, from, ruby, ala...
Name: text, dtype: object

In [7]:
## Removing stop words

In [8]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# Build the lookup once. Calling stopwords.words('english') inside the
# comprehension rebuilds the word list and scans it linearly for every single
# token; a set built up front gives the same membership result in O(1).
english_stopwords = set(stopwords.words('english'))
# Drop stop words from every token list, for both splits.
train.text = train.text.apply(lambda x: [w for w in x if w not in english_stopwords])
test.text = test.text.apply(lambda x: [w for w in x if w not in english_stopwords])
#view
train.text.head()

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0    [deeds, reason, earthquake, may, allah, forgiv...
1        [forest, fire, near, la, ronge, sask, canada]
2    [residents, asked, shelter, place, notified, o...
3    [people, receive, wildfires, evacuation, order...
4    [got, sent, photo, ruby, alaska, smoke, wildfi...
Name: text, dtype: object

## Stemming
Stemming and Lemmatization in Python NLTK are text normalization techniques for Natural Language Processing. These techniques are widely used for text preprocessing. The difference between stemming and lemmatization is that stemming is faster as it cuts words without knowing the context, while lemmatization is slower as it knows the context of words before processing.

**In this case, PorterStemmer performed better than lemmatization.**

In [9]:
# Stem every token with the Porter stemmer and join the stems back into one
# space-separated string per row
stemmer = nltk.stem.PorterStemmer()
train.text = train.text.apply(lambda words: " ".join(map(stemmer.stem, words)))
test.text = test.text.apply(lambda words: " ".join(map(stemmer.stem, words)))
# Preview the stemmed training text
train.text.head()

0            deed reason earthquak may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3          peopl receiv wildfir evacu order california
4    got sent photo rubi alaska smoke wildfir pour ...
Name: text, dtype: object

## Text Vectorization
Machine learning algorithms most often take numeric feature vectors as input. Thus, when working with text documents, we need a way to convert each document into a numeric vector.

**In this case, CountVectorizer performed best.**

In [10]:
# Fit the bag-of-words vectorizer on the training text only, then encode the
# test text with that same fitted vectorizer.
count_vectorizer = CountVectorizer()
train_vectors_count = count_vectorizer.fit_transform(train.text)
test_vectors_count = count_vectorizer.transform(test.text)

In [11]:
# Publish the vectorized training features together with the `target` column
# as a single tuple under the output name "train".
orchest.output((train_vectors_count, train["target"]), name="train")
# Publish the vectorized test features under the output name "test".
orchest.output(test_vectors_count, name="test")