<a href="https://colab.research.google.com/github/kemaladamr/NLP_Fundamentals/blob/main/Basic_FeatureExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Cleaning and Tokenization

In [1]:
import re

In [2]:
# Sample tweet-like text used by the cleaning and tokenization examples.
# Adjacent string literals inside the parentheses are concatenated into one string.
sentence = (
    'Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, '
    'New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official '
    '@indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"'
)

In [3]:
# Replace each run of punctuation or underscores (any character that is neither
# whitespace nor a word character, plus '_') with one space, then split on whitespace.
re.sub(r'([^\s\w]|_)+', ' ', sentence).split()

['Sunil',
 'tweeted',
 'Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 'New',
 'Delhi',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army',
 'Awesome',
 'airshow',
 'india',
 'official',
 'indian',
 'army',
 'India',
 '70thRepublic',
 'Day',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 'photoking',
 'com']

## Extracting n-grams

In [4]:
def n_gram_extractor(sentence, n):
    """Print every n-gram of ``sentence`` as a list of tokens, one per line.

    The sentence is tokenized by replacing each run of punctuation or
    underscores with a space and then splitting on whitespace.

    Args:
        sentence: Text to extract the n-grams from.
        n: Number of tokens per n-gram; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard n == 0 prints empty lists and a negative n prints
    # arbitrary slices instead of n-grams.
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    tokens = re.sub(r'([^\s\w]|_)+', ' ', sentence).split()
    # range() is empty when there are fewer than n tokens, so nothing is printed.
    for i in range(len(tokens) - n + 1):
        print(tokens[i:i + n])

bi-grams

In [5]:
n_gram_extractor('The cute little boy is playing with the kitten.', 2)

['The', 'cute']
['cute', 'little']
['little', 'boy']
['boy', 'is']
['is', 'playing']
['playing', 'with']
['with', 'the']
['the', 'kitten']


tri-grams

In [6]:
n_gram_extractor('The cute little boy is playing with the kitten.', 3)

['The', 'cute', 'little']
['cute', 'little', 'boy']
['little', 'boy', 'is']
['boy', 'is', 'playing']
['is', 'playing', 'with']
['playing', 'with', 'the']
['with', 'the', 'kitten']


In [7]:
from nltk import ngrams
# str.split() only splits on whitespace, so the trailing period stays attached
# to the last token ('kitten.') in the resulting bigrams.
list(ngrams('The cute little boy is playing with the kitten.'.split(), 2))

[('The', 'cute'),
 ('cute', 'little'),
 ('little', 'boy'),
 ('boy', 'is'),
 ('is', 'playing'),
 ('playing', 'with'),
 ('with', 'the'),
 ('the', 'kitten.')]

In [8]:
list(ngrams('The cute little boy is playing with the kitten.'.split(), 3))

[('The', 'cute', 'little'),
 ('cute', 'little', 'boy'),
 ('little', 'boy', 'is'),
 ('boy', 'is', 'playing'),
 ('is', 'playing', 'with'),
 ('playing', 'with', 'the'),
 ('with', 'the', 'kitten.')]

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
from textblob import TextBlob
# TextBlob tokenizes the raw text itself; the bigrams it returns do not
# include the trailing period.
blob = TextBlob('The cute little boy is playing with the kitten.')
blob.ngrams(n=2)

[WordList(['The', 'cute']),
 WordList(['cute', 'little']),
 WordList(['little', 'boy']),
 WordList(['boy', 'is']),
 WordList(['is', 'playing']),
 WordList(['playing', 'with']),
 WordList(['with', 'the']),
 WordList(['the', 'kitten'])]

In [11]:
blob.ngrams(n=3)

[WordList(['The', 'cute', 'little']),
 WordList(['cute', 'little', 'boy']),
 WordList(['little', 'boy', 'is']),
 WordList(['boy', 'is', 'playing']),
 WordList(['is', 'playing', 'with']),
 WordList(['playing', 'with', 'the']),
 WordList(['with', 'the', 'kitten'])]

## Tokenizing Texts with Keras and TextBlob

In [12]:
sentence

'Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official @indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"'

In [13]:
from keras.preprocessing.text import text_to_word_sequence

In [14]:
text_to_word_sequence(sentence)

['sunil',
 'tweeted',
 'witnessing',
 '70th',
 'republic',
 'day',
 'of',
 'india',
 'from',
 'rajpath',
 'new',
 'delhi',
 'mesmerizing',
 'performance',
 'by',
 'indian',
 'army',
 'awesome',
 'airshow',
 'india',
 'official',
 'indian',
 'army',
 'india',
 '70threpublic',
 'day',
 'for',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 'photoking',
 'com']

In [15]:
# TextBlob drops standalone punctuation but keeps underscores
# ('india_official') and the dot inside 'photoking.com'.
blob = TextBlob(sentence)
blob.words

WordList(['Sunil', 'tweeted', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', 'New', 'Delhi', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', 'Awesome', 'airshow', 'india_official', 'indian_army', 'India', '70thRepublic_Day', 'For', 'more', 'photos', 'ping', 'me', 'sunil', 'photoking.com'])

## Tokenizing Text using Various Tokenizers

In [16]:
sentence

'Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official @indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"'

In [17]:
from nltk.tokenize import TweetTokenizer
# TweetTokenizer keeps @mentions, #hashtags, the e-mail address and the ':)'
# emoticon as single tokens and emits punctuation marks as separate tokens.
tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize(sentence)

['Sunil',
 'tweeted',
 ',',
 '"',
 'Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 ',',
 'New',
 'Delhi',
 '.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army',
 '!',
 'Awesome',
 'airshow',
 '!',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day',
 '.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil@photoking.com',
 ':)',
 '"']

In [18]:
from nltk.tokenize import MWETokenizer
# Register two multi-word expressions that should be merged into single tokens.
mwe_tokenizer = MWETokenizer([('Republic', 'Day')])
mwe_tokenizer.add_mwe(('Indian', 'Army'))
# The input is whitespace-split, so 'Army!' keeps its '!' and does not match
# ('Indian', 'Army'); only 'Republic_Day' is merged in this run.
mwe_tokenizer.tokenize(sentence.split())

['Sunil',
 'tweeted,',
 '"Witnessing',
 '70th',
 'Republic_Day',
 'of',
 'India',
 'from',
 'Rajpath,',
 'New',
 'Delhi.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army!',
 'Awesome',
 'airshow!',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil@photoking.com',
 ':)"']

In [19]:
# With the '!' characters removed, 'Indian' 'Army' matches the registered
# expression and is merged into 'Indian_Army'.
mwe_tokenizer.tokenize(sentence.replace('!', '').split())

['Sunil',
 'tweeted,',
 '"Witnessing',
 '70th',
 'Republic_Day',
 'of',
 'India',
 'from',
 'Rajpath,',
 'New',
 'Delhi.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian_Army',
 'Awesome',
 'airshow',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil@photoking.com',
 ':)"']

In [20]:
from nltk.tokenize import RegexpTokenizer
# Raw string so the regex escapes (\w, \$, \d, \., \S) reach the regex engine
# unchanged instead of being parsed as invalid Python string escapes.
# The alternatives are tried in order: a run of word characters, a dollar
# amount, or any other run of non-whitespace characters.
reg_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
reg_tokenizer.tokenize(sentence)

['Sunil',
 'tweeted',
 ',',
 '"Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 ',',
 'New',
 'Delhi',
 '.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army',
 '!',
 'Awesome',
 'airshow',
 '!',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 '@photoking.com',
 ':)"']

In [21]:
from nltk.tokenize import WhitespaceTokenizer
# Splits on whitespace only, so punctuation stays attached to the
# neighbouring word (e.g. 'tweeted,' and 'Army!').
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence)

['Sunil',
 'tweeted,',
 '"Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath,',
 'New',
 'Delhi.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army!',
 'Awesome',
 'airshow!',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil@photoking.com',
 ':)"']

In [22]:
from nltk.tokenize import WordPunctTokenizer
# Separates runs of word characters from runs of punctuation: '@india_official'
# becomes '@' and 'india_official', while the trailing ':)"' stays one token.
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence)

['Sunil',
 'tweeted',
 ',',
 '"',
 'Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 ',',
 'New',
 'Delhi',
 '.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army',
 '!',
 'Awesome',
 'airshow',
 '!',
 '@',
 'india_official',
 '@',
 'indian_army',
 '#',
 'India',
 '#',
 '70thRepublic_Day',
 '.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 '@',
 'photoking',
 '.',
 'com',
 ':)"']

## Converting words in gerund form into base words using
## RegexpStemmer


In [23]:
sentence = 'I love playing football'

In [24]:
from nltk.stem import RegexpStemmer
# Strip a trailing 'ing' from each whitespace-separated word.
# NOTE(review): min=4 is presumably the minimum word length that gets stemmed
# (per the NLTK docs) — confirm.
regex_stemmer = RegexpStemmer('ing$', min=4)
' '.join([regex_stemmer.stem(wd) for wd in sentence.split()])

'I love play football'

## The Porter Stemmer

In [25]:
# Example sentence for the Porter stemmer; rebinds the `sentence` name.
sentence = "Before eating, it would be nice to sanitize your hands with a sanitizer"
# NOTE(review): wildcard import; of the names it brings in, only PorterStemmer
# is used in the next cell — consider importing it explicitly.
from nltk.stem.porter import *

In [26]:
ps_stemmer = PorterStemmer()
# Words come from str.split(), so 'eating,' still carries its comma and is
# left unstemmed in the result.
' '.join([ps_stemmer.stem(wd) for wd in sentence.split()])

'befor eating, it would be nice to sanit your hand with a sanit'

## Lemmatization

In [27]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
# Fetch the WordNet corpus (presumably required by WordNetLemmatizer — confirm).
# `nltk` itself is imported in an earlier cell.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
sentence = "The products produce by the process today are far better than what it produces generally."
# Lemmatize token by token and re-join with spaces; in the result 'products'
# becomes 'product' and 'produces' becomes 'produce'.
' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(sentence)])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


'The product produce by the process today are far better than what it produce generally .'

## Singularizing and Pluralizing Words

In [28]:
sentence = TextBlob('She sells seashells on the seashore')

In [29]:
sentence.words

WordList(['She', 'sells', 'seashells', 'on', 'the', 'seashore'])

In [30]:
sentence.words[2].singularize()

'seashell'

In [31]:
sentence.words[5].pluralize()

'seashores'

## Language Translation

In [None]:
# NOTE(review): this translation example is left disabled; presumably
# TextBlob.translate() is unavailable or unreliable in the installed TextBlob
# version — confirm before re-enabling.
# en_blob = TextBlob(u'muy bien')
# print(en_blob.translate(from_lang='es', to='en'))

## Stop-Word Removal

In [33]:
sentence = "She sells seashells on the seashore"

In [34]:
# Stop words are stored in lower case; each token is lower-cased before the
# lookup so that the capitalized 'She' is removed as well.
custom_stop_word_list = ['she', 'on', 'the', 'am', 'is', 'not']
' '.join([word for word in word_tokenize(sentence) if word.lower() not in custom_stop_word_list])

'sells seashells seashore'

## Extracting General Features from Raw Text

In [35]:
import pandas as pd

# Four sample sentences in a single 'text' column; the feature-extraction
# cells add further columns to this frame.
df = pd.DataFrame({
    'text': [
        'The interim budget for 2019 will be anonunced on 1st February.',
        'Do you know how much expectation the middle-class working population is having from the budget?',
        'February is the most shortest month in a year.',
        'This financial year will end on 31st March.',
    ]
})
df

Unnamed: 0,text
0,The interim budget for 2019 will be anonunced ...
1,Do you know how much expectation the middle-cl...
2,February is the most shortest month in a year.
3,This financial year will end on 31st March.


In [36]:
from textblob import TextBlob
# Number of words per row, as tokenized by TextBlob.
df['number_of_words'] = df['text'].apply(lambda x: len(TextBlob(str(x)).words))
df['number_of_words']

0    11
1    15
2     9
3     8
Name: number_of_words, dtype: int64

In [37]:
# Question words to look for, stored in lower case.
wh_words = set(['why', 'who', 'which', 'what', 'where', 'when', 'how'])
# Lower-case every token before the lookup so that a capitalized,
# sentence-initial wh-word (e.g. 'How ...') is detected too.
# isdisjoint() already yields a bool, so no `True if ... else False` is needed.
df['is_wh_words_present'] = df['text'].apply(
    lambda x: not wh_words.isdisjoint(word.lower() for word in TextBlob(str(x)).words))
df['is_wh_words_present']

0    False
1     True
2    False
3    False
Name: is_wh_words_present, dtype: bool

In [38]:
# Sentiment polarity score of each row, as reported by TextBlob.
df['polarity'] = df['text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
df['polarity']

0    0.0
1    0.2
2    0.5
3    0.0
Name: polarity, dtype: float64

In [39]:
# Sentiment subjectivity score of each row, as reported by TextBlob.
df['subjectivity'] = df['text'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
df['subjectivity']

0    0.0
1    0.2
2    0.5
3    0.0
Name: subjectivity, dtype: float64

In [None]:
# NOTE(review): language detection is left disabled; presumably
# TextBlob.detect_language() is unavailable or unreliable in the installed
# TextBlob version — confirm before re-enabling.
# df['language'] = df['text'].apply(lambda x: TextBlob(str(x)).detect_language())
# df['language']

## Creating BoW (Bag of Words)

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus of four sentences used by the bag-of-words examples.
corpus = [
          'Data Science is an overlap between Arts and Science',
          'Generally, Arts graduates are right-brained and Science graduates are left-brained',
          'Excelling in both Arts and Science at a time becomes difficult',
          'Natural Language Processing is a part of Data Science'
]

In [42]:
bag_of_words_model = CountVectorizer()
# Fit and transform once, then reuse the dense count matrix for both the
# printout and the DataFrame instead of fitting the same corpus twice.
bag_of_words_matrix = bag_of_words_model.fit_transform(corpus).todense()
print(bag_of_words_matrix)
bag_of_word_df = pd.DataFrame(bag_of_words_matrix)
# Label the columns with the vocabulary terms in alphabetical order.
# NOTE(review): assumes CountVectorizer assigns column indices in that same
# alphabetical order — confirm against the scikit-learn docs.
bag_of_word_df.columns = sorted(bag_of_words_model.vocabulary_)
bag_of_word_df.head()

[[1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 2 0]
 [0 1 2 1 0 0 0 0 2 0 0 0 1 2 0 0 0 1 0 0 0 0 0 1 1 0]
 [0 1 0 1 1 1 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1 0]]


Unnamed: 0,an,and,are,arts,at,becomes,between,both,brained,data,difficult,excelling,generally,graduates,in,is,language,left,natural,of,overlap,part,processing,right,science,time
0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,2,0
1,0,1,2,1,0,0,0,0,2,0,0,0,1,2,0,0,0,1,0,0,0,0,0,1,1,0
2,0,1,0,1,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,1,1,0,1,0


In [43]:
# Bag-of-words limited to 10 vocabulary terms via max_features=10.
bag_of_words_model_small = CountVectorizer(max_features=10)
bag_of_word_df_small = pd.DataFrame(bag_of_words_model_small.fit_transform(corpus).todense())
# Label the columns with the retained terms in alphabetical order.
bag_of_word_df_small.columns = sorted(bag_of_words_model_small.vocabulary_)
bag_of_word_df_small.head()

Unnamed: 0,an,and,are,arts,brained,data,graduates,is,right,science
0,1,1,0,1,0,1,0,1,0,2
1,0,1,2,1,2,0,2,0,1,1
2,0,1,0,1,0,0,0,0,0,1
3,0,0,0,0,0,1,0,1,0,1


## Zipf's Law

In [45]:
# NOTE(review): wildcard import; it injects many names into the notebook
# namespace and hides where they come from — consider explicit imports.
from pylab import *
# `nltk` itself is imported in an earlier cell.
nltk.download('stopwords')
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
%matplotlib inline
import string
from collections import Counter
# Load the training subset of the 20 Newsgroups dataset.
newsgroups_data_sample = fetch_20newsgroups(subset='train')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
