## Install and Import

In [1]:
#!nvidia-smi

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#!pip install nltk

In [1]:
import nltk

In [5]:
# natural language toolkit
#!pip install nltk contractions

🔑    :     https://www.nltk.org/api/nltk.tokenize.html

In [6]:
import nltk
import numpy as np
import pandas as pd 
import contractions

### Notebook settings

In [7]:
pd.set_option('display.max_colwidth', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## Sample Text to process

In [8]:
sample_text= """This is pretty cool. A good quality candy might cost $3.88 in New York. 
                But I don't think we will buy it. Mr.Biden said $1,000,000. 2 cars."""

## Lower casing

In [9]:
sample_text = sample_text.lower()
sample_text

"this is pretty cool. a good quality candy might cost $3.88 in new york. \n                but i don't think we will buy it. mr.biden said $1,000,000. 2 cars."

## Tokenization

In [10]:
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize

In [11]:
dir(nltk.tokenize)

['BlanklineTokenizer',
 'LegalitySyllableTokenizer',
 'LineTokenizer',
 'MWETokenizer',
 'NLTKWordTokenizer',
 'PunktSentenceTokenizer',
 'RegexpTokenizer',
 'ReppTokenizer',
 'SExprTokenizer',
 'SpaceTokenizer',
 'StanfordSegmenter',
 'SyllableTokenizer',
 'TabTokenizer',
 'TextTilingTokenizer',
 'ToktokTokenizer',
 'TreebankWordTokenizer',
 'TweetTokenizer',
 'WhitespaceTokenizer',
 'WordPunctTokenizer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_treebank_word_tokenizer',
 'api',
 'blankline_tokenize',
 'casual',
 'casual_tokenize',
 'destructive',
 'legality_principle',
 'line_tokenize',
 'load',
 'mwe',
 'punkt',
 're',
 'regexp',
 'regexp_span_tokenize',
 'regexp_tokenize',
 'repp',
 'sent_tokenize',
 'sexpr',
 'sexpr_tokenize',
 'simple',
 'sonority_sequencing',
 'stanford_segmenter',
 'string_span_tokenize',
 'texttiling',
 'toktok',
 'treebank',
 'util',
 'word_tokenize',
 'wordpunct_tokenize

In [12]:
nltk.word_tokenize??

#### Sentence Tokenization

In [13]:
# To use the tokenizers, the 'punkt' resource must be available (uncomment to download it)
#nltk.download('punkt')

In [14]:
print(sample_text)

this is pretty cool. a good quality candy might cost $3.88 in new york. 
                but i don't think we will buy it. mr.biden said $1,000,000. 2 cars.


In [15]:
sentence_tokens = sent_tokenize(sample_text)
sentence_tokens

['this is pretty cool.',
 'a good quality candy might cost $3.88 in new york.',
 "but i don't think we will buy it.",
 'mr.biden said $1,000,000.',
 '2 cars.']

#### WordPunct Tokenization

In [16]:
wordpunc_tokens = wordpunct_tokenize(sample_text)
print(wordpunc_tokens)

['this', 'is', 'pretty', 'cool', '.', 'a', 'good', 'quality', 'candy', 'might', 'cost', '$', '3', '.', '88', 'in', 'new', 'york', '.', 'but', 'i', 'don', "'", 't', 'think', 'we', 'will', 'buy', 'it', '.', 'mr', '.', 'biden', 'said', '$', '1', ',', '000', ',', '000', '.', '2', 'cars', '.']


#### Word Tokenization

In [17]:
word_tokens = word_tokenize(sample_text)
print(word_tokens)

['this', 'is', 'pretty', 'cool', '.', 'a', 'good', 'quality', 'candy', 'might', 'cost', '$', '3.88', 'in', 'new', 'york', '.', 'but', 'i', 'do', "n't", 'think', 'we', 'will', 'buy', 'it', '.', 'mr.biden', 'said', '$', '1,000,000', '.', '2', 'cars', '.']


#### Tweet Tokenizer

In [18]:
from nltk.tokenize import TweetTokenizer
tk = TweetTokenizer()

In [19]:
# Create a string input
tweet1 = '&quot;Home home sweet home&quot;' #--> "Home home sweet home"
result = tk.tokenize(tweet1)
result

['"', 'Home', 'home', 'sweet', 'home', '"']

In [20]:
word_tokenize(tweet1)

['&', 'quot', ';', 'Home', 'home', 'sweet', 'home', '&', 'quot', ';']

In [21]:
tweet2 = 'I really dislike him. #VivaUSA @someone I really enjoy reading a book.'
print(tk.tokenize(tweet2))

['I', 'really', 'dislike', 'him', '.', '#VivaUSA', '@someone', 'I', 'really', 'enjoy', 'reading', 'a', 'book', '.']


In [22]:
print(word_tokenize(tweet2))

['I', 'really', 'dislike', 'him', '.', '#', 'VivaUSA', '@', 'someone', 'I', 'really', 'enjoy', 'reading', 'a', 'book', '.']


## Removing Punctuation and Numbers

In [23]:
print(word_tokens)

['this', 'is', 'pretty', 'cool', '.', 'a', 'good', 'quality', 'candy', 'might', 'cost', '$', '3.88', 'in', 'new', 'york', '.', 'but', 'i', 'do', "n't", 'think', 'we', 'will', 'buy', 'it', '.', 'mr.biden', 'said', '$', '1,000,000', '.', '2', 'cars', '.']


In [24]:
# Keep only purely alphabetic tokens. This drops punctuation and numbers, but also
# mixed tokens such as "n't", '3.88' and 'mr.biden' (so the negation is lost here).
# Use .isalnum() instead to also keep purely numeric tokens such as '2'.
tokens_without_punc = [w for w in word_tokens if w.isalpha()]
print(tokens_without_punc)

['this', 'is', 'pretty', 'cool', 'a', 'good', 'quality', 'candy', 'might', 'cost', 'in', 'new', 'york', 'but', 'i', 'do', 'think', 'we', 'will', 'buy', 'it', 'said', 'cars']


## Removing Stopwords

In [25]:
#nltk.download('stopwords')

In [26]:
from nltk.corpus import stopwords

In [27]:
stop_words  = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [28]:
stop_words.__len__()

179

In [29]:
len(stop_words)

179

In [30]:
stop_words2  = stopwords.words('turkish')
print(stop_words2)

['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani']


In [31]:
# Words (mostly negations) that are taken out of the stop-word list so that they
# survive stop-word filtering.
words_to_exclude_from_stopwords = [
    'not', "n't",
    'ain',
    'aren', "aren't",
    'couldn', "couldn't",
    'didn', "didn't",
    'doesn', "doesn't",
    "don't",
    'hadn', "hadn't",
    'hasn', "hasn't",
    'haven', "haven't",
    'isn', "isn't",
    'ma',
    'mightn', "mightn't",
    'mustn', "mustn't",
    'needn', "needn't",
    'shan', "shan't",
    'shouldn', "shouldn't",
    'wasn', "wasn't",
    'weren', "weren't",
    'won', "won't",
    'wouldn', "wouldn't",
]

# Same order as `stop_words`, minus every excluded word.
new_stopwords = list(
    filter(lambda word: word not in words_to_exclude_from_stopwords, stop_words)
)
print('len new_stop_words :', len(new_stopwords))

len new_stop_words : 141


In [32]:
token_without_sw = [t for t in tokens_without_punc if t not in new_stopwords] # stop_words
print(token_without_sw)

['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'cars']


## Lemmatization

In [33]:
from nltk.stem import WordNetLemmatizer

In [34]:
#nltk.download('wordnet')

In [35]:
# Build the lemmatizer once and reuse it instead of constructing a new one per call.
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("driving")
lemmatizer.lemmatize("driver")
lemmatizer.lemmatize("drivers")
lemmatizer.lemmatize("drives")
lemmatizer.lemmatize("drove")

'driving'

'driver'

'driver'

'drive'

'drove'

In [36]:
# Lemmatize every remaining token with one shared lemmatizer instance
# (the original built a new WordNetLemmatizer for each token).
lemmatizer = WordNetLemmatizer()
lem = [lemmatizer.lemmatize(t) for t in token_without_sw]
print(lem)
print(token_without_sw)

['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'car']
['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'cars']


## Stemming

In [37]:
from nltk.stem import PorterStemmer  # only supports English

In [38]:
# Build the stemmer once and reuse it instead of constructing a new one per call.
porter = PorterStemmer()
porter.stem("driving")
porter.stem("driver")
porter.stem("drives")
porter.stem("drove")
porter.stem("Drive")
porter.stem("drivers")
porter.stem("was")

'drive'

'driver'

'drive'

'drove'

'drive'

'driver'

'wa'

In [39]:
# Stem every remaining token with one shared stemmer instance
# (the original built a new PorterStemmer for each token).
porter = PorterStemmer()
stem = [porter.stem(t) for t in token_without_sw]

In [40]:
print('w/o norm :', token_without_sw)
print('stem     :', stem)
print('lemma    :', lem)

w/o norm : ['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'cars']
stem     : ['pretti', 'cool', 'good', 'qualiti', 'candi', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'car']
lemma    : ['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'car']


In [41]:
from nltk.stem import SnowballStemmer  # supports multiple languages such as english, russian, french, italian...
stemmer = SnowballStemmer("english")

drive_statements = ['driving', 'driver', 'drives', 'drove', 'Drive', 'drivers', 'was']

# Iterate over the words directly instead of indexing through range(len(...)).
# The bare expression is displayed for each word because the notebook sets
# InteractiveShell.ast_node_interactivity = 'all'.
for word in drive_statements:
    stemmer.stem(word)

'drive'

'driver'

'drive'

'drove'

'drive'

'driver'

'was'

## Joining

In [42]:
" ".join(lem)
sample_text

'pretty cool good quality candy might cost new york think buy said car'

"this is pretty cool. a good quality candy might cost $3.88 in new york. \n                but i don't think we will buy it. mr.biden said $1,000,000. 2 cars."

### Expanding Contractions

In [43]:
my_text  = word_tokenize(contractions.fix("I won't be there"))
my_text

['I', 'will', 'not', 'be', 'there']

In [44]:
contractions.fix("I won't be there")

'I will not be there'

### Part of Speech Tag


In [45]:
#nltk.download('averaged_perceptron_tagger');

In [46]:
from nltk import pos_tag

In [47]:
text = """Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. 
He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; the chairman and majority shareholder of Pixar; 
a member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT."""

In [48]:
tokens = word_tokenize(text)
print(tokens)
pos = pos_tag(tokens)
pos

['Steven', 'Paul', 'Jobs', 'was', 'an', 'American', 'business', 'magnate', ',', 'industrial', 'designer', ',', 'investor', ',', 'and', 'media', 'proprietor', '.', 'He', 'was', 'the', 'chairman', ',', 'chief', 'executive', 'officer', '(', 'CEO', ')', ',', 'and', 'co-founder', 'of', 'Apple', 'Inc.', ';', 'the', 'chairman', 'and', 'majority', 'shareholder', 'of', 'Pixar', ';', 'a', 'member', 'of', 'The', 'Walt', 'Disney', 'Company', "'s", 'board', 'of', 'directors', 'following', 'its', 'acquisition', 'of', 'Pixar', ';', 'and', 'the', 'founder', ',', 'chairman', ',', 'and', 'CEO', 'of', 'NeXT', '.']


[('Steven', 'NNP'),
 ('Paul', 'NNP'),
 ('Jobs', 'NNP'),
 ('was', 'VBD'),
 ('an', 'DT'),
 ('American', 'JJ'),
 ('business', 'NN'),
 ('magnate', 'NN'),
 (',', ','),
 ('industrial', 'JJ'),
 ('designer', 'NN'),
 (',', ','),
 ('investor', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('media', 'NNS'),
 ('proprietor', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('was', 'VBD'),
 ('the', 'DT'),
 ('chairman', 'NN'),
 (',', ','),
 ('chief', 'JJ'),
 ('executive', 'NN'),
 ('officer', 'NN'),
 ('(', '('),
 ('CEO', 'NNP'),
 (')', ')'),
 (',', ','),
 ('and', 'CC'),
 ('co-founder', 'NN'),
 ('of', 'IN'),
 ('Apple', 'NNP'),
 ('Inc.', 'NNP'),
 (';', ':'),
 ('the', 'DT'),
 ('chairman', 'NN'),
 ('and', 'CC'),
 ('majority', 'NN'),
 ('shareholder', 'NN'),
 ('of', 'IN'),
 ('Pixar', 'NNP'),
 (';', ':'),
 ('a', 'DT'),
 ('member', 'NN'),
 ('of', 'IN'),
 ('The', 'DT'),
 ('Walt', 'NNP'),
 ('Disney', 'NNP'),
 ('Company', 'NNP'),
 ("'s", 'POS'),
 ('board', 'NN'),
 ('of', 'IN'),
 ('directors', 'NNS'),
 ('following', 'VBG'),
 ('its', 

### NER

In [49]:
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

In [50]:
from nltk import ne_chunk
print(text, '\n')
# Use the `ne_chunk` name imported above. Entries without a `label` attribute
# are skipped; for the rest, print the entity label and its joined tokens.
for chunk in ne_chunk(pos):
    if hasattr(chunk, 'label'):
        print(chunk.label(), ' '.join(c[0] for c in chunk))

Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. 
He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; the chairman and majority shareholder of Pixar; 
a member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT. 

PERSON Steven
PERSON Paul Jobs
GPE American
ORGANIZATION CEO
ORGANIZATION Apple Inc.
GPE Pixar
ORGANIZATION Walt Disney Company
GPE Pixar
ORGANIZATION CEO
ORGANIZATION NeXT


In [51]:
# Challenges!
# Apple is shaping the industry. vs Apple means more than fruit
# Costa is a good guy. vs Costa Rica is a beautiful country.

## Cleaning Function

In [52]:
# Compare two ways of handling the contraction "don't" before stop-word removal.
# (a) Tokenize the raw sentence: the "n't" token survives the stop-word filter.
a = "I don't want to fly with your company."
[token for token in word_tokenize(a) if token not in stop_words] 
# (b) Strip the apostrophe first: "dont" becomes a single token that is not a stop word.
b = "I don't want to fly with your company".replace("'", '') 
[token for token in word_tokenize(b) if token not in stop_words]

['I', "n't", 'want', 'fly', 'company', '.']

['I', 'dont', 'want', 'fly', 'company']

In [53]:
def cleaning(data: str) -> str:
    """Normalize raw text into a space-joined string of lemmatized tokens.

    The text is lower-cased, its contractions are expanded and it is tokenized.
    Only purely alphabetic tokens are kept, tokens listed in ``new_stopwords``
    are dropped, and the remaining tokens are lemmatized.

    Args:
        data: Raw input text.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    # 1. Contraction expansion & tokenization
    text_tokens = word_tokenize(contractions.fix(data.lower()))
    # text_tokens = word_tokenize(data.replace("'", '').lower())

    # 2. Remove punctuation & numbers (keep purely alphabetic tokens only)
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    # 3. Remove stop words; a set gives constant-time membership checks
    stopword_set = set(new_stopwords)
    tokens_without_sw = [t for t in tokens_without_punc if t not in stopword_set]

    # 4. Lemmatize with a single lemmatizer instance instead of one per token
    lemmatizer = WordNetLemmatizer()
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    # 5. Join the tokens back into one string
    return " ".join(text_cleaned)

## CountVectorization

#### Data
🔑 Source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment?select=Tweets.csv

In [54]:
# For Colab
# from google.colab import drive
# drive.mount('/content/gdrive')

In [55]:
import pandas as pd
#df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Clarusway_NLP/Clarusway/clarusway-ds-students-7-21-main/3- Classes_Labs/NLP/NLP-1/airline_tweets.csv')
df = pd.read_csv("airline_tweets.csv")

In [56]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [57]:
# Plain Python list holding the first five tweet texts.
data = list(df['text'].head(5))

In [58]:
# Tweet Tokenizer (optional): tokenize each tweet in place
#for i in range(0, len(data)):
#    data[i] = tk.tokenize(data[i])


In [59]:
# Let's check the data
data

['@VirginAmerica What @dhepburn said.',
 "@VirginAmerica plus you've added commercials to the experience... tacky.",
 "@VirginAmerica I didn't today... Must mean I need to take another trip!",
 '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse',
 "@VirginAmerica and it's a really big bad thing about it"]

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
vec = CountVectorizer()
#vec = CountVectorizer(stop_words='english')
docs = vec.fit_transform(data)

In [62]:
# Vocabulary learned by the vectorizer, in column order.
features = vec.get_feature_names_out()  # on older scikit-learn versions use get_feature_names()
features

array(['about', 'added', 'aggressive', 'amp', 'and', 'another', 'bad',
       'big', 'blast', 'commercials', 'dhepburn', 'didn', 'entertainment',
       'experience', 'faces', 'guests', 'have', 'in', 'it', 'little',
       'mean', 'must', 'need', 'obnoxious', 'plus', 'really', 'recourse',
       'said', 'tacky', 'take', 'the', 'they', 'thing', 'to', 'today',
       'trip', 've', 'virginamerica', 'what', 'you', 'your'], dtype=object)

In [63]:
docs


<5x41 sparse matrix of type '<class 'numpy.int64'>'
	with 49 stored elements in Compressed Sparse Row format>

In [64]:
# Use the built-in len() rather than calling the dunder method directly.
len(features)

41

In [65]:
docs.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]])

In [66]:
list(zip(features, docs.toarray().sum(axis=0)))

[('about', 1),
 ('added', 1),
 ('aggressive', 1),
 ('amp', 1),
 ('and', 1),
 ('another', 1),
 ('bad', 1),
 ('big', 1),
 ('blast', 1),
 ('commercials', 1),
 ('dhepburn', 1),
 ('didn', 1),
 ('entertainment', 1),
 ('experience', 1),
 ('faces', 1),
 ('guests', 1),
 ('have', 1),
 ('in', 1),
 ('it', 3),
 ('little', 1),
 ('mean', 1),
 ('must', 1),
 ('need', 1),
 ('obnoxious', 1),
 ('plus', 1),
 ('really', 2),
 ('recourse', 1),
 ('said', 1),
 ('tacky', 1),
 ('take', 1),
 ('the', 1),
 ('they', 1),
 ('thing', 1),
 ('to', 3),
 ('today', 1),
 ('trip', 1),
 ('ve', 1),
 ('virginamerica', 5),
 ('what', 1),
 ('you', 1),
 ('your', 1)]

In [67]:
df_vectorized = pd.DataFrame(docs.toarray(), columns=features)
df_vectorized

Unnamed: 0,about,added,aggressive,amp,and,another,bad,big,blast,commercials,...,they,thing,to,today,trip,ve,virginamerica,what,you,your
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,0,1,0,0,0,0,0,0,0,1,...,0,0,1,0,0,1,1,0,1,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,1,0,1,0,0,0
3,0,0,1,1,0,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,1
4,1,0,0,0,1,0,1,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [68]:
new_data = [text for text in df['text'][5:7]]
new_data

["@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA",
 '@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)']

In [69]:
new_docs = vec.transform(new_data)
new_docs

<2x41 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [70]:
new_docs.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [71]:
print(docs.toarray()[0])
print(new_docs.toarray()[0])

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 1 1 0 0]
[1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0
 1 0 0 0]


### Hashing Vectorizer
Memory efficient

In [72]:
from sklearn.feature_extraction.text import HashingVectorizer

In [73]:
# Check it after the class!

## Important elements of CountVectorization

-> n-gram
-> min_df

### n-gram

> Without *n*-grams, the relative proximity of words is ignored. 
> e.g., "credit score" —> the two words are meaningful together.

> The downside to using n-grams is that it increases memory consumption and training time.

### min_df

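> `min_df` ignores terms that appear in fewer documents than the specified threshold
> 
> - an integer (absolute number of documents) or a float (proportion of documents) is valid
> - filtering out rare terms reduces memory usage and training time
> - `max_df` can also be used, to ignore terms that appear in too many documents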

In [78]:
vec_2 = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df = 2)

In [79]:
docs_2 = vec_2.fit_transform(data)

In [80]:
vec_2.get_feature_names_out()

array(['really', 'virginamerica', 'virginamerica really'], dtype=object)

## TF-IDF

🔑 : sklearn TF-IDF
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d

In [131]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [148]:
train_data = ["Similar problems similar solutions",
              "Why do you ignore these problems",
              "Statistical challenges cause different problems",
              "Sometimes solutions are the simplest and easiest ones"
             ]

test_data = ["Life is beautiful and peaceful",
             "NLP problems mostly require simple solutions"]

In [149]:
tf_idf_vectorizer = TfidfVectorizer()
train_tf_idf = tf_idf_vectorizer.fit_transform(train_data)
test_tf_idf = tf_idf_vectorizer.transform(test_data)

In [150]:
tf_idf_features = tf_idf_vectorizer.get_feature_names_out()
tf_idf_features

array(['and', 'are', 'cause', 'challenges', 'different', 'do', 'easiest',
       'ignore', 'ones', 'problems', 'similar', 'simplest', 'solutions',
       'sometimes', 'statistical', 'the', 'these', 'why', 'you'],
      dtype=object)

In [151]:
train_tf_idf.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.28462634,
        0.89184431, 0.        , 0.35157015, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43003652, 0.        , 0.43003652, 0.        , 0.27448674,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.43003652, 0.43003652, 0.43003652],
       [0.        , 0.        , 0.47633035, 0.47633035, 0.47633035,
        0.        , 0.        , 0.        , 0.        , 0.30403549,
        0.        , 0.        , 0.        , 0.        , 0.47633035,
        0.        , 0.        , 0.        , 0.        ],
       [0.36222393, 0.36222393, 0.        , 0.        , 0.        ,
        0.        , 0.36222393, 0.        , 0.36222393, 0.        ,
        0.        , 0.36222393, 0.2855815 , 0.36222393, 0.        ,
        0.362

In [152]:
pd.DataFrame(train_tf_idf.toarray(), columns = tf_idf_features)

Unnamed: 0,and,are,cause,challenges,different,do,easiest,ignore,ones,problems,similar,simplest,solutions,sometimes,statistical,the,these,why,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284626,0.891844,0.0,0.35157,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.430037,0.0,0.430037,0.0,0.274487,0.0,0.0,0.0,0.0,0.0,0.0,0.430037,0.430037,0.430037
2,0.0,0.0,0.47633,0.47633,0.47633,0.0,0.0,0.0,0.0,0.304035,0.0,0.0,0.0,0.0,0.47633,0.0,0.0,0.0,0.0
3,0.362224,0.362224,0.0,0.0,0.0,0.0,0.362224,0.0,0.362224,0.0,0.0,0.362224,0.285582,0.362224,0.0,0.362224,0.0,0.0,0.0


In [153]:
train_data

['Similar problems similar solutions',
 'Why do you ignore these problems',
 'Statistical challenges cause different problems',
 'Sometimes solutions are the simplest and easiest ones']

In [154]:
# Clean every training document; builds a new list and leaves train_data untouched.
clean_data = [cleaning(doc) for doc in train_data]
    

In [155]:
clean_data

['similar problem similar solution',
 'ignore problem',
 'statistical challenge different problem',
 'sometimes solution simplest easiest one']

In [156]:
# Clean every test document; builds a new list and leaves test_data untouched.
clean_test_data = [cleaning(doc) for doc in test_data]

In [157]:
cl_tf_idf_vectorizer = TfidfVectorizer()
cl_train_tf_idf = cl_tf_idf_vectorizer.fit_transform(clean_data)
cl_test_tf_idf = cl_tf_idf_vectorizer.transform(clean_test_data)

In [158]:
cl_features = cl_tf_idf_vectorizer.get_feature_names_out()
cl_features

array(['challenge', 'different', 'easiest', 'ignore', 'one', 'problem',
       'similar', 'simplest', 'solution', 'sometimes', 'statistical'],
      dtype=object)

In [159]:
tf_idf_features

array(['and', 'are', 'cause', 'challenges', 'different', 'do', 'easiest',
       'ignore', 'ones', 'problems', 'similar', 'simplest', 'solutions',
       'sometimes', 'statistical', 'the', 'these', 'why', 'you'],
      dtype=object)

In [160]:
# Vocabulary size before vs. after cleaning (built-in len() instead of the dunder method).
print(len(tf_idf_features))
print(len(cl_features))

19
11


In [161]:
cl_test_tf_idf.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.62922751, 0.        , 0.        , 0.77722116, 0.        ,
        0.        ]])