<a href="https://colab.research.google.com/github/khalida-mujahid/ML-practice/blob/master/batch9_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [0]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
corpus = ['Atta is in blue shirt. He is not doing anything. Will he do something? _PAD_']

In [0]:
word_tokenize(corpus[0])

['Atta',
 'is',
 'in',
 'blue',
 'shirt',
 '.',
 'He',
 'is',
 'not',
 'doing',
 'anything',
 '.',
 'Will',
 'he',
 'do',
 'something',
 '?']

In [0]:
tokenized = sent_tokenize(corpus[0])

In [0]:
tokenized[2]

'Will he do something?'

In [0]:
tokenized_sentences = sent_tokenize(corpus[0])

In [0]:
print(tokenized_sentences)

['Atta is in blue shirt.', 'He is not doing anything.', 'Will he do something?']


In [0]:
tokenized_words = [word_tokenize(sentence) for sentence in tokenized_sentences]
print(tokenized_words)

[['Atta', 'is', 'in', 'blue', 'shirt', '.'], ['He', 'is', 'not', 'doing', 'anything', '.'], ['Will', 'he', 'do', 'something', '?']]


In [0]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [0]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [0]:
tokenized_words[0]

['Atta', 'is', 'in', 'blue', 'shirt', '.']

In [0]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
stemmed_words = [[stemmer.stem(word) for word in words] for words in tokenized_words]
print(stemmed_words)

[['atta', 'is', 'in', 'blue', 'shirt', '.'], ['He', 'is', 'not', 'do', 'anyth', '.'], ['will', 'he', 'do', 'someth', '?']]


In [0]:
lemmatized_words = [[lemmatizer.lemmatize(word, pos = 'v') for word in words] for words in tokenized_words]
print(lemmatized_words)

[['Atta', 'be', 'in', 'blue', 'shirt', '.'], ['He', 'be', 'not', 'do', 'anything', '.'], ['Will', 'he', 'do', 'something', '?']]


In [0]:
from nltk.corpus import stopwords

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
sw = set(stopwords.words('english'))

In [0]:
lemmatized_words = [[word for word in words if word not in sw] for words in tokenized_words]
print(lemmatized_words)

[['Atta', 'blue', 'shirt', '.'], ['He', 'anything', '.'], ['Will', 'something', '?']]


In [0]:
import re

In [0]:
matched = re.findall('\w', corpus[0])
print(matched)

['A', 't', 't', 'a', 'i', 's', 'i', 'n', 'b', 'l', 'u', 'e', 's', 'h', 'i', 'r', 't', 'H', 'e', 'i', 's', 'n', 'o', 't', 'd', 'o', 'i', 'n', 'g', 'a', 'n', 'y', 't', 'h', 'i', 'n', 'g', 'W', 'i', 'l', 'l', 'h', 'e', 'd', 'o', 's', 'o', 'm', 'e', 't', 'h', 'i', 'n', 'g']


In [0]:
[1, 1, 1, 0, 0, 0, 0]
[0, 0, 0, 1, 1, 0, 0]
[0, 0, 0, 0, 0, 1, 1]

[0, 0, 0, 0, 0, 1, 1]

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

In [0]:
bag_of_words = CountVectorizer(tokenizer = word_tokenize, stop_words = sw)

In [0]:
X = bag_of_words.fit_transform(tokenized_sentences)

In [0]:
bag_of_words.vocabulary_

{'.': 0,
 '?': 1,
 'anything': 2,
 'atta': 3,
 'blue': 4,
 'shirt': 5,
 'something': 6}

In [0]:
(X.toarray())

array([[1, 0, 0, 1, 1, 1, 0],
       [1, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1]], dtype=int64)

In [0]:
Atta is in blue shirt.

In [0]:
Tfidf = TfidfVectorizer(input = 'filename', tokenizer = word_tokenize, stop_words = sw)

In [0]:
from google.colab import files
files.upload()

Saving Yo2.txt to Yo2.txt


{'Yo2.txt': b'Hey guys, how are you? Hey What up any updates about TPU. Nothing yet, still working'}

In [0]:
X = Tfidf.fit_transform(['Yo.txt', 'Yo2.txt'])

In [0]:
X.toarray()

array([[0.48507125, 0.24253563, 0.24253563, 0.24253563, 0.48507125,
        0.24253563, 0.24253563, 0.24253563, 0.24253563, 0.24253563,
        0.24253563],
       [0.48507125, 0.24253563, 0.24253563, 0.24253563, 0.48507125,
        0.24253563, 0.24253563, 0.24253563, 0.24253563, 0.24253563,
        0.24253563]])

In [0]:
X.toarray()

array([[1.]])

In [0]:
Tfidf.vocabulary_

{'.': 0,
 '?': 1,
 'anything': 2,
 'atta': 3,
 'blue': 4,
 'shirt': 5,
 'something': 6}