<a href="https://colab.research.google.com/github/jadhav-rakesh/ML/blob/main/ds6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

#Cleaning Text

In [2]:
# unstructured text data and want to complete some basic cleaning.

text_data = ["   Interrobang. By Aishwarya Henriette     ",
             "Parking And Going. By Karl Gautier",
             "    Today Is The night. By Jarek Prakash   "]

strip_whitespace = [string.strip() for string in text_data]

strip_whitespace

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [4]:
remove_periods = [string.replace(".", "") for string in strip_whitespace]

remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [5]:
def capitalizer(string: str) -> str:
    return string.upper()

[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [6]:
import re

def replace_letters_with_X(string: str) -> str:
    return re.sub(r"[a-zA-Z]", "X", string)

[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

In [10]:
s = "machine learning in python cookbook"

find_n = s.find("n")

satrts_with_m = s.startswith("m")

ends_with_python = s.endswith("python")

is_alnum = s.isalnum()

is_alpha = s.isalpha()

encode_as_utf8 = s.encode("utf-8")

decode = encode_as_utf8.decode("utf-8")

print(find_n,
      satrts_with_m,
      ends_with_python,
      is_alnum,
      is_alpha,
      encode_as_utf8,
      decode,
      sep="\n")

5
True
False
False
False
b'machine learning in python cookbook'
machine learning in python cookbook


#Parsing and Cleaning HTML

In [15]:
#You have text data with HTML elements and want to extract just the text.

from bs4 import BeautifulSoup

html = "<div class='full_name'>" \
        "<span style='font-weight:bold'>Masego"\
        "</span> Azra</div>"

soup = BeautifulSoup(html, "lxml")

soup.find("div", {"class": "full_name"}).text

'Masego Azra'

#Removing Punctuation

In [19]:
#ou have a feature of text data and want to remove punctuation.

import unicodedata
import sys

text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']

punctuation = dict.fromkeys(
    (i for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith('P')
    ),
     None
)

[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

#Tokenizing Text

In [21]:
!pip install nltk



In [24]:
#You have text and want to break it up into individual words

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

string = "The science of today is the technology of tomorrow"

word_tokenize(string)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [25]:
# tokenize into sentences:

from nltk.tokenize import sent_tokenize

string = "The science of today is the technology of tomorrow. Tomorrow is today"

sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today']

#Removing Stop Words

In [28]:
#Given tokenized text data, you want to remove extremely common words (e.g., a, is, of, on) that contain little informational value.

from nltk.corpus import stopwords

import nltk
nltk.download("stopwords")

tokenized_words = ['i',
                   'am',
                   'going',
                   'to',
                   'go',
                   'to',
                   'the',
                   'store',
                   'and',
                   'park']

stop_words = stopwords.words("english")

[word for word in tokenized_words if word not in stop_words]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['going', 'go', 'store', 'park']

In [29]:
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

#Stemming Words

In [30]:
#You have tokenized words and want to convert them into their root forms.

from nltk.stem.porter import PorterStemmer

tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

porter = PorterStemmer()

[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

#Tagging Parts of Speech

In [32]:
#You have text data and want to tag each word or character with its part of speech.

from nltk import pos_tag
from nltk import word_tokenize
nltk.download('averaged_perceptron_tagger_eng')

text_data = "Chris loved outdoor running"

text_tagged = pos_tag(word_tokenize(text_data))

text_tagged

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [33]:
[word for word, tag in text_tagged if tag in ["NN", "NNS", "NNP", "NNPS"]]

['Chris']

In [34]:
from sklearn.preprocessing import MultiLabelBinarizer

tweets = ["I am eating a burrito for breakfast",
          "Political science is an amazing field",
          "San Francisco is an awesome city"]

tagged_tweets = []

for tweet in tweets:
    tweet_tag = nltk.pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])

one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [35]:
one_hot_multi.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

#Performing Named-Entity Recognition

In [36]:
!pip install spacy



In [42]:
#You want to perform named-entity recognition in freeform text (such as “Person,” “State,” etc.).

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Elon Musk offered to buy Twitter using $21B of his own money.")

print(doc.ents)

for entity in doc.ents:
    print(entity.text, entity.label_, sep=",")

(Elon Musk, 21B)
Elon Musk,PERSON
21B,MONEY


#Encoding Text as a Bag of Words

In [45]:
#You have text data and want to create a set of features indicating the number of times an observation’s text contains a particular word.

from sklearn.feature_extraction.text import CountVectorizer

text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

count = CountVectorizer()

bag_of_words = count.fit_transform(text_data)

bag_of_words

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [46]:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]])

In [47]:
count.get_feature_names_out()

array(['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love',
       'sweden'], dtype=object)

* Bag-of-words models output a feature for every unique word in text data, with each feature containing a count of occurrences in observations.

In [48]:
count_2gram = CountVectorizer(ngram_range=(1,2),
                              stop_words="english",
                              vocabulary=["brazil"])

bag = count_2gram.fit_transform(text_data)

bag.toarray()

array([[2],
       [0],
       [0]])

In [49]:
count_2gram.vocabulary_

{'brazil': 0}

#Weighting Word Importance

In [56]:
#You want a bag of words with words weighted by their importance to an observation.

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

text_data =  np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

# Create a CountVectorizer object
count = CountVectorizer()

# Fit and transform the text data to get a word count matrix
word_count_matrix = count.fit_transform(text_data)

# Create a TfidfTransformer object
tfidf = TfidfTransformer()

# Fit and transform the word count matrix to get TF-IDF features
feature_matrix = tfidf.fit_transform(word_count_matrix)
feature_matrix

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [57]:
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [58]:
count.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

#Using Text Vectors to Calculate Text Similarity in a Search Query

In [60]:
#use  tf-idf vectors to implement a text search function in Python.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

text = "Brazil is the best"
vector = tfidf.transform([text])

cosine_similarities = linear_kernel(vector, feature_matrix).flatten()

related_doc_indicies = cosine_similarities.argsort()[:-10:-1]

print([(text_data[i], cosine_similarities[i]) for i in related_doc_indicies])

[('Sweden is best', 0.6666666666666666), ('I love Brazil. Brazil!', 0.5163977794943222), ('Germany beats both', 0.0)]


#Using a Sentiment Analysis Classifier

In [61]:
#You want to classify the sentiment of some text to use as a feature or in downstream data analysis

from transformers import pipeline

classifier = pipeline("sentiment-analysis")

sentiment_1 = classifier("I hate machine learning! It's the absolute worst.")
sentiment_2 = classifier(
    "Machine learning is the absolute"
    "bees knees I love it so much!"
)

print(sentiment_1, sentiment_2)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.9998020529747009}] [{'label': 'POSITIVE', 'score': 0.9995730519294739}]
