NLP is a subfield of AI that focuses on understanding and processing human language.
Steps:
    Data Acquisition
        Available: Table, Database, Less data
        Other Source: Public Datasets, Web Scraping, API, pdf, image, audio
        No Data
    Text Preparation
        Text Cleanup: HTML tags, emojis, spelling check
        Basic Preprocessing
            Basic: Tokenization(sentence & words)
            Optional: stop words removal, stemming, removing punctuation, lowercasing, language detection
        Advanced Preprocessing
            POS: Parts of Speech Tagging
            Parsing
    Feature Engineering: Extracting input columns from text, or converting text into numbers, for an ML model.
        N-grams
        TF-IDF
        BoW: Bag of Words
    Modeling
        Model Building
            Heuristic
            ML approach
            DL approach
            Cloud API
        Evaluation
            Intrinsic
            Extrinsic
    Deployment
        Deployment
            API (microservice)
            Chatbot
        Monitoring
        Model Update

Text Preprocessing

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the IMDB reviews. The file name uses the same capitalisation as the
# other loads of this dataset in the notebook, so the cell also works on
# case-sensitive filesystems.
df = pd.read_csv("IMDB Dataset.csv")

In [None]:
df.head()

1. Lower Casing

In [None]:
df["review"][3]

In [None]:
df["review"][3].lower()

In [None]:
df["review"] = df["review"].str.lower()

In [None]:
df["review"].head()

2. Removing HTML Tags

In [None]:
import re

In [None]:
text = """
<!DOCTYPE html>
<html>
<head>
    <title>Sample HTML Document</title>
</head>
<body>
    <h1>Welcome to our website!</h1>
    <p>This is a sample HTML document with some <a href="https://example.com">links</a> and <strong>formatted</strong> text.</p>
    <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        <li>Item 3</li>
    </ul>
    <p>Thank you for visiting!</p>
</body>
</html>
"""

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. ``.`` does not match a newline, so a tag that is
    split across lines is left untouched.
    """
    return re.sub("<.*?>", "", text)


remove_html(text)

In [None]:
print(remove_html(text))

In [None]:
df["review"].apply(remove_html)

3. Removing URLs

In [None]:
text = """
1. https://www.example.com/page?param1=value1&param2=value2#section
2. https://www.example.com/path/with/special%20characters/index.html
3. http://www.test-site.org/page?param=1&param=2&param=3
5. https://www.example.com/path/to/https://malicious-site.com/phishing
"""

In [None]:
def remove_url(text):
    """Return ``text`` with URLs deleted.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``.
    """
    url_regex = r"https?://\S+|www\.\S+"
    return re.sub(url_regex, "", text)


remove_url(text)

In [None]:
print(remove_url(text))

In [None]:
df["review"].apply(remove_url)

4. Removing Punctuation

In [None]:
import string

string.punctuation

In [None]:
exclude = string.punctuation

In [None]:
def remove_punc(text):
    """Return ``text`` without any character listed in the global ``exclude``."""
    kept_chars = (ch for ch in text if ch not in exclude)
    return "".join(kept_chars)


df["review"].apply(remove_punc)

In [None]:
def remove_punc1(text):
    """Return ``text`` without any character listed in the global ``exclude``.

    Uses a single ``str.translate`` pass; the original author measured this
    as roughly 18 times faster than ``remove_punc``.
    """
    # A translation table that maps a character to None deletes it.
    strip_table = str.maketrans(dict.fromkeys(exclude))
    return text.translate(strip_table)


df["review"].apply(remove_punc1)

In [None]:
from textblob import TextBlob

In [None]:
incorrect_text = "My name is Krishana and I stydy in SRM"
text = TextBlob(incorrect_text)
text.correct().string

5. Removing Stop Words:

In [None]:
from nltk.corpus import stopwords

In [None]:
stopwords = stopwords.words("english")

In [None]:
def remove_stopwords(text):
    """Return ``text`` with stop words dropped.

    ``text`` is split on whitespace and the surviving words are re-joined
    with single spaces. Words are compared case-insensitively against the
    global ``stopwords`` collection, so capitalised stop words such as
    "The" are removed as well.

    Stop words are skipped outright rather than replaced by empty strings,
    so the result contains no runs of extra spaces.
    """
    # Set membership is O(1); the global is a plain list of words.
    stop_set = set(stopwords)
    return " ".join(
        word for word in text.split() if word.lower() not in stop_set
    )

In [None]:
remove_stopwords("The quick brown fox jumps over the lazy dog.")
df["review"].apply(remove_stopwords)

6. Handling Emojis

In [None]:
import re

In [None]:
# Character class of emoji code points. Each range is one Unicode block (or a
# single code point), so ordinary text such as CJK, Hangul or kana is never
# matched.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicator (flag) letters
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"  # circled latin capital letter M
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji characters deleted.

    The previous pattern contained the range U+24C2..U+1F251, which spans
    most of the Basic Multilingual Plane and therefore also deleted Chinese,
    Japanese and Korean text among others. That range is replaced here by
    the two endpoints it was meant to cover (U+24C2 and U+1F200..U+1F251),
    plus the miscellaneous-symbols and supplemental-pictograph blocks.
    """
    return _EMOJI_PATTERN.sub("", text)

In [None]:
remove_emoji("Hi😌, I am Krishna😄")

In [None]:
import emoji
print(emoji.demojize("Hi 😌 , I am Krishna 😄"))

7. Tokenization: Splitting a text into smaller units.

In [None]:
 #   Using the split function

# Whitespace tokenization of the sample sentence defined on the line below
# (split the variable defined here, not the unrelated `text` left over from
# an earlier cell).
text1 = "My name is Krishna"
text1.split()

In [None]:
text2 = "Hi, My name is Krishna 😌"
text2.split()

In [None]:
# Regular Expression

import re

text = "Hi, My name is Krishna 😌"
# Raw string so that `\w` reaches the regex engine as written instead of
# being treated as an (invalid) string escape. Each token is a run of word
# characters; punctuation and the emoji are not word characters and are
# dropped.
tokens = re.findall(r"\w+", text)
tokens

In [None]:
# NLTK
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
text = "Hi, my name is Krishna! and I have a pdf in A.I"
word_tokenize(text)

In [None]:
# Spacy
import spacy

nlp = spacy.load("en_core_web_sm")

In [None]:
text = "SpaCy is an open-source natural language processing library that provides pre-trained models for various languages. It can tokenize text, perform part-of-speech tagging, named entity recognition, and much more. SpaCy's tokenization is fast and accurate, making it a popular choice for NLP tasks."
doc = nlp(text)
for token in doc:
    print(token)

In [None]:
# Stemming
from nltk.stem.porter import PorterStemmer

In [None]:
ps = PorterStemmer()


def stem_words(text):
    """Stem each whitespace-separated word of ``text`` with the global ``ps``.

    The stemmed words are re-joined with single spaces.
    """
    stemmed = map(ps.stem, text.split())
    return " ".join(stemmed)

In [None]:
sample = "1. Running 2. Flies 3. Jumps 4. Sleeping 5. Better 6. Best 7. Swimmed 8. Swam 9. Feet 10. Children"
stem_words(sample)

Text Representation

1. BagOfWords

In [None]:
df = pd.DataFrame(
    {
        "text": [
            "people watch campusx",
            "campusx watch campusx",
            "people write comment",
            "campusx write comment",
        ],
        "output": [1, 1, 0, 0],
    }
)

In [None]:
df

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

In [None]:
bow = cv.fit_transform(df["text"])

In [None]:
# Vocabulary
print(cv.vocabulary_)

In [None]:
# Show every sentence of the toy corpus next to its bag-of-words count
# vector. The sentences come from df["text"]; the global `text` holds an
# unrelated string from an earlier cell, so indexing it would print single
# characters.
for row, sentence in enumerate(df["text"]):
    print(sentence, " : ", bow[row].toarray())

In [None]:
cv.transform(["campusx watch and write comment of campusx"]).toarray()

2. Bag of N-Grams

In [None]:
cv = CountVectorizer(ngram_range=(1, 3))
bow = cv.fit_transform(df["text"])

In [None]:
print(cv.vocabulary_)

In [None]:
# Show every sentence of the toy corpus next to its n-gram count vector.
# The sentences come from df["text"]; the global `text` holds an unrelated
# string from an earlier cell, so indexing it would print single characters.
for row, sentence in enumerate(df["text"]):
    print(sentence, " : ", bow[row].toarray())

In [None]:
cv.transform(["campusx watch and write comment of campusx"]).toarray()

3. TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [None]:
TFIDF = tfidf.fit_transform(df["text"]).toarray()

In [None]:
print(tfidf.vocabulary_)

In [None]:
# Show every sentence of the toy corpus next to its TF-IDF vector.
# The sentences come from df["text"]; the global `text` holds an unrelated
# string from an earlier cell, so indexing it would print single characters.
for row, sentence in enumerate(df["text"]):
    print(sentence, " : ", TFIDF[row])

In [None]:
tfidf.transform(["campusx watch and write comment of campusx"]).toarray()

4. Word2Vec

In [None]:
import gensim
import os

In [None]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

# Read the whole corpus, then build one token list per sentence for Word2Vec.
# The `with` block closes the file as soon as it has been read.
# NOTE(review): the platform default encoding is used, as before — confirm
# the encoding of GOT.txt and pass `encoding=` explicitly if needed.
with open("GOT.txt") as f:
    text = f.read()

raw_sent = sent_tokenize(text)
story = [simple_preprocess(sent) for sent in raw_sent]

In [None]:
story

In [None]:
model = gensim.models.Word2Vec(window=10, min_count=2)

In [None]:
model.build_vocab(story)

In [None]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
model.wv.most_similar("daenerys")

In [None]:
model.wv.doesnt_match(["jon", "rikon", "robb", "arya", "sansa", "bran"])

In [None]:
model.wv["sansa"]

In [None]:
model.wv.similarity("arya", "sansa")

In [None]:
model.wv.similarity("tywin", "sansa")

Text Classification

1. Using BOW and n-grams

In [None]:
temp_df = pd.read_csv("IMDB Dataset.csv")
df = temp_df.iloc[:10000]
df.head()

In [None]:
# Preprocessing
# `df` is a slice of `temp_df`, so take an independent copy before modifying
# it instead of dropping duplicates in place on the slice.
df = df.drop_duplicates().copy()
df["review"] = df["review"].str.lower()
# `Series.apply` returns a new Series; assign it back so that each cleaning
# step actually takes effect on the data used for training.
df["review"] = df["review"].apply(remove_html)
df["review"] = df["review"].apply(remove_stopwords)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Separating X and y: X is the first column, kept as a one-column DataFrame.
X = df.iloc[:, :1]

# y is the "sentiment" column with its labels replaced by integer codes.
encoder = LabelEncoder()
y = encoder.fit_transform(df["sentiment"])

# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

In [None]:
# Applying BoW (bag of words)
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
# The vectorizer is fit on the training reviews only; the test reviews are
# then encoded with that same fitted vectorizer.
# NOTE(review): .toarray() turns both matrices into dense arrays, which may be
# large for this corpus — confirm memory use is acceptable.
X_train_bow = cv.fit_transform(X_train["review"]).toarray()
X_test_bow = cv.transform(X_test["review"]).toarray()

In [None]:
# Using ML Algorithm
# Two classifiers are trained on the same bag-of-words features and scored on
# the held-out test split. `y_pred` is reused, so after this cell it holds the
# random-forest predictions.

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_bow, y_train)
y_pred = gnb.predict(X_test_bow)
from sklearn.metrics import accuracy_score, confusion_matrix

print("NaiveBayes AccuracyScore", accuracy_score(y_test, y_pred))
print("NaiveBayes ConfusionMatrix \n", confusion_matrix(y_test, y_pred))

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Default hyper-parameters; no random_state is set here.
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
print("RandomForest AccuracyScore", accuracy_score(y_test, y_pred))
print("RandomForest ConfusionMatrix\n", confusion_matrix(y_test, y_pred))

2. Using Tf-Idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=50000)
X_train_tfidf = tfidf.fit_transform(X_train["review"])
X_test_tfidf = tfidf.transform(X_test["review"])

In [None]:
# Using ML Algo
# Random Forest
# Rebinds the global `rf` (and `y_pred`): from here on `rf` is the model
# trained on the TF-IDF features, not the bag-of-words one.
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)
print("RandomForest AccuracyScore", accuracy_score(y_test, y_pred))
print("RandomForest ConfusionMatrix\n", confusion_matrix(y_test, y_pred))

3. Using Word2Vec

Parts of Speech Tagging

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc1 = nlp("I will google about facebook")
doc2 = nlp("I left the room")
doc3 = nlp("I read books on history")

# For every token print its text, coarse part of speech, fine-grained tag and
# the explanation of that tag. A blank line separates the documents.
for position, parsed in enumerate((doc1, doc2, doc3)):
    if position:
        print()
    for word in parsed:
        print(word.text, "----->", word.pos_, word.tag_, spacy.explain(word.tag_))

In [None]:
from spacy import displacy

doc = nlp("The quick brown fox jumped over the lazy dog")
displacy.render(doc, style="dep", jupyter=True)

In [None]:
import joblib
# Persist the fitted CountVectorizer bound to `cv`.
# NOTE(review): if the cells ran top to bottom, the `rf` saved in the next
# cell was last fit on TF-IDF features, not on this vectorizer's output —
# confirm the two saved artifacts are meant to be used together.
joblib.dump(cv, 'count_vectorizer.pkl')

In [None]:
joblib.dump(rf, 'random_forest_model.pkl')

In [None]:
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise one review for vectorisation.

    Steps, in order: lower-case, strip HTML tags, strip URLs, strip
    punctuation, strip digits, then drop every word found in the
    module-level ``stop_words`` collection. The result is the remaining
    words joined by single spaces.

    Tags and URLs are replaced by a space rather than by nothing, so the
    words on either side of e.g. ``<br />`` are not glued together. URLs
    are matched with the same pattern as ``remove_url`` (``http://``,
    ``https://`` or ``www.`` prefixes).
    """
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)  # HTML tags -> word boundary
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)  # URLs -> word boundary
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\d+", "", text)  # digits are removed in place
    # split() also collapses the extra whitespace introduced above.
    return " ".join(word for word in text.split() if word not in stop_words)

def SentimentAnalysis(csv_path="IMDB Dataset.csv", nrows=10000):
    """Train and save a TF-IDF + random-forest sentiment classifier.

    Reads the first ``nrows`` rows of ``csv_path`` (columns ``review`` and
    ``sentiment``), drops duplicate rows, cleans the reviews with
    ``clean_text``, prints 10-fold cross-validated accuracy, then fits on
    all rows and writes the fitted vectorizer and model to
    ``TfidfVectorizer.pkl`` and ``random_forest_model.pkl``.

    The vectorizer and the classifier are cross-validated together as one
    pipeline, so TF-IDF statistics are learned from the training folds only
    and the reported accuracy is not inflated by the held-out reviews.

    Args:
        csv_path: Path of the CSV file to load.
        nrows: Number of rows to read from the start of the file.
    """
    # Local import: the file's import block does not bring Pipeline in.
    from sklearn.pipeline import Pipeline

    df = pd.read_csv(csv_path, nrows=nrows)
    df = df.drop_duplicates()

    reviews = df["review"].apply(clean_text)
    labels = LabelEncoder().fit_transform(df["sentiment"])

    pipeline = Pipeline(
        [
            ("tfidf", TfidfVectorizer(max_features=5000)),
            ("rf", RandomForestClassifier()),
        ]
    )

    cv_scores = cross_val_score(
        pipeline, reviews, labels, cv=10, scoring='accuracy'
    )

    print("Cross-Validation Scores:", cv_scores)
    print("Mean Accuracy:", cv_scores.mean())

    # Final fit on every row; the saved artifacts come from this fit.
    pipeline.fit(reviews, labels)

    joblib.dump(pipeline.named_steps["tfidf"], 'TfidfVectorizer.pkl')
    joblib.dump(pipeline.named_steps["rf"], 'random_forest_model.pkl')

SentimentAnalysis()