### Import Libraries

In [34]:
import spacy
import enchant
import re
from spellchecker import SpellChecker
from autocorrect import Speller
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
from transformers import GPT2LMHeadModel, GPT2Tokenizer

**1. Text Preprocessing**

In [2]:
# Load the large English spaCy pipeline; later cells call `nlp(...)` on their own texts.
nlp = spacy.load("en_core_web_lg")

# Sample sentence to split into tokens.
text = "The cat is sitting on the mat."

# Run the pipeline over the sentence to obtain a Doc (an iterable of tokens).
doc = nlp(text)

# Collect the surface text of every token, in document order.
tokens = list(tok.text for tok in doc)

print(tokens)

['The', 'cat', 'is', 'sitting', 'on', 'the', 'mat', '.']


**2. Lemmatization**

In [3]:
# Passage whose tokens will be replaced by their lemmas.
text = "Natural Language Processing (NLP) is a field of artificial intelligence (AI) that focuses on the interaction between computers and human language. It involves the study and development of computational models and algorithms to enable computers to understand, interpret, and generate human language."

# Parse the passage with the spaCy pipeline.
doc = nlp(text)

# Take each token's lemma and stitch the lemmas together with single spaces
# (punctuation tokens are included, so they end up space-separated too).
lemmatized_word = " ".join(tok.lemma_ for tok in doc)
print(lemmatized_word)

Natural Language Processing ( NLP ) be a field of artificial intelligence ( AI ) that focus on the interaction between computer and human language . it involve the study and development of computational model and algorithm to enable computer to understand , interpret , and generate human language .


**3. Lowercasing**

In [4]:
# Lowercase the raw passage. `text` is whatever string was last bound to that
# name -- when the notebook is run top to bottom, the lemmatization passage.
lowercased_text = text.lower()
print(lowercased_text)

natural language processing (nlp) is a field of artificial intelligence (ai) that focuses on the interaction between computers and human language. it involves the study and development of computational models and algorithms to enable computers to understand, interpret, and generate human language.


**4. Stopword Removal**

In [5]:
# Keep the text of every token that spaCy does not flag as a stop word.
# `doc` is the most recently parsed Doc; punctuation tokens are not stop words,
# so they remain in the result.
filtered_tokens = [tok.text for tok in doc if not tok.is_stop]
print(filtered_tokens)

['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'field', 'artificial', 'intelligence', '(', 'AI', ')', 'focuses', 'interaction', 'computers', 'human', 'language', '.', 'involves', 'study', 'development', 'computational', 'models', 'algorithms', 'enable', 'computers', 'understand', ',', 'interpret', ',', 'generate', 'human', 'language', '.']


**5. Removing special characters and numbers**

In [6]:
# Sample sentence mixing words, digits and symbols.
special = "This is an example sentence! It includes 123 special characters *&^% and numbers 456."

# Parse the sentence with the spaCy pipeline.
doc = nlp(special)

# Keep a token only when it is purely alphabetic (`is_alpha`) or whitespace
# (`is_space`); digit, punctuation and symbol tokens are discarded.
clean_tokens = [tok.text for tok in doc if (tok.is_alpha or tok.is_space)]

# Rebuild one string from the surviving tokens, separated by single spaces.
clean_text = ' '.join(clean_tokens)
print(clean_text)

This is an example sentence It includes special characters and numbers


**6. Handling spelling errors and abbreviations**

In [7]:
# Deliberately misspelled sample text used as input for the spelling-correction step.
paragraph = "Thiss is a paragraff containing mispelled wrds. The sciense of NLP is an importannt feeld of AI. It involves prosessing, underrstanding, and generting human language text. NLP technolojies have a wied range of apllications, includding machine transletion, sentiment anallisis, and informasion retrieval. Machene Learneng is a powerful tool that can bee used to extract insiights from large amouts of data. It is widly used in varius industries such as healthcare, fainance, and marketting. The potentshal of ML is enormus, and it continus to evolv and advanse. With contnued reserch and innvation, we can unlok new possiblities and advans our understatding of the world."
print(paragraph)

Thiss is a paragraff containing mispelled wrds. The sciense of NLP is an importannt feeld of AI. It involves prosessing, underrstanding, and generting human language text. NLP technolojies have a wied range of apllications, includding machine transletion, sentiment anallisis, and informasion retrieval. Machene Learneng is a powerful tool that can bee used to extract insiights from large amouts of data. It is widly used in varius industries such as healthcare, fainance, and marketting. The potentshal of ML is enormus, and it continus to evolv and advanse. With contnued reserch and innvation, we can unlok new possiblities and advans our understatding of the world.


In [8]:
def correct_spelling_and_abbreviations(text):
    """Return `text` with misspelled words replaced by autocorrect suggestions.

    The text is split on whitespace and every chunk is handled on its own:

    * chunks that are entirely upper case and longer than one character are
      treated as abbreviations and passed through untouched;
    * chunks accepted by the enchant ``en_US`` dictionary are kept as they are;
    * any other chunk is replaced by whatever the autocorrect ``Speller``
      returns for it.

    The processed chunks are re-joined with single spaces, so the original
    whitespace layout (newlines, repeated spaces) is not preserved.
    """
    speller = Speller(lang='en')
    en_dict = enchant.Dict("en_US")

    def fix(word):
        # Abbreviation heuristic: all-caps chunks are left alone.
        if word.isupper() and len(word) > 1:
            return word
        # Only consult the speller when the dictionary rejects the chunk.
        if en_dict.check(word):
            return word
        return speller(word)

    return " ".join(fix(word) for word in text.split())

In [9]:
# Run the corrector over the misspelled sample. Each whitespace-separated word
# is checked on its own, without sentence context, so a slip that is itself a
# valid word (e.g. "bee") is left unchanged.
corrected_paragraph = correct_spelling_and_abbreviations(paragraph)
print(corrected_paragraph)

This is a paragraph containing misspelled words. The science of NLP is an important field of AI. It involves processing, understanding, and generating human language text. NLP technologies have a died range of applications, including machine translation, sentiment analysis, and information retrieval. Machine Learning is a powerful tool that can bee used to extract insights from large amounts of data. It is widely used in various industries such as healthcare, finance, and marketing. The potential of ML is enormous, and it continue to evolve and advance. With continued research and innovation, we can unlock new possibilities and advance our understanding of the world.


**7. Removing HTML tags or other markup**

In [10]:
# Sample text sprinkled with inline HTML markup, used as input for the
# tag-removal step. NOTE: this rebinds `paragraph`, replacing the misspelled
# sample from the spelling section.
paragraph = """
NLP, or Natural Language Processing, is a field of AI that focuses on the interaction between computers and human language. It involves analyzing, understanding, and generating natural language data. <strong>Text classification</strong>, <em>sentiment analysis</em>, and <a href="https://example.com">machine translation</a> are some common NLP tasks. NLP techniques utilize <b>machine learning</b> algorithms to process and interpret language patterns. <i>Named entity recognition</i>, <u>part-of-speech tagging</u>, and <code>tokenization</code> are fundamental NLP tasks. NLP plays a significant role in various applications such as <span class="highlight">chatbots</span>, <mark>voice assistants</mark>, and <del>information extraction</del>.</p>
<p>Python provides powerful libraries like <sup>NLTK</sup> (Natural Language Toolkit) and <abbr title="spaCy">spaCy</abbr> for NLP tasks. When working with text data, it's essential to remove HTML tags that might be present. Removing HTML tags can be done using regular expressions in Python.</p>
"""
print(paragraph)


NLP, or Natural Language Processing, is a field of AI that focuses on the interaction between computers and human language. It involves analyzing, understanding, and generating natural language data. <strong>Text classification</strong>, <em>sentiment analysis</em>, and <a href="https://example.com">machine translation</a> are some common NLP tasks. NLP techniques utilize <b>machine learning</b> algorithms to process and interpret language patterns. <i>Named entity recognition</i>, <u>part-of-speech tagging</u>, and <code>tokenization</code> are fundamental NLP tasks. NLP plays a significant role in various applications such as <span class="highlight">chatbots</span>, <mark>voice assistants</mark>, and <del>information extraction</del>.</p>
<p>Python provides powerful libraries like <sup>NLTK</sup> (Natural Language Toolkit) and <abbr title="spaCy">spaCy</abbr> for NLP tasks. When working with text data, it's essential to remove HTML tags that might be present. Removing HTML tags can 

In [11]:
def remove_html_tags(text):
    """Strip HTML/XML-style tags from `text`, keeping the text between them.

    Everything from a ``<`` up to the next ``>`` is removed. The character
    class ``[^>]`` also matches newlines, so a tag whose attributes wrap onto
    the following line is removed as a whole (``.`` would stop at the line
    break and leave the tag in place).

    This is a lightweight regex clean-up, not an HTML parser: character
    entities such as ``&amp;`` are left untouched, and the text content of
    elements (including ``<script>``/``<style>``) is kept.

    Args:
        text: string that may contain markup.

    Returns:
        The input string with every ``<...>`` span deleted.
    """
    return re.sub(r'<[^>]*>', '', text)

In [12]:
# Strip the markup from the HTML sample with the regex-based helper.
# NOTE: this rebinds `clean_text`, which the special-characters section also assigns.
clean_text = remove_html_tags(paragraph)
print(clean_text)


NLP, or Natural Language Processing, is a field of AI that focuses on the interaction between computers and human language. It involves analyzing, understanding, and generating natural language data. Text classification, sentiment analysis, and machine translation are some common NLP tasks. NLP techniques utilize machine learning algorithms to process and interpret language patterns. Named entity recognition, part-of-speech tagging, and tokenization are fundamental NLP tasks. NLP plays a significant role in various applications such as chatbots, voice assistants, and information extraction.
Python provides powerful libraries like NLTK (Natural Language Toolkit) and spaCy for NLP tasks. When working with text data, it's essential to remove HTML tags that might be present. Removing HTML tags can be done using regular expressions in Python.



**8. Parts of Speech Tagging**

In [13]:
# Sentence to annotate with part-of-speech tags.
sentence = "POS tagging is an important task in NLP."

# Parse the sentence with the spaCy pipeline.
doc = nlp(sentence)

# Print one "<token> - <coarse POS tag>" line per token.
for token in doc:
    print(f"{token.text} - {token.pos_}")

POS - NOUN
tagging - NOUN
is - AUX
an - DET
important - ADJ
task - NOUN
in - ADP
NLP - PROPN
. - PUNCT


**9. Named Entity Recognition**

In [14]:
# Sample passage mentioning people, places, organisations, dates, times, money
# and quantities, used as input for named entity recognition.
text = "Barack Obama, a well-known politician, was born in Honolulu, Hawaii. He served as the 44th President of the United States of America. The White House, located in Washington, D.C., is the official residence and workplace of the President. Apple Inc. is a multinational technology company based in Cupertino, California. It designs and manufactures various electronic devices, including the iPhone and Macbook. The Eiffel Tower, a famous landmark in Paris, France, attracts millions of tourists every year. The Olympic Games, held every four years, bring together athletes from different nations. 'Harry Potter and the Sorcerer's Stone' is a popular fantasy novel written by J.K. Rowling. The Constitution of the United States is a fundamental law that governs the country. English is a widely spoken language around the world. The event will take place on July 15th, 2022. The meeting is scheduled for 9:30 AM. The price of the product is $99.99. The distance between the two cities is 200 kilometers. This is the first edition of the book. The team won by a score of 3-1."
print(text)

Barack Obama, a well-known politician, was born in Honolulu, Hawaii. He served as the 44th President of the United States of America. The White House, located in Washington, D.C., is the official residence and workplace of the President. Apple Inc. is a multinational technology company based in Cupertino, California. It designs and manufactures various electronic devices, including the iPhone and Macbook. The Eiffel Tower, a famous landmark in Paris, France, attracts millions of tourists every year. The Olympic Games, held every four years, bring together athletes from different nations. 'Harry Potter and the Sorcerer's Stone' is a popular fantasy novel written by J.K. Rowling. The Constitution of the United States is a fundamental law that governs the country. English is a widely spoken language around the world. The event will take place on July 15th, 2022. The meeting is scheduled for 9:30 AM. The price of the product is $99.99. The distance between the two cities is 200 kilometers.

In [15]:
# Parse the passage; the pipeline's entity recogniser fills `doc.ents`.
doc = nlp(text)

# Print each recognised entity span followed by its label.
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Barack Obama PERSON
Honolulu GPE
Hawaii GPE
44th ORDINAL
the United States of America GPE
The White House ORG
Washington GPE
D.C. GPE
Apple Inc. ORG
Cupertino GPE
California GPE
Macbook PRODUCT
The Eiffel Tower FAC
Paris GPE
France GPE
millions CARDINAL
The Olympic Games EVENT
every four years DATE
Harry Potter PERSON
the Sorcerer's Stone' ORG
J.K. Rowling PERSON
the United States GPE
English LANGUAGE
July 15th, 2022 DATE
9:30 AM TIME
99.99 MONEY
two CARDINAL
200 kilometers QUANTITY
first ORDINAL
3 CARDINAL


**10. Sentiment Analysis**

In [16]:
# Read the review dump from disk.
df = pd.read_csv("customer_reviews.csv")

# Split every entry on "|" and keep the second segment (index 1);
# entries that contain no "|" have no second segment and become missing values.
df["reviews"] = df["reviews"].str.split("|").str[1]
df.head(1)

Unnamed: 0,reviews
0,The ground staff were not helpful. Felt like...


In [17]:
# Function to categorize polarity
def categorize_polarity(polarity, threshold=0.05):
    """Map a polarity score to 'Positive', 'Negative' or 'Neutral'.

    Args:
        polarity: numeric sentiment polarity score (in this notebook, the
            TextBlob polarity of a review).
        threshold: half-width of the neutral band around zero. Scores strictly
            above ``threshold`` are positive, scores strictly below
            ``-threshold`` are negative. Defaults to 0.05.

    Returns:
        One of the strings 'Positive', 'Negative', 'Neutral'.
    """
    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    # Scores inside the band (boundaries included) are neutral; so is NaN,
    # because both comparisons above evaluate to False for it.
    return 'Neutral'

# Function to categorize subjectivity
def categorize_subjectivity(subjectivity, high=0.7, low=0.3):
    """Map a subjectivity score to a three-level label.

    Args:
        subjectivity: numeric subjectivity score (in this notebook, the
            TextBlob subjectivity of a review).
        high: scores strictly above this value are 'Highly Subjective'.
            Defaults to 0.7.
        low: scores strictly above this value (and not above ``high``) are
            'Subjective'. Defaults to 0.3.

    Returns:
        'Highly Subjective', 'Subjective' or 'Objective'.
    """
    if subjectivity > high:
        return 'Highly Subjective'
    if subjectivity > low:
        return 'Subjective'
    # Everything at or below `low` is objective; NaN also lands here because
    # both comparisons above evaluate to False for it.
    return 'Objective'

In [18]:
# Apply sentiment analysis to the text column
df['polarity'] = df["reviews"].apply(lambda x: TextBlob(x).sentiment.polarity)
df['subjectivity'] = df["reviews"].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [19]:
# Categorize polarity and subjectivity
df['polarity_category'] = df['polarity'].apply(categorize_polarity)
df['subjectivity_category'] = df['subjectivity'].apply(categorize_subjectivity)

In [20]:
# Count the reviews per polarity label and present the result as a two-column
# table: the label ("Polarity") and the number of reviews carrying it ("Reviews").
polarity_counts = (
    df["polarity_category"]
    .value_counts()
    .rename_axis("Polarity")
    .reset_index(name="Reviews")
)
polarity_counts

Unnamed: 0,Polarity,Reviews
0,Positive,551
1,Neutral,230
2,Negative,219


**11. Text Classification**

In [23]:
X = df['reviews']  # Textual data

# Extract features from text using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

# Perform anomaly detection using Isolation Forest. No labels are involved:
# reviews are flagged purely because their TF-IDF vectors look unusual.
# `contamination` is the share of rows to flag (adjust as needed);
# `random_state` is fixed so the flagged rows are the same on every run.
isolation_forest = IsolationForest(contamination=0.01, random_state=42)
isolation_forest.fit(X_vec)

# Predict the outliers (spam-like texts): 1 = inlier, -1 = outlier
outlier_preds = isolation_forest.predict(X_vec)

# Add readable labels to the dataset. The column is assigned in one step rather
# than patched afterwards with a chained `df["spam"].replace(..., inplace=True)`,
# which operates on a temporary Series and does not update `df` under pandas
# Copy-on-Write.
df['spam'] = pd.Series(outlier_preds, index=df.index).map({1: "no spam", -1: "spam"})

**12. Language Modeling**

In [48]:
# GPT2LMHeadModel and GPT2Tokenizer come from the notebook's import cell.

# Load pre-trained GPT model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('gpt2')

# Example prompt
prompt = "Two years ago"

# Tokenize the prompt. Calling the tokenizer (rather than `encode`) returns the
# attention mask alongside the token ids; no padding is requested because there
# is only a single sequence.
encoded_prompt = tokenizer(prompt, truncation=True, return_tensors='pt')
input_ids = encoded_prompt['input_ids']

# Generate text using the GPT model. The attention mask is passed explicitly:
# the pad token is the same as the EOS token, so `generate` cannot reliably
# infer from the ids alone which positions are padding.
output = model.generate(
    input_ids,
    attention_mask=encoded_prompt['attention_mask'],
    max_length=50,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Generated text: {generated_text}")

Generated text: Two years ago, the U.S. government began to investigate the use of drones in Yemen. The U.S. government has been accused of using drones to kill civilians in Yemen, and the U.S. has been accused of using drones


**13. Word Embedding**

In [51]:
# Reuse the `en_core_web_lg` pipeline already loaded as `nlp` in the text
# preprocessing section instead of loading the model a second time.
# `nlp.pipe` processes the reviews as a batched stream, and the comprehension
# keeps its loop variable local, so the notebook-level `text` and `doc`
# variables from earlier sections are not overwritten.
embeddings = [review_doc.vector for review_doc in nlp.pipe(df['reviews'])]

# One document vector per review, stored alongside the other derived columns.
df['embeddings'] = embeddings
df.head(3)

Unnamed: 0,reviews,polarity,subjectivity,polarity_category,subjectivity_category,spam,embeddings
0,The ground staff were not helpful. Felt like...,-0.216667,0.266667,Negative,Objective,no spam,"[-2.1799493, 1.78598, -2.9814925, -0.18267782,..."
1,Second time BA Premium Economy in a newer ai...,0.388106,0.622727,Positive,Subjective,no spam,"[-2.024629, -0.37910125, -1.6072346, 0.2592054..."
2,They changed our Flights from Brussels to Lo...,-0.119583,0.480833,Negative,Subjective,no spam,"[-1.2155534, 1.6921389, -3.7904077, -0.6767871..."
