In [7]:
# !pip install xgboost
# !pip install pyspellchecker
# !pip install autocorrect
# !pip install emot

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
                                              0.0/6.8 MB ? eta -:--:--
                                              0.0/6.8 MB ? eta -:--:--
                                              0.1/6.8 MB 812.7 kB/s eta 0:00:09
                                              0.1/6.8 MB 1.2 MB/s eta 0:00:06
     -                                        0.3/6.8 MB 2.1 MB/s eta 0:00:04
     ---                                      0.7/6.8 MB 3.4 MB/s eta 0:00:02
     --------                                 1.4/6.8 MB 5.5 MB/s eta 0:00:01
     ---------                                1.6/6.8 MB 5.3 MB/s eta 0:00:01
     -----------                              1.9/6.8 MB 6.1 MB/s eta 0:00:01
     -------------                            2.3/6.8 MB 6.2 MB/s eta 0:00:01
     ---------------------                    3.6/6.8 MB 8.6 MB/s eta 0:00:01
     ---------------------                    3.6/6.8 MB 8.6 MB/s e

# **Import Reuired Libraries:**

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb


import csv
import re

import nltk
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from textblob import TextBlob
from spellchecker import SpellChecker
from autocorrect import Speller

import emot

import itertools

from nltk.stem import WordNetLemmatizer

In [2]:
# Set random seed for NumPy
np.random.seed(42)

# **Load Dataset:**

In [3]:
file_path = r"C:\Users\glows\Downloads\Github Projects\IMDB Dataset.csv"
chunk_size = 10000
chunks = []

try:
    reader = pd.read_csv(file_path, encoding='utf-8', engine='python', chunksize=chunk_size)
    for chunk in reader:
        try:
            chunks.append(chunk)
        except pd.errors.ParserError as e:
            print(f"ParserError while processing a chunk: {e}")
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")
    dataset = None

if chunks:
    dataset = pd.concat(chunks, ignore_index=True)
    print('Success')
else:
    dataset = None

if dataset is not None:
    print(f"DataFrame loaded successfully with {dataset.shape[0]} rows and {dataset.shape[1]} columns.")
else:
    print("Failed to load DataFrame.")

Success
DataFrame loaded successfully with 50000 rows and 2 columns.


In [4]:
dataset.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [9]:
# Specify the fraction of the dataset you want to use (e.g., 0.1 for 10%)
fraction = 0.005

# Create a subset of the dataset
dataset_subset = dataset.sample(frac=fraction, random_state=42)

# Reset the index of the subset DataFrame
dataset_subset = dataset_subset.reset_index(drop=True)

print(f"Size of Dataset: {dataset_subset.shape[0]} rows and {dataset_subset.shape[1]} columns.")

Size of Dataset: 250 rows and 2 columns.


# **Text Preprocessing:**
- Lower casing
- Removal of emojis
- Removal of emoticons
- Removal of URLs
- Removal of Emails
- Removal of HTML tags
- Removal of Non-Ascii Characters
- Removal of Punctuations & Special Characters
- Removal of Numbers
- Removal of Chat Words
- Removal of Stopwords
- Spelling Correction
- Lemmatization


### **Lowercase Text Function**

In [10]:
def lowercase_text(text):
    """
    Convert text to lowercase.

    Args:
    - text (str): The text to convert.

    Returns:
    - str: The text converted to lowercase.
    """
    return text.lower()

In [11]:
sample_text = dataset['review'][0]

print("Sample review text: \n")
sample_text

Sample review text: 



"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [12]:
sample_text = lowercase_text(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

### **Remove Emojis Function**

In [13]:
def remove_emojis(text):
    """
    Remove all emojis from the given text.

    Args:
    - text (str): The text to process.

    Returns:
    - str: The text with all emojis removed.
    """
    emot_obj = emot.core.emot()

    # Detect emojis in the text
    emoji_data = emot_obj.emoji(text)
    for emoji in emoji_data['value']:
        text = text.replace(emoji, '')

    # Remove any extra spaces that might have been left behind
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [14]:
sample_text = dataset['review'][13735]

print("Sample review text: \n")
sample_text

Sample review text: 



'I checked this movie out based on a favorable review on this page. It is slow moving and the payoff is a four star dud..The only mystery here is how Oscar® winner F. Murray Abraham got involved with such a lousy script!'

In [15]:
sample_text = remove_emojis(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



'I checked this movie out based on a favorable review on this page. It is slow moving and the payoff is a four star dud..The only mystery here is how Oscar winner F. Murray Abraham got involved with such a lousy script!'

### **Remove Emoticons Function**

In [16]:
def remove_emoticons(text):
    """
    Remove all emoticons from the given text.

    Args:
    - text (str): The text to process.

    Returns:
    - str: The text with all emoticons removed.
    """
    emot_obj = emot.core.emot()


    # Detect emoticons in the text
    emoticon_data = emot_obj.emoticons(text)
    for emoticon in emoticon_data['value']:
        text = text.replace(emoticon, '')

    # Remove any extra spaces that might have been left behind
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [17]:
sample_text = dataset['review'][45]

print("Sample review text: \n")
sample_text

Sample review text: 



"As a disclaimer, I've seen the movie 5-6 times in the last 15 years, and I only just saw the musical this week. This allowed me to judge the movie without being tainted by what was or wasn't in the musical (however, it tainted me when I watched the musical :) ) <br /><br />I actually believe Michael Douglas worked quite well in that role, along with Kasey. I think her 'Let me dance for you scene' is one of the best parts of the movie, a worthwhile addition compared to the musical. The dancers and singing in the movie are much superior to the musical, as well as the cast which is at least 10 times bigger (easier to do in the movie of course). The decors, lighting, dancing, and singing are also much superior in the movie, which should be expected, and was indeed delivered. <br /><br />The songs that were in common with the musical are better done in the movie, the new ones are quite good ones, and the whole movie just delivers more than the musical in my opinion, especially compared to 

In [18]:
sample_text = remove_emoticons(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"As a disclaimer, I've seen the movie 5-6 times in the last 15 years, and I only just saw the musical this week. This allowed me to judge the movie without being tainted by what was or wasn't in the musical (however, it tainted me when I watched the musical ) <br /><br />I actually believe Michael Douglas worked quite well in that role, along with Kasey. I think her 'Let me dance for you scene' is one of the best parts of the movie, a worthwhile addition compared to the musical. The dancers and singing in the movie are much superior to the musical, as well as the cast which is at least 10 times bigger (easier to do in the movie of course). The decors, lighting, dancing, and singing are also much superior in the movie, which should be expected, and was indeed delivered. <br /><br />The songs that were in common with the musical are better done in the movie, the new ones are quite good ones, and the whole movie just delivers more than the musical in my opinion, especially compared to a m

### **Remove URLs Function**

In [19]:
def remove_urls(text):
    """
    Remove URLs from text.

    Args:
    - text (str): The text to remove URLs from.

    Returns:
    - str: The text with URLs removed.
    """
    return re.sub(r'http[s]?://\S+', ' ', text)

In [20]:
sample_text = dataset['review'][907]

print("Sample review text: \n")
sample_text

Sample review text: 



"Following directly from where the story left off in part one, the second half which sets about telling the inevitable downfall and much more grim side of the man's legacy is exactly as such. In direct contrast to the first feature, part two represents a shift from Che the pride and glory of a revolutionised country, to Che\x97struggling liberator of a country to which he has no previous ties. The change of setting isn't just aesthetic; from the autumn and spring greys of the woodlands comes a change of tone and heart to the feature, replacing the optimism of the predecessor with a cynical, battered and bruised reality aligned to an all new struggle. Yet, as Che would go on to say himself\x97such a struggle is best told exactly as that\x97a struggle. While Part One certainly helped document that initial surge to power that the revolutionary guerrilla acquired through just that, Part Two takes a much more refined, callous and bleak segment of Che's life and ambition, and gives it an ass

In [21]:
sample_text = remove_urls(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"Following directly from where the story left off in part one, the second half which sets about telling the inevitable downfall and much more grim side of the man's legacy is exactly as such. In direct contrast to the first feature, part two represents a shift from Che the pride and glory of a revolutionised country, to Che\x97struggling liberator of a country to which he has no previous ties. The change of setting isn't just aesthetic; from the autumn and spring greys of the woodlands comes a change of tone and heart to the feature, replacing the optimism of the predecessor with a cynical, battered and bruised reality aligned to an all new struggle. Yet, as Che would go on to say himself\x97such a struggle is best told exactly as that\x97a struggle. While Part One certainly helped document that initial surge to power that the revolutionary guerrilla acquired through just that, Part Two takes a much more refined, callous and bleak segment of Che's life and ambition, and gives it an ass

### **Remove Emails Function**

In [22]:
def remove_emails(text):
    """
    Remove email addresses from the given text and return the cleaned text along with the list of removed emails.

    Args:
    - text (str): The text from which to remove email addresses.

    Returns:
    - tuple: A tuple containing the cleaned text and a list of the removed email addresses.
    """
    # Define the email pattern
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # Find all email addresses in the text
    email_addresses = re.findall(email_pattern, text)

    # Remove email addresses from the text
    return re.sub(email_pattern, '', text)

In [23]:
sample_text = dataset['review'][1281]

print("Sample review text: \n")
sample_text

Sample review text: 



"I like many others saw this as a child and I loved it and it horrified me up until adulthood, I have been trying to find this movie and even been searching for it to play again on TV someday, since it originally played on USA networks. Does Anyone know where to buy this movie, or does anyone have it and would be willing to make a copy for me? Also does anyone know if there is a chance for it to be played on TV again? Maybe all of us fans should write a station in hopes of them airing it again. I don't think they did a good job of promoting this movie in the past because no one really knows about, people only know of the Stepford wives and Stepford husband movies. No one is familiar with the fact that there was a children version. Maybe they should also do a re-make of it since they seem to be doing that a lot lately with a lot of my favorite old thriller/horror flicks. Well if anyone has any input Please I Beg Of You write me with information. Thanks Taira tcampo23@aol.com"

In [24]:
sample_text = remove_emails(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"I like many others saw this as a child and I loved it and it horrified me up until adulthood, I have been trying to find this movie and even been searching for it to play again on TV someday, since it originally played on USA networks. Does Anyone know where to buy this movie, or does anyone have it and would be willing to make a copy for me? Also does anyone know if there is a chance for it to be played on TV again? Maybe all of us fans should write a station in hopes of them airing it again. I don't think they did a good job of promoting this movie in the past because no one really knows about, people only know of the Stepford wives and Stepford husband movies. No one is familiar with the fact that there was a children version. Maybe they should also do a re-make of it since they seem to be doing that a lot lately with a lot of my favorite old thriller/horror flicks. Well if anyone has any input Please I Beg Of You write me with information. Thanks Taira "

### **Remove HTML Tags Function**

In [25]:
def remove_html_tags(text):
    """
    Remove HTML tags from the given text.

    Args:
    - text (str): The text containing HTML tags.

    Returns:
    - str: The text with HTML tags removed.
    """
    return re.sub('<[^<]+?>', ' ', text)

In [26]:
sample_text = dataset['review'][0]

print("Sample review text: \n")
sample_text

Sample review text: 



"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [27]:
sample_text = remove_html_tags(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.  The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.  It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.  I would say the main appeal of the show is due to the fact that it goes where other sh

### **Remove Non-Ascii Characters Function**

In [28]:
def remove_non_ascii(text):
    """
    Remove non-ASCII characters from the given text.

    Args:
    - text (str): The text to clean.

    Returns:
    - str: The text with non-ASCII characters removed.
    """
    return re.sub(r'[^\x00-\x7F]', ' ', text)

In [29]:
sample_text = dataset['review'][907]

print("Sample review text: \n")
sample_text

Sample review text: 



"Following directly from where the story left off in part one, the second half which sets about telling the inevitable downfall and much more grim side of the man's legacy is exactly as such. In direct contrast to the first feature, part two represents a shift from Che the pride and glory of a revolutionised country, to Che\x97struggling liberator of a country to which he has no previous ties. The change of setting isn't just aesthetic; from the autumn and spring greys of the woodlands comes a change of tone and heart to the feature, replacing the optimism of the predecessor with a cynical, battered and bruised reality aligned to an all new struggle. Yet, as Che would go on to say himself\x97such a struggle is best told exactly as that\x97a struggle. While Part One certainly helped document that initial surge to power that the revolutionary guerrilla acquired through just that, Part Two takes a much more refined, callous and bleak segment of Che's life and ambition, and gives it an ass

In [30]:
sample_text = remove_non_ascii(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"Following directly from where the story left off in part one, the second half which sets about telling the inevitable downfall and much more grim side of the man's legacy is exactly as such. In direct contrast to the first feature, part two represents a shift from Che the pride and glory of a revolutionised country, to Che struggling liberator of a country to which he has no previous ties. The change of setting isn't just aesthetic; from the autumn and spring greys of the woodlands comes a change of tone and heart to the feature, replacing the optimism of the predecessor with a cynical, battered and bruised reality aligned to an all new struggle. Yet, as Che would go on to say himself such a struggle is best told exactly as that a struggle. While Part One certainly helped document that initial surge to power that the revolutionary guerrilla acquired through just that, Part Two takes a much more refined, callous and bleak segment of Che's life and ambition, and gives it an assertive po

### **Remove Punctuations & Special Characters Function**

In [31]:
def remove_punctuation_and_special_characters(text):
    """
    Remove punctuation and special characters from the given text.

    This function replaces all characters in the input text that are
    not word characters (alphanumeric and underscores) or whitespace
    with a space.

    Args:
    - text (str): The text from which to remove punctuation and special characters.

    Returns:
    - str: The text with punctuation and special characters replaced by spaces.
    """
    return re.sub(r'[^\w\s]', ' ', text)

In [32]:
sample_text = dataset['review'][0]

print("Sample review text: \n")
sample_text

Sample review text: 



"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [33]:
sample_text = remove_punctuation_and_special_characters(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



'One of the other reviewers has mentioned that after watching just 1 Oz episode you ll be hooked  They are right  as this is exactly what happened with me  br    br   The first thing that struck me about Oz was its brutality and unflinching scenes of violence  which set in right from the word GO  Trust me  this is not a show for the faint hearted or timid  This show pulls no punches with regards to drugs  sex or violence  Its is hardcore  in the classic use of the word  br    br   It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary  It focuses mainly on Emerald City  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  Em City is home to many  Aryans  Muslims  gangstas  Latinos  Christians  Italians  Irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away  br    br   I would say the main appeal of the show is due to the fa

### **Remove Numbers Function**

In [34]:
def remove_numbers(text):
    """
    Remove numbers from text.

    Args:
    - text (str): The text to remove numbers from.

    Returns:
    - str: The text with numbers removed.
    """
    return re.sub(r'\d', ' ', text)

In [35]:
sample_text = dataset['review'][0]

print("Sample review text: \n")
sample_text

Sample review text: 



"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [36]:
sample_text = remove_numbers(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"One of the other reviewers has mentioned that after watching just   Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

### **Remove Chat Words Function**

In [37]:
url = "https://raw.githubusercontent.com/MFuchs1989/Datasets-and-Miscellaneous/main/datasets/NLP/Text%20Pre-Processing%20VII%20(Special%20Cases)/chat_expressions.csv"

try:
    chat_expressions = pd.read_csv(url, encoding='utf-8', engine='python')
    chat_expressions
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")
    chat_expressions = None

if chat_expressions is not None:
    print(f"DataFrame loaded successfully with {chat_expressions.shape[0]} rows and {chat_expressions.shape[1]} columns.")
else:
    print("Failed to load DataFrame.")

DataFrame loaded successfully with 67 rows and 2 columns.


In [38]:
chat_expressions

Unnamed: 0,Chat_Words,Chat_Words_Extended
0,AFAIK,As Far As I Know
1,AFK,Away From Keyboard
2,ASAP,As Soon As Possible
3,ATK,At The Keyboard
4,ATM,At The Moment
...,...,...
62,WB,Welcome Back
63,WTF,What The Fuck
64,WTG,Way To Go
65,WUF,Where Are You From


In [39]:
def create_chat_words_list(chat_expressions):
    """
    Create a list of chat words including both their original and lowercase versions.

    Args:
    - chat_expressions (dict): Dictionary containing chat words under the key 'Chat_Words'.

    Returns:
    - list: List of chat words including their lowercase versions.
    """
    chat_words = chat_expressions['Chat_Words']
    chat_words_list = set()

    for word in chat_words:
        chat_words_list.add(word)
        chat_words_list.add(word.lower())

    return list(chat_words_list)

In [40]:
chat_words_list = create_chat_words_list(chat_expressions)
print(chat_words_list)

['prt', 'afaik', 'prw', 'lol', 'IRL', 'gr8', 'FYI', 'irl', 'fwiw', 'wtf', 'brt', 'GR8', 'GG', 'M8', 'asl', 'IMO', 'gg', 'icq', 'gn', 'cya', 'BBL', 'b4', 'IC', 'IOW', 'L8R', 'SK8', 'PITA', 'rotflmao', 'imo', 'ATM', 'BFN', 'fyi', 'OMG', 'TTFN', 'nrn', 'WTF', 'STATS', 'cu', 'WUF', 'bak', 'OIC', 'PRT', 'kiss,"keep it simple, stupid"', 'ROFL', 'AFK', 'imho', 'FAQ', 'ic', 'pita', 'CYA', 'ILU', 'FWIW', 'B4', 'ATK', 'CUL8R', 'u', 'ASAP', 'gmta', 'BAK', 'roflol', 'LDR', 'wb', 'ICQ', 'asap', 'ldr', 'LOL', 'lmao', 'm8', 'GMTA', 'G9', 'brb', 'btw', 'stats', 'faq', 'ASL', 'CU', 'rofl', 'thx', 'U', 'fc', 'ROTFLMAO', 'TTYL', 'BTW', 'afk', 'A3', 'ttyl', 'WB', 'a3', 'wuf', 'BRB', 'KISS,"Keep It Simple, Stupid"', 'g9', 'atk', 'LTNS', 'b4n', 'U2', 'u4e', 'BBS', 'w8', 'MTE', 'ltns', 'bbl', 'ttfn', 'bfn', 'omg', 'THX', 'IMHO', 'NRN', 'ilu', 'B4N', 'BRT', 'oic', 'WTG', 'bbs', 'FC', 'GN', 'u2', 'wtg', 'GAL', 'AFAIK', 'mte', 'ROFLOL', 'sk8', 'U4E', 'LMAO', 'gal', 'atm', 'W8', 'cul8r', 'l8r', 'PRW', 'iow']


In [41]:
def remove_chat_words(text):
    """
    Replace chat words in the text with spaces.

    Args:
    - text (str): The input text.
    - chat_words_list (list): List of chat words to be removed.

    Returns:
    - str: The text with chat words replaced by spaces.
    """
    words = text.split()
    cleaned_words = [w if w not in chat_words_list else ' ' for w in words]

    return ' '.join(cleaned_words)

In [42]:
sample_text = dataset['review'][12]

print("Sample review text: \n")
sample_text

Sample review text: 



'So im not a big fan of Boll\'s work but then again not many are. I enjoyed his movie Postal (maybe im the only one). Boll apparently bought the rights to use Far Cry long ago even before the game itself was even finsished. <br /><br />People who have enjoyed killing mercs and infiltrating secret research labs located on a tropical island should be warned, that this is not Far Cry... This is something Mr Boll have schemed together along with his legion of schmucks.. Feeling loneley on the set Mr Boll invites three of his countrymen to play with. These players go by the names of Til Schweiger, Udo Kier and Ralf Moeller.<br /><br />Three names that actually have made them selfs pretty big in the movie biz. So the tale goes like this, Jack Carver played by Til Schweiger (yes Carver is German all hail the bratwurst eating dudes!!) However I find that Tils acting in this movie is pretty badass.. People have complained about how he\'s not really staying true to the whole Carver agenda but we

In [43]:
sample_text = remove_chat_words(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



'So im not a big fan of Boll\'s work but then again not many are. I enjoyed his movie Postal (maybe im the only one). Boll apparently bought the rights to use Far Cry long ago even before the game itself was even finsished. <br /><br />People who have enjoyed killing mercs and infiltrating secret research labs located on a tropical island should be warned, that this is not Far Cry... This is something Mr Boll have schemed together along with his legion of schmucks.. Feeling loneley on the set Mr Boll invites three of his countrymen to play with. These players go by the names of Til Schweiger, Udo Kier and Ralf Moeller.<br /><br />Three names that actually have made them selfs pretty big in the movie biz. So the tale goes like this, Jack Carver played by Til Schweiger (yes Carver is German all hail the bratwurst eating dudes!!) However I find that Tils acting in this movie is pretty badass.. People have complained about how he\'s not really staying true to the whole Carver agenda but we

### **Remove Extra Spaces Function**

In [44]:
def remove_extra_spaces(text):
    """
    Remove extra spaces from text.

    Args:
    - text (str): The text to remove extra spaces from.

    Returns:
    - str: The text with extra spaces removed.
    """
    return re.sub(r'\s+', ' ', text)

In [45]:
sample_text = dataset['review'][0]

print("Sample review text: \n")
sample_text

Sample review text: 



"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [46]:
sample_text = remove_extra_spaces(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

### **Remove Stop Words Function**

In [47]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\glows\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
print("List of Stop Words: \n")

", ".join(stopwords.words('english'))

List of Stop Words: 



"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [49]:
def remove_stopwords(text):
    """
    Remove stop words from the text.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with stop words removed.
    """
    words = text.split()
    cleaned_words = [w for w in words if w.lower() not in stop_words]
    return ' '.join(cleaned_words)

In [50]:
sample_text = dataset['review'][0]

print("Sample review text: \n")
sample_text

Sample review text: 



"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [51]:
sample_text = remove_stopwords(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"One reviewers mentioned watching 1 Oz episode hooked. right, exactly happened me.<br /><br />The first thing struck Oz brutality unflinching scenes violence, set right word GO. Trust me, show faint hearted timid. show pulls punches regards drugs, sex violence. hardcore, classic use word.<br /><br />It called OZ nickname given Oswald Maximum Security State Penitentary. focuses mainly Emerald City, experimental section prison cells glass fronts face inwards, privacy high agenda. Em City home many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish more....so scuffles, death stares, dodgy dealings shady agreements never far away.<br /><br />I would say main appeal show due fact goes shows dare. Forget pretty pictures painted mainstream audiences, forget charm, forget romance...OZ mess around. first episode ever saw struck nasty surreal, say ready it, watched more, developed taste Oz, got accustomed high levels graphic violence. violence, injustice (crooked guards who'll sold

### **Spelling Correction Function**

In [52]:
# Initialize the spell checkers
spell_checker = SpellChecker()
spell_autocorrect = Speller(lang='en')

In [53]:
def spell_correction(text):
    """
    Perform spell check on the given text and return the corrected text along with the list of incorrect words.

    Args:
    - text (str): The text to spell check.

    Returns:
    - tuple: A tuple containing the corrected text and a list of incorrect words.
    """
    
    if not text or text.strip() == '':
        return text

    # Process a text string such that no character appears more than twice consecutively
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))

    # Tokenize the standardized text
    words = text.split()

    # Find misspelled words
    misspelled = spell_checker.unknown(words)

    # Correct misspelled words 1
    corrected_words = [spell_checker.correction(word) if word in misspelled else word for word in words]

    # Correct misspelled words 2
    corrected_words = [word if word is not None else word for word in corrected_words]
    corrected_words = [spell_autocorrect.autocorrect_word(word) if word is not None else "" for word in corrected_words]
    corrected_text = ' '.join(corrected_words)

    return corrected_text

In [54]:
sample_text = "Thiss isss a smaple textt withh severall typoos andd incorrcet speling. Somee words have too manyy repeeatedd leetters. It shoulldd be corected byy the funtion."

print("Sample review text: \n")
sample_text

Sample review text: 



'Thiss isss a smaple textt withh severall typoos andd incorrcet speling. Somee words have too manyy repeeatedd leetters. It shoulldd be corected byy the funtion.'

In [55]:
sample_text = spell_correction(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



'This is a sample text with several typos and incorrect spelling Some words have too many repeated letters It should be corrected by the function'

### **Lemmatization Function**

In [56]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\glows\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [57]:
def lemmatize_words(text):

    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)

    return lemmatized_text

In [58]:
sample_text = dataset['review'][0]
# sample_text = "The striped bats are hanging on their feet for best"

print("Sample review text: \n")
sample_text

Sample review text: 



"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [59]:
sample_text = lemmatize_words(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



"One of the other reviewer ha mentioned that after watching just 1 Oz episode you 'll be hooked . They are right , a this is exactly what happened with me. < br / > < br / > The first thing that struck me about Oz wa it brutality and unflinching scene of violence , which set in right from the word GO . Trust me , this is not a show for the faint hearted or timid . This show pull no punch with regard to drug , sex or violence . Its is hardcore , in the classic use of the word. < br / > < br / > It is called OZ a that is the nickname given to the Oswald Maximum Security State Penitentary . It focus mainly on Emerald City , an experimental section of the prison where all the cell have glass front and face inwards , so privacy is not high on the agenda . Em City is home to many .. Aryans , Muslims , gangsta , Latinos , Christians , Italians , Irish and more .... so scuffle , death stare , dodgy dealing and shady agreement are never far away. < br / > < br / > I would say the main appeal of

### **Text Preprocessing Function**

In [60]:
def text_preprocessing(text):
    """
    Perform a series of text preprocessing steps on the input text.

    The function applies the following steps:
    1. Lowercase the text.
    2. Remove emojis.
    3. Remove emoticons.
    4. Remove URLs.
    5. Remove email addresses.
    6. Remove HTML tags.
    7. Remove non-ASCII characters.
    8. Remove punctuation and special characters.
    9. Remove numbers.
    10. Remove chat words.
    11. Remove extra spaces.
    12. Remove stopwords.
    13. Correct spelling.
    14. Lemmatize words.

    Args:
    - text (str): The text to preprocess.

    Returns:
    - str: The preprocessed text.
    """

    # Convert text to lowercase
    text = lowercase_text(text)

    # Remove emojis
    text = remove_emojis(text)

    # Remove emoticons
    text = remove_emoticons(text)

    # Remove URLs
    text = remove_urls(text)

    # Remove email addresses
    text = remove_emails(text)

    # Remove HTML tags
    text = remove_html_tags(text)

    # Remove non-ASCII characters
    text = remove_non_ascii(text)

    # Remove punctuation and special characters
    text = remove_punctuation_and_special_characters(text)

    # Remove numbers
    text = remove_numbers(text)

    # Remove chat words
    text = remove_chat_words(text)

    # Remove extra spaces
    text = remove_extra_spaces(text)

    # Remove stopwords
    text = remove_stopwords(text)

    # Correct spelling
    text = spell_correction(text)

    # Lemmatize words
    text = lemmatize_words(text)
    
    return text
    

In [61]:
sample_text = dataset['review'][1]
# sample_text = "The striped bats are hanging on their feet for best"

print("Sample review text: \n")
sample_text

Sample review text: 



'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [62]:
sample_text = text_preprocessing(sample_text)

print("Cleaned sample review text: \n")
sample_text

Cleaned sample review text: 



'wonderful little production filming technique assuming old time be fashion give conforming sometimes discomforting sense realism entire piece actor extremely well chosen sheen got polar voice pat truly see seamless editing guided reference willie diary entry well worth watching terrific written performed piece masterful production one great master comedy life realism really come home little thing fantasy guard rather use traditional dream technique remains solid disappears play knowledge sens particularly scene concerning often set particularly flat mural decorating every surface terribly well done'

In [7]:
# Apply the text_preprocessing function to the X column
X_cleaned = dataset_subset['review'].apply(text_preprocessing)

# Print a sample of the cleaned review text
print("Cleaned sample review text: \n", X_cleaned.head(1))

NameError: name 'text_preprocessing' is not defined

In [None]:
X = X_cleaned
y = dataset_subset['sentiment']

In [None]:
#     # # Standardize the text first
#     text = lowercase_text(text)
#     text = remove_html_tags(text)
#     text = remove_urls(text)
#     text = remove_emails(text)
#     text = remove_non_ascii(text)
#     text = remove_punctuation_and_special_characters(text)
#     text = remove_numbers(text)
#     # text = remove_quotation_marks(text)
#     text = remove_extra_spaces(text)

# Lower casing
# Removal of emojis
# Removal of emoticons
# Removal of URLs
# Removal of Emails
# Removal of HTML tags
# Removal of Non-Ascii Characters
# Removal of Punctuations & Special Characters
# Removal of Numbers
# Removal of Chat Words
# Removal of Stopwords
# Spelling Correction