In [1]:
import pandas as pd

In [16]:
data_path = "IMDB Dataset.csv"

In [15]:
import csv


output_file = "cleaned_file.csv"

with open(data_path, "r", newline="", encoding="utf-8") as infile, \
     open(output_file, "w", newline="", encoding="utf-8") as outfile:

    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    for i, row in enumerate(reader, start=1):
        try:
            writer.writerow(row)
        except csv.Error as e:
            print(f"Skipping malformed row at line {i}: {e}")


In [20]:
data_path = "cleaned_file.csv"

In [22]:
df = pd.read_csv(data_path)
df.shape

(50000, 2)

In [25]:
df= df.head(200) #only take first 200 review
df.shape

(200, 2)

# **Lower case preprocessing**

In [28]:
df["review"][199]

"The film quickly gets to a major chase scene with ever increasing destruction. The first really bad thing is the guy hijacking Steven Seagal would have been beaten to pulp by Seagal's driving, but that probably would have ended the whole premise for the movie.<br /><br />It seems like they decided to make all kinds of changes in the movie plot, so just plan to enjoy the action, and do not expect a coherent plot. Turn any sense of logic you may have, it will reduce your chance of getting a headache.<br /><br />I does give me some hope that Steven Seagal is trying to move back towards the type of characters he portrayed in his more popular movies."

In [29]:
df["review"]= df["review"].str.lower()

In [30]:
df["review"][199]

"the film quickly gets to a major chase scene with ever increasing destruction. the first really bad thing is the guy hijacking steven seagal would have been beaten to pulp by seagal's driving, but that probably would have ended the whole premise for the movie.<br /><br />it seems like they decided to make all kinds of changes in the movie plot, so just plan to enjoy the action, and do not expect a coherent plot. turn any sense of logic you may have, it will reduce your chance of getting a headache.<br /><br />i does give me some hope that steven seagal is trying to move back towards the type of characters he portrayed in his more popular movies."

# **HTML Tag Remove**

In [31]:
import re

def remove_html_tags(text):
  pattern = re.compile("<\s*\/?\s*([a-zA-Z0-9]+)(\s[^>]*)?>")
  return pattern.sub(r"", text)

In [32]:
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"
remove_html_tags(text)

' Movie 1 Actor - Aamir Khan Click here to download'

In [33]:
df["review"]= df["review"].apply(remove_html_tags)

In [34]:
df["review"][199]

"the film quickly gets to a major chase scene with ever increasing destruction. the first really bad thing is the guy hijacking steven seagal would have been beaten to pulp by seagal's driving, but that probably would have ended the whole premise for the movie.it seems like they decided to make all kinds of changes in the movie plot, so just plan to enjoy the action, and do not expect a coherent plot. turn any sense of logic you may have, it will reduce your chance of getting a headache.i does give me some hope that steven seagal is trying to move back towards the type of characters he portrayed in his more popular movies."

# **Remove url**

In [39]:
import re

def remove_url(text):
  pattern = re.compile(r'\b(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\/\S*)?\b')
  return pattern.sub(r' ',text)

In [40]:
text2 = 'Check out my youtube https://www.youtube.com/@AKoley47'
remove_url(text2)

'Check out my youtube  '

# **Punctuation removal**

In [49]:
import string,time
exclude = string.punctuation


In [43]:
def remove_punctuation(text):
  return text.translate(str.maketrans('', '', exclude))

In [57]:
start= time.time()
df['review']= df['review'].apply(remove_punctuation)
stop= time.time()- start
stop

0.013237714767456055

In [58]:
df['review'][199]

'the film quickly gets to a major chase scene with ever increasing destruction the first really bad thing is the guy hijacking steven seagal would have been beaten to pulp by seagals driving but that probably would have ended the whole premise for the movieit seems like they decided to make all kinds of changes in the movie plot so just plan to enjoy the action and do not expect a coherent plot turn any sense of logic you may have it will reduce your chance of getting a headachei does give me some hope that steven seagal is trying to move back towards the type of characters he portrayed in his more popular movies'

# **Chat conversion handle**

In [70]:
chat_words = {
    'AFAIK':'As Far As I Know',
    'AFK':'Away From Keyboard',
    'ASAP':'As Soon As Possible',
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It",
}

In [73]:
def chat_conv(text):
  new_text= []
  for w in text.split():
    if w.upper() in chat_words:
      new_text.append(chat_words[w.upper()])
    else:
      new_text.append(w)
  return " ".join(new_text)

In [74]:
chat_conv("idk , yo have to do this asap")

"I Don't Know , yo have to do this As Soon As Possible"

# **Incorrect text**

In [76]:
from textblob import TextBlob

In [77]:
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

textBlob = TextBlob(incorrect_text)
textBlob.correct().string

'certain conditions during several generations are modified in the same manner.'

# **Remove Stopword**

In [78]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [79]:
from nltk.corpus import stopwords
stopwords.words("english")

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [82]:
def remove_stopwords(text):
  new_text = []
  for w in text.split():
    if w in stopwords.words("english"):
      new_text.append(" ")
    else:
      new_text.append(w)

  return "".join(new_text)

In [83]:
remove_stopwords('probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times')

'probably all-timefavoritemovie, story selflessness,sacrifice dedication  noblecause,   preachy boring.  nevergetsold,despite  seen  15  times'