In [1]:
import numpy as np
import pandas as pd
import requests

In [2]:
df = pd.read_csv('A:/NLP/text_preprocessing/movies.csv')

In [3]:
df.shape

(10000, 3)

In [4]:
df.head()

Unnamed: 0,title,overview,genre
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,KPop Demon Hunters,"When K-pop superstars Rumi, Mira and Zoey aren...","Animation, Fantasy, Action, Comedy, Music"
3,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
4,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     10000 non-null  object
 1   overview  9998 non-null   object
 2   genre     10000 non-null  object
dtypes: object(3)
memory usage: 234.5+ KB


In [20]:
df.dropna(inplace = True)

# **`Text Preprocessing`**

*`Lower Casing`*

In [21]:
df["overview"] = df['overview'].str.lower()
df['overview'].head()

0    imprisoned in the 1940s for the double murder ...
1    spanning the years 1945 to 1955, a chronicle o...
2    when k-pop superstars rumi, mira and zoey aren...
3    in the continuing saga of the corleone crime f...
4    the true story of how businessman oskar schind...
Name: overview, dtype: object

`HTML tags removal`

In [22]:
import re
df['overview'].str.contains('<.*?>').sum()

0

Contains no HTML tags

`URLs Removal`

In [23]:
df['overview'].str.contains(r'https?://\S+|www\.\S+').sum()

0

No URLs

`Punctuation remval`

In [24]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [25]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [26]:
df['overview'] = df['overview'].apply(remove_punctuation)

In [27]:
df.head()

Unnamed: 0,title,overview,genre
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,spanning the years 1945 to 1955 a chronicle of...,"Drama, Crime"
2,KPop Demon Hunters,when kpop superstars rumi mira and zoey arent ...,"Animation, Fantasy, Action, Comedy, Music"
3,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime"
4,Schindler's List,the true story of how businessman oskar schind...,"Drama, History, War"


`Chat slang`

In [28]:
chat_slang = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laughing My A** Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A**",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A** Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F***",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing"
}

In [29]:
def slang_converter(text):
    converted_words = []
    for word in text.split():
        if word.upper() in chat_slang:
            converted_words.append(chat_slang[word.upper()])
        else:
            converted_words.append(word)

    return ' '.join(converted_words)

In [30]:
df["overview"] = df["overview"].apply(slang_converter)

`Spelling Correction`

In [31]:
df.head()

Unnamed: 0,title,overview,genre
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,spanning the years 1945 to 1955 a chronicle of...,"Drama, Crime"
2,KPop Demon Hunters,when kpop superstars rumi mira and zoey arent ...,"Animation, Fantasy, Action, Comedy, Music"
3,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime"
4,Schindler's List,the true story of how businessman oskar schind...,"Drama, History, War"


In [33]:
from textblob import TextBlob

In [None]:
# corrected = []
# for text in df['overview']:
#     corrected.append(TextBlob(text).correct().string)

In [34]:
# corrected

`Stop Words`

In [36]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AliRaza\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [37]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))  # Use a set for O(1) lookup
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [39]:
df['overview'] = df['overview'].apply(remove_stopwords)

In [40]:
df.sample(5)

Unnamed: 0,title,overview,genre
89,Taylor Swift: Reputation Stadium Tour,taylor swift takes stage dallas reputation sta...,Music
6448,Alice,alice tate mother two marriage 16 years finds ...,"Fantasy, Drama, Romance, Comedy"
7695,Goosebumps 2: Haunted Halloween,careful wish school junk business best friends...,"Comedy, Fantasy, Horror"
3031,Gangs of Wasseypur - Part 1,1970s india sardar khan vows take revenge man ...,"Action, Thriller, Crime"
5740,Save the Last Dance,death mother sara moves south side chicago liv...,"Drama, Family, Romance, Music"


In [43]:
import re

def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # Emoticons 😀-🙏
        u"\U0001F300-\U0001F5FF"  # Symbols & pictographs 🌀-🗿
        u"\U0001F680-\U0001F6FF"  # Transport & map symbols 🚀-🚧
        u"\U0001F1E0-\U0001F1FF"  # Flags 🇦-🇿
        u"\U00002700-\U000027BF"  # Dingbats ✀-➿
        u"\U0001F900-\U0001F9FF"  # Supplemental symbols 🤐-🧿
        u"\U00002600-\U000026FF"  # Misc symbols ☀-⛿
        u"\U0001FA70-\U0001FAFF"  # Symbols and pictographs extended 🩰-🛸
        u"\U000025A0-\U000025FF"  # Geometric Shapes ■-◿
        "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', text)

In [45]:
df['overview'] = df['overview'].apply(remove_emojis)

In [60]:
df['overview'].str.split()[12]

['armies',
 'mass',
 'final',
 'battle',
 'decide',
 'fate',
 'worldand',
 'powerful',
 'ancient',
 'forces',
 'light',
 'dark',
 'compete',
 'determine',
 'outcomeone',
 'member',
 'fellowship',
 'ring',
 'revealed',
 'noble',
 'heir',
 'throne',
 'kings',
 'men',
 'yet',
 'sole',
 'hope',
 'triumph',
 'evil',
 'lies',
 'brave',
 'hobbit',
 'frodo',
 'accompanied',
 'loyal',
 'friend',
 'sam',
 'hideous',
 'wretched',
 'gollum',
 'ventures',
 'deep',
 'dark',
 'heart',
 'mordor',
 'seemingly',
 'impossible',
 'quest',
 'destroy',
 'ring',
 'power\u200b']

In [67]:
import spacy



TypeError: ForwardRef._evaluate() missing 1 required keyword-only argument: 'recursive_guard'

In [66]:
nlp(df["overview"][0])

NameError: name 'nlp' is not defined