In [1]:
import pandas as pd
import re                       # Import Regular Expression (remove HTML tags)
import string                   # Import Punctuation 
from textblob import TextBlob    # Import this Library to Handle the Spelling Issue
import nltk
from nltk.corpus import stopwords    #  NLTK library to remove Stopwords
import emoji                        # for translating symbol to text
import spacy                        # for tokenization
from nltk.stem.porter import PorterStemmer   # for stemming
from nltk.stem import WordNetLemmatizer      # for lemma
from sklearn.model_selection import train_test_split 



In [2]:
df = pd.read_csv(r'C:\Users\Admin\WORK\Project_CV\Model_NLP_sentiment\data\IMDB Dataset.csv')    # insert path to your data

In [4]:
df.head()                                                # check dataframe 

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [18]:
#Access the corpus and target variables
x = df.review
y = df.sentiment                                                                             #.replace({1:'Negative', 2:'Positive'})

# train test splitting
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [19]:
X_test.head()


11841    John Cassavetes is on the run from the law. He...
19602    It's not just that the movie is lame. It's mor...
45519    Well, if it weren't for Ethel Waters and a 7-y...
25747    I find Alan Jacobs review very accurate concer...
42642    This movie is simply awesome. It is so hilario...
Name: review, dtype: object

In [8]:
text = X_test.str.lower() 
text.head()

11841    john cassavetes is on the run from the law. he...
19602    it's not just that the movie is lame. it's mor...
45519    well, if it weren't for ethel waters and a 7-y...
25747    i find alan jacobs review very accurate concer...
42642    this movie is simply awesome. it is so hilario...
Name: review, dtype: object

In [62]:
# function for preporatin data
def preprocessing (text):
    text = text.lower()                          # LoweCasing Text
    pattern_1 = re.compile('<.*?>')                # constant using one regular expression
    pattern_2 = re.sub(pattern_1, r'', text)       # changes ('<.*?>') to gap " "
    pattern_3 = re.compile(r'https?://\S+|www\.\S+')
    pattern_4 = pattern_3.sub(r'', pattern_2)                                           #  Remove URLs from Text or Whole Corpus.
    return pattern_4
   

In [63]:
X_test = X_test.apply(preprocessing)
X_test


11841    john cassavetes is on the run from the law. he...
19602    it's not just that the movie is lame. it's mor...
45519    well, if it weren't for ethel waters and a 7-y...
25747    i find alan jacobs review very accurate concer...
42642    this movie is simply awesome. it is so hilario...
                               ...                        
25091    how did such a terrible script manage to attra...
27853    i was invited to view this film at a small art...
47278    first of all,there is a detective story:"légit...
37020    this movie grabbed me with the incredible open...
2217     i saw this film at sxsw with the director in a...
Name: review, Length: 10000, dtype: object

In [8]:
#  Remove URLs from Text or Whole Corpus.
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

df['review'] = df['review'].apply(remove_url)
df['review'][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

In [72]:
check_data = {
    "review": ["@lapcat need to send 'em to my accountant tomorrow. oddly, i wasn't even referring to my taxes. those are supporting evidence, though. ",
                "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>",
                 'Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1', 'IMHO he is the best', 'FYI Islamabad is the capital of Pakistan',
                 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner', 'probably my all-time favorite movie, a story of selflessness,'
                 ' sacrifice and dedication to a noble cause', "Loved the movie. It was 😘"
]
    }

# Convert to DataFrame
df_check = pd.DataFrame(check_data)

# Print the DataFrame
prov = df_check['review']
prov


0    @lapcat need to send 'em to my accountant tomo...
1    <html><body><p> Movie 1</p><p> Actor - Aamir K...
2    Check out my notebook https://www.kaggle.com/c...
3                                  IMHO he is the best
4             FYI Islamabad is the capital of Pakistan
5    ceertain conditionas duriing seveal ggeneratio...
6    probably my all-time favorite movie, a story o...
7                            Loved the movie. It was 😘
Name: review, dtype: object

In [73]:
prov = prov.apply(preprocessing)
prov


0    @lapcat need to send 'em to my accountant tomo...
1     movie 1 actor - aamir khan click here to down...
2                               check out my notebook 
3                                  imho he is the best
4             fyi islamabad is the capital of pakistan
5    ceertain conditionas duriing seveal ggeneratio...
6    probably my all-time favorite movie, a story o...
7                            loved the movie. it was 😘
Name: review, dtype: object

In [14]:
# Remove punctuation
punc = string.punctuation
def remove_punc(text):
    return text.translate(str.maketrans('', '', punc))

df['review'] = df['review'].apply(remove_punc)
df['review'][1]

'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

In [15]:
# Handling ChatWords
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

def chat_conversion(text):
    new_text = []
    for i in text.split():
        if i.upper() in chat_words:
            new_text.append(chat_words[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)


df['review'] = df['review'].apply(chat_conversion)
df['review'][1]


'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

In [None]:
# Spelling Correction
# Spell = TextBlob(df[review'])
# df['review'][1]


# print(df['review'].correct())

AttributeError: 'Series' object has no attribute 'raw'

In [None]:
# Handling StopWords
nltk.download('stopwords')
stopword = stopwords.words('english')
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopword:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)


df['review'] = df['review'].apply(remove_stopwords)
df['review'][2]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Handling Emojies

df['review'] = df['review'].apply(emoji.demojize)
df['review'][2]



'I thought wonderful way spend time hot summer weekend, sitting air conditioned theater watching light-hearted comedy. The plot simplistic, dialogue witty characters likable (even well bread suspected serial killer). While may disappointed realize Match Point 2: Risk Addiction, I thought proof Woody Allen still fully control style many us grown love.<br /><br />This I\'d laughed one Woody\'s comedies years (dare I say decade?). While I\'ve never impressed Scarlet Johanson, managed tone "sexy" image jumped right average, spirited young woman.<br /><br />This may crown jewel career, wittier "Devil Wears Prada" interesting "Superman" great comedy go see friends.'

In [None]:
#Stemming

stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df['review'] = df['review'].apply(stem_words)
df['review'][1]

'i thought thi wa a wonder way to spend time on a too hot summer weekend, sit in the air condit theater and watch a light-heart comedy. the plot is simplistic, but the dialogu is witti and the charact are likabl (even the well bread suspect serial killer). while some may be disappoint when they realiz thi is not match point 2: risk addiction, i thought it wa proof that woodi allen is still fulli in control of the style mani of us have grown to love.<br /><br />thi wa the most i\'d laugh at one of woody\' comedi in year (dare i say a decade?). while i\'v never been impress with scarlet johanson, in thi she manag to tone down her "sexy" imag and jump right into a average, but spirit young woman.<br /><br />thi may not be the crown jewel of hi career, but it wa wittier than "devil wear prada" and more interest than "superman" a great comedi to go see with friends.'

In [None]:
#Tokenization
nlp = spacy.load('en_core_web_sm')                     # the English language model 'en_core_web_sm
df['review'] = df['review'].apply(nlp)


I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I'd laughed at one of Woody's comedies in years (dare I say a decade?). While I've never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.

In [14]:
print(df['review'][1:3])

1    (A, wonderful, little, production, ., <, br, /...
2    (I, thought, this, was, a, wonderful, way, to,...
Name: review, dtype: object


In [9]:
#Lemmatization

wordnet_lemmatizer = WordNetLemmatizer()


df['review'] = df['review'].apply(wordnet_lemmatizer.lemmatize(df['review'],pos='v'))
df['review'][1]



LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\Admin/nltk_data'
    - 'c:\\Users\\Admin\\AppData\\Local\\Programs\\Python\\Python313\\nltk_data'
    - 'c:\\Users\\Admin\\AppData\\Local\\Programs\\Python\\Python313\\share\\nltk_data'
    - 'c:\\Users\\Admin\\AppData\\Local\\Programs\\Python\\Python313\\lib\\nltk_data'
    - 'C:\\Users\\Admin\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
