In [1]:
import re
import pandas as pd

In [2]:
df = pd.read_csv(r"D:\Jio Institute\Q2\NLP\PL\IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
#remove html tags
def remove_htmltags(text):
    pattern =  re.compile('<.*?>')
    return pattern.sub(r'', text)

In [4]:
df['review'] = df['review'].apply(remove_htmltags)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
#remove hyperlinks
def remove_hyperlinks(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)

In [6]:
df['review'] = df['review'].apply(remove_hyperlinks)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
import string  
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
#remove_punctuations
exclude = string.punctuation

In [9]:
def remove_punctuations(text):
    for char in exclude:
        if char in text:
            text = text.replace(char,'')
    return text

In [10]:
text = 'Hey there!, How do you feel today?'
remove_punctuations(text)

'Hey there How do you feel today'

In [11]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [12]:
#spelling mistakes
from textblob import TextBlob

inc_text = 'Sometimess it is realy hard to cotrol sleeeep'
txtBlb = TextBlob(inc_text)
txtBlb.correct().string

'Sometimes it is really hard to control sleep'

In [13]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

In [14]:
def remove_stopwords(text):
    text_clean=[]
    for word in text.split():
        if word in stopwords:
            text_clean.append('')
        else:
            text_clean.append(word)
    x=text_clean[:]
    text_clean.clear()
    return " ".join(x)

In [15]:
text = 'I know it is going to be 12 midnght and like everyone else I am sleepy as hell'
remove_stopwords(text)

'I know   going   12 midnght  like everyone else I  sleepy  hell'

In [16]:
df['review'].apply(remove_stopwords)

0        One    reviewers  mentioned   watching  1 Oz e...
1        A wonderful little production. The filming tec...
2        I thought    wonderful way  spend time    hot ...
3        Basically there's  family   little boy (Jake) ...
4        Petter Mattei's "Love   Time  Money"   visuall...
                               ...                        
49995    I thought  movie    right good job. It   creat...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I   Catholic taught  parochial elementary scho...
49998    I'm going    disagree   previous comment  side...
49999    No one expects  Star Trek movies   high art,  ...
Name: review, Length: 50000, dtype: object

In [17]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.


In [18]:
#Handling emojis
import emoji
print(emoji.demojize('Python is a 🐍'))    

Python is a :snake:


In [19]:
from nltk.tokenize import word_tokenize,sent_tokenize

sent1= "I don't know why I am so sleepy when the interviews are near. Han... Studyyy, it's now pr never"
sent_tokenize(sent1)

["I don't know why I am so sleepy when the interviews are near.",
 "Han... Studyyy, it's now pr never"]


Stemming is the process of reducing inflection in words to their root forms such as mapping a group of words to the same stem even if the stem itself is not valid word in the language.

In [20]:
from nltk.stem.porter import PorterStemmer

stem = PorterStemmer()
def stem_words(text):
    return " ".join([stem.stem(word) for word in text.split()])

In [21]:
sample = 'wakl walks walking walked like a Jambo'
stem_words(sample)

'wakl walk walk walk like a jambo'

In [25]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = 'He was running like an idiotic person when I first saw him. Later I realized, I was not wrong'
punctuations = '@#$%^&*'
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)
        
sentence_words
print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word, pos='v')))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
like                like                
an                  an                  
idiotic             idiotic             
person              person              
when                when                
I                   I                   
first               first               
saw                 saw                 
him                 him                 
.                   .                   
Later               Later               
I                   I                   
realized            realize             
,                   ,                   
I                   I                   
was                 be                  
not                 not                 
wrong               wrong               


## Feature Extraction

In [26]:
df1 = pd.DataFrame({'text': ['people watch movies','movies watch movies','people write comment','movies write comment']})

In [27]:
df1.head()

Unnamed: 0,text
0,people watch movies
1,movies watch movies
2,people write comment
3,movies write comment


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
cv= CountVectorizer()

In [30]:
bow = cv.fit_transform(df1['text'])

In [31]:
#vocab
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'movies': 1, 'write': 4, 'comment': 0}


In [32]:
print(bow[0].toarray())
print(bow[1].toarray())

[[0 1 1 1 0]]
[[0 2 0 1 0]]


In [33]:
cv.transform(['movie watches amd write comment']).toarray()

array([[1, 0, 0, 0, 1]], dtype=int64)

In [34]:
cv2 = CountVectorizer(ngram_range=(2,2))

bi_gram = cv2.fit_transform(df1['text'])

print(cv2.vocabulary_)

{'people watch': 2, 'watch movies': 4, 'movies watch': 0, 'people write': 3, 'write comment': 5, 'movies write': 1}


In [35]:
cv3 = CountVectorizer(ngram_range=(1,2))

tri = cv3.fit_transform(df1['text'])

print(cv3.vocabulary_)

{'people': 4, 'watch': 7, 'movies': 1, 'people watch': 5, 'watch movies': 8, 'movies watch': 2, 'write': 9, 'comment': 0, 'people write': 6, 'write comment': 10, 'movies write': 3}


In [36]:
df1['output'] = [1,1,0,0]
df1

Unnamed: 0,text,output
0,people watch movies,1
1,movies watch movies,1
2,people write comment,0
3,movies write comment,0


In [37]:
#TFIdf Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit_transform(df1['text']).toarray()

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

In [38]:
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[1.51082562 1.22314355 1.51082562 1.51082562 1.51082562]
['comment' 'movies' 'people' 'watch' 'write']


In [39]:
#word2Vec

import gensim
from gensim.models import Word2Vec, KeyedVectors