In [1]:
""" This notebook used for preporation data to NLP modeling.
 Proceses of Tokenization, Stemming, Lemmatization, Handling text (Remove HTML Tag, URLs, Emojies and other) are here.   """

import pandas as pd
import numpy as np
import re                                              # Import Regular Expression (remove HTML tags)
import string                                          # Import Punctuation 
from textblob import TextBlob                          # Import this Library to Handle the Spelling Issue
import nltk
from nltk.corpus import stopwords                      #  NLTK library to remove Stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import emoji                                           # for translating symbol to text
import spacy                                           # for tokenization
import spacy.cli
# spacy.cli.download("en_core_web_sm")                 # for  working with spacy, after the first start should pick  # spacy.cli.download("en_core_web_lg")
from nltk.stem.porter import PorterStemmer             # for stemming
# nltk.download('all')                                   # for  working with NLTL function, after the first start should pick #nltk.download('all') 
from sklearn.model_selection import train_test_split
from chat_words import chat_word                       # for translate slang of charts to text
from autocorrect import Speller                        # for Spelling Correction
from collections import Counter, OrderedDict           # for definition of unique words (tokens) in dataframe
from torchtext.vocab import vocab  
from tqdm import tqdm                                   # progressbar
tqdm.pandas() 
# import warnings
# warnings.filterwarnings('ignore')



In [2]:
df = pd.read_csv(r'C:\Users\Admin\WORK\Project_CV\Model_NLP_sentiment\data\IMDB Dataset.csv')    # insert path to your data

In [3]:
df.head(3)                                                # check dataframe 

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [3]:
# transform sentiment  into the number labels
def transform_label(label):
    return 1 if label == 'positive' else 0

df['label'] = df['sentiment'].apply(transform_label)
df.head(2)

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1


In [12]:
# Choose items for preprocessing: True or False

lower = True                           # LoweCasing Text
remove_html = True                     # Remove HTML Tag
remove_url = True                       # Remove URLs
remove_punc = True                     # Remove punctuation
change_chat = True                     # Handling chat's words to words
spell_cor = True                       # Spelling Correction
remove_stopword = True                 # Remove StopWords
remove_emoji = True                  # Handling Emojies to words
use_stemm = False                       # Apply Stemming
use_lemm = True                        # Apply Lemmatization
use_token = False                       # Apply Tokenization  

In [13]:
# Function for preprocessing
def preprocessing (text):
        
    if lower:                                                        # LoweCasing Text
        text = text.lower()
                                            
    if remove_html:
        pattern_1 = re.compile('<.*?>')                              # constant using one regular expression
        text = re.sub(pattern_1, r'', text)                          # Remove HTML Tags (changes ('<.*?>') to gap " ")

    if remove_url:
        pattern_2 = re.compile(r'https?://\S+|www\.\S+')             #  Remove URLs from Text or Whole Corpus.
        text = pattern_2.sub(r'', text)

    if remove_punc:
        punc = string.punctuation                                    # Remove punctuation
        text = text.translate(str.maketrans('', '', punc))

    if change_chat:
        new_text = []                                                 # changes chat's words to text       
        for i in text.split():
            if i.upper() in chat_word:
                new_text.append(chat_word[i.upper()])
            else:
                new_text.append(i)
        text = " ".join(new_text)
        new_text.clear()

    if spell_cor:
        spell = Speller(lang='en')                                    # Spelling Correction
        text = spell(text)

    if remove_stopword:
        stopword = stopwords.words('english')                          # Handling StopWords
        for word in text.split():
            if word in stopword:
                new_text.append('')
            else:
                new_text.append(word)
        pattern_3 = new_text[:]
        text = " ".join(pattern_3)

    if remove_emoji:
        text = emoji.demojize(text)                                   # Handling Emojies 

    
    if use_stemm:
        stemmer = PorterStemmer()                                     # Stemming
        text = " ".join([stemmer.stem(word)
                  for word in text.split()])
                            
        
    if use_lemm:
        lemmatizer = WordNetLemmatizer()                              #Lemmatization
        words = nltk.word_tokenize(text)
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        text = ' '.join(lemmatized_words)


    if use_token:
        nlp = spacy.load('en_core_web_sm')                            # the English language model 'en_core_web_sm'
        text = nlp(text)                                              # cmd:  python -m spacy download en_core_web_sm

    return text

In [14]:
df_new = df.loc[0:10]

In [15]:
# # Applying function for preprocessing

# # X_trainn = X_train.apply(preprocessing)
# x_testt = x_test.apply(preprocessing)
# x_testt.head(2)

df_new['clean'] = df_new['review'].apply(preprocessing) 
df_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['clean'] = df_new['review'].apply(preprocessing)


Unnamed: 0,review,sentiment,label,clean
0,One of the other reviewers has mentioned that ...,positive,1,one reviewer mentioned watching 1 oz episode h...
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend Tears eye hot summ...
3,Basically there's a family where a little boy ...,negative,0,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,better matter love Tears eye money visually st...
5,"Probably my all-time favorite movie, a story o...",positive,1,probably alltime favorite movie story selfless...
6,I sure would like to see a resurrection of a u...,positive,1,sure would like see resurrection dated shunt s...
7,"This show was an amazing, fresh & innovative i...",negative,0,show amazing fresh innovative idea 70 first ai...
8,Encouraged by the positive comments about this...,negative,0,encouraged positive comment film looking forwa...
9,If you like original gut wrenching laughter yo...,positive,1,like original gut reaching laughter like movie...


In [16]:
 #get all processed reviews
reviews = df_new.clean.values
# merge into single variable, separated by whitespaces
words = ' '.join(reviews)
# obtain list of words
words = words.split()

# check our list
words[:10]

['one',
 'reviewer',
 'mentioned',
 'watching',
 '1',
 'oz',
 'episode',
 'hooked',
 'right',
 'exactly']

In [17]:
print(len(words))

880


In [18]:
# build vocabulary
counter = Counter(words)
vocab = sorted(counter, key=counter.get, reverse=True)
int2word = dict(enumerate(vocab, 1))
int2word[0] = '<PAD>'
word2int = {word: id for id, word in int2word.items()}

In [19]:
print(int2word)

{1: 'one', 2: 'film', 3: 'show', 4: 'movie', 5: 'would', 6: 'see', 7: 'first', 8: 'well', 9: 'like', 10: 'watching', 11: 'oz', 12: 'thing', 13: 'way', 14: 'eye', 15: 'violence', 16: 'go', 17: 'never', 18: 'say', 19: 'get', 20: 'truly', 21: 'comedy', 22: 'Tears', 23: 'character', 24: 'jake', 25: 'matter', 26: 'right', 27: 'city', 28: 'prison', 29: 'far', 30: 'forget', 31: 'pretty', 32: 'got', 33: 'may', 34: 'little', 35: 'great', 36: 'every', 37: 'air', 38: 'point', 39: 'still', 40: 'many', 41: 'u', 42: 'year', 43: 'ive', 44: 'parent', 45: 'make', 46: 'must', 47: 'drama', 48: 'better', 49: 'mr', 50: 'seems', 51: 'people', 52: 'different', 53: 'world', 54: 'find', 55: 'cast', 56: 'performance', 57: 'tv', 58: 'believe', 59: 'funny', 60: 'bad', 61: 'original', 62: 'episode', 63: 'exactly', 64: 'struck', 65: 'scene', 66: 'set', 67: 'word', 68: 'use', 69: 'state', 70: 'high', 71: 'home', 72: 'away', 73: 'appeal', 74: 'due', 75: 'wouldnt', 76: 'dare', 77: 'picture', 78: 'around', 79: 'nasty',

In [20]:
# encode words
reviews_enc = [[word2int[word] for word in review.split()] for review in tqdm(reviews)]

# print first-10 words of first 5 reviews
for i in range(5):
    print(reviews_enc[i][:15])

100%|██████████| 11/11 [00:00<?, ?it/s]

[1, 140, 141, 10, 142, 11, 62, 143, 26, 63, 144, 145, 7, 12, 64]
[86, 34, 87, 232, 88, 233, 234, 235, 89, 236, 237, 238, 239, 90, 240]
[100, 86, 13, 278, 22, 14, 279, 280, 281, 282, 37, 283, 284, 10, 285]
[332, 105, 333, 34, 334, 24, 335, 105, 106, 107, 44, 336, 337, 4, 338]
[48, 25, 111, 22, 14, 112, 363, 364, 2, 113, 49, 25, 365, 41, 366]





In [21]:
# padding sequences

def pad_features(reviews, pad_id, seq_length=128):
    # features = np.zeros((len(reviews), seq_length), dtype=int)
    features = np.full((len(reviews), seq_length), pad_id, dtype=int)      #Return a new array of given shape (len(reviews), seq_length) and type = int, filled with pad_id.

    for i, row in enumerate(reviews):
        # if seq_length < len(row) then review will be trimmed
        features[i, :len(row)] = np.array(row)[:seq_length]

    return features

seq_length = 256
features = pad_features(reviews_enc, pad_id=word2int['<PAD>'], seq_length=seq_length)

assert len(features) == len(reviews_enc)
assert len(features[0]) == seq_length

features[:10, :10]

array([[  1, 140, 141,  10, 142,  11,  62, 143,  26,  63],
       [ 86,  34,  87, 232,  88, 233, 234, 235,  89, 236],
       [100,  86,  13, 278,  22,  14, 279, 280, 281, 282],
       [332, 105, 333,  34, 334,  24, 335, 105, 106, 107],
       [ 48,  25, 111,  22,  14, 112, 363, 364,   2, 113],
       [126, 421, 422,   4, 423, 424, 425, 426, 427, 428],
       [457,   5,   9,   6, 458, 459, 132, 460, 461, 133],
       [  3, 497, 498, 499, 500, 501,   7, 502,   7, 503],
       [540, 541, 542,   2, 543, 544,  10,   2,  60, 545],
       [  9,  61, 573, 574, 575,   9,   4, 103, 128, 111]])

In [22]:
# get labels as numpy
labels = df_new.label.to_numpy()
labels

array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0])

In [None]:
#Access the corpus and target variables
x = df.review
y = df.label                                                                            

# train test splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.0005, random_state=0)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
