In [None]:
""" This notebook used for preporation data to NLP modeling.
 Proceses of Tokenization, Stemming, Lemmatization, Handling text (Remove HTML Tag, URLs, Emojies and other) are here.   """

import pandas as pd
import re                                              # Import Regular Expression (remove HTML tags)
import string                                          # Import Punctuation 
from textblob import TextBlob                          # Import this Library to Handle the Spelling Issue
import nltk
from nltk.corpus import stopwords                      #  NLTK library to remove Stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import emoji                                           # for translating symbol to text
import spacy                                           # for tokenization
import spacy.cli
# spacy.cli.download("en_core_web_sm")                 # for  working with spacy, after the first start should pick  # spacy.cli.download("en_core_web_lg")
from nltk.stem.porter import PorterStemmer             # for stemming
# nltk.download('all')                                   # for  working with NLTL function, after the first start should pick #nltk.download('all') 
from sklearn.model_selection import train_test_split
from chat_words import chat_word                       # for translate slang of charts to text
from autocorrect import Speller                        # for Spelling Correction
from collections import Counter, OrderedDict           # for definition of unique words (tokens) in dataframe
from torchtext.vocab import vocab  
from tqdm import tqdm                                   # progressbar
tqdm.pandas() 
# import warnings
# warnings.filterwarnings('ignore')



In [2]:
df = pd.read_csv(r'C:\Users\Admin\WORK\Project_CV\Model_NLP_sentiment\data\IMDB Dataset.csv')    # insert path to your data

In [3]:
df.head(2)                                                # check dataframe 

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [None]:
# # transform sentiment  into the number labels
# def transform_label(label):
#     return 1 if label == 'positive' else 0

# df['label'] = df['sentiment'].apply(transform_label)
# df.head(2)

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1


In [None]:
# #Access the corpus and target variables
# x = df.review
# y = df.label                                                                            

# # train test splitting
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.0005, random_state=0)
# print(x_train.shape)
# print(x_test.shape)
# print(y_train.shape)
# print(y_test.shape)


(49975,)
(25,)
(49975,)
(25,)


In [16]:
# Choose items for preprocessing: True or False

lower = True                           # LoweCasing Text
remove_html = True                     # Remove HTML Tag
remove_url = True                       # Remove URLs
remove_punc = True                     # Remove punctuation
change_chat = True                     # Handling chat's words to words
spell_cor = True                       # Spelling Correction
remove_stopword = True                 # Remove StopWords
remove_emoji = True                  # Handling Emojies to words
use_stemm = False                       # Apply Stemming
use_lemm = True                        # Apply Lemmatization
use_token = True                       # Apply Tokenization  

In [17]:
# Function for preprocessing
def preprocessing (text):
        
    if lower:                                                        # LoweCasing Text
        text = text.lower()
                                            
    if remove_html:
        pattern_1 = re.compile('<.*?>')                              # constant using one regular expression
        text = re.sub(pattern_1, r'', text)                          # Remove HTML Tags (changes ('<.*?>') to gap " ")

    if remove_url:
        pattern_2 = re.compile(r'https?://\S+|www\.\S+')             #  Remove URLs from Text or Whole Corpus.
        text = pattern_2.sub(r'', text)

    if remove_punc:
        punc = string.punctuation                                    # Remove punctuation
        text = text.translate(str.maketrans('', '', punc))

    if change_chat:
        new_text = []                                                 # changes chat's words to text       
        for i in text.split():
            if i.upper() in chat_word:
                new_text.append(chat_word[i.upper()])
            else:
                new_text.append(i)
        text = " ".join(new_text)
        new_text.clear()

    if spell_cor:
        spell = Speller(lang='en')                                    # Spelling Correction
        text = spell(text)

    if remove_stopword:
        stopword = stopwords.words('english')                          # Handling StopWords
        for word in text.split():
            if word in stopword:
                new_text.append('')
            else:
                new_text.append(word)
        pattern_3 = new_text[:]
        text = " ".join(pattern_3)

    if remove_emoji:
        text = emoji.demojize(text)                                   # Handling Emojies 

    
    if use_stemm:
        stemmer = PorterStemmer()                                     # Stemming
        text = " ".join([stemmer.stem(word)
                  for word in text.split()])
                            
        
    if use_lemm:
        lemmatizer = WordNetLemmatizer()                              #Lemmatization
        words = nltk.word_tokenize(text)
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        text = ' '.join(lemmatized_words)


    if use_token:
        nlp = spacy.load('en_core_web_sm')                            # the English language model 'en_core_web_sm'
        text = nlp(text)                                              
    return text

In [18]:

df_new = df.loc[0:10]
df_new

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [19]:
# # Applying function for preprocessing

# X_trainn = X_train.apply(preprocessing)
# x_testt = x_test.apply(preprocessing)
# x_testt.head(2)

df_new = df_new['review'].apply(preprocessing) 
df_new

0     (one, reviewer, mentioned, watching, 1, oz, ep...
1     (wonderful, little, production, filming, techn...
2     (thought, wonderful, way, spend, Tears, eye, h...
3     (basically, there, family, little, boy, jake, ...
4     (better, matter, love, Tears, eye, money, visu...
5     (probably, alltime, favorite, movie, story, se...
6     (sure, would, like, see, resurrection, dated, ...
7     (show, amazing, fresh, innovative, idea, 70, f...
8     (encouraged, positive, comment, film, looking,...
9     (like, original, gut, reaching, laughter, like...
10    (phil, alien, one, quirky, film, humour, based...
Name: review, dtype: object

In [20]:
 #get all processed reviews
reviews = df.review.values
# merge into single variable, separated by whitespaces
words = ' '.join(reviews)
# obtain list of words
words = words.split()

# check our list
words[:20]

['One',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 '1',
 'Oz',
 'episode',
 "you'll",
 'be',
 'hooked.',
 'They',
 'are',
 'right,']

In [24]:
# build vocabulary
counter = Counter(words)
vocab = sorted(counter, key=counter.get, reverse=True)
int2word = dict(enumerate(vocab, 1))
int2word[0] = '<PAD>'
word2int = {word: id for id, word in int2word.items()}

In [26]:
# encode words
reviews_enc = [[word2int[word] for word in review.split()] for review in tqdm(reviews)]

# print first-10 words of first 5 reviews
for i in range(5):
    print(reviews_enc[i][:5])

100%|██████████| 50000/50000 [00:01<00:00, 29831.83it/s]

[317, 4, 1, 79, 2282]
[130, 436, 110, 2631, 102]
[8, 196, 10, 13, 2]
[5357, 306, 2, 291, 108]
[169881, 53939, 10359, 7, 1]





In [21]:
print(len(words))

11557847


In [13]:
# This block for checking  def Proprocessing

check_data = {
    "review": ["@lapcat need to send 'em to my accountant tomorrow. oddly, i wasn't even referring to my taxes. those are supporting evidence, though. ",
                "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>",
                 'Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1', 'IMHO he is the best', 'FYI Islamabad is the capital of Pakistan',
                 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner', 'probably my all-time favorite movie, a story of selflessness,'
                 ' sacrifice and dedication to a noble cause', "Loved the movie. It was 😘", "walk walks walking walked",
                 "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
]
    }

# Convert to DataFrame
df_check = pd.DataFrame(check_data)

# Print the DataFrame
prov = df_check['review']
prov

0    @lapcat need to send 'em to my accountant tomo...
1    <html><body><p> Movie 1</p><p> Actor - Aamir K...
2    Check out my notebook https://www.kaggle.com/c...
3                                  IMHO he is the best
4             FYI Islamabad is the capital of Pakistan
5    ceertain conditionas duriing seveal ggeneratio...
6    probably my all-time favorite movie, a story o...
7                            Loved the movie. It was 😘
8                            walk walks walking walked
9    He was running and eating at same time. He has...
Name: review, dtype: object

In [14]:
prov_check = prov.apply(preprocessing)
prov_check

0    (lancet, need, send, em, accountant, tomorrow,...
1       (movie, 1, actor, amir, khan, click, download)
2                                    (check, notebook)
3           (In, My, Honest, /, Humble, Opinion, best)
4    (For, Your, Information, islamabad, capital, p...
5    (certain, condition, several, generation, modi...
6    (probably, alltime, favorite, movie, story, se...
7            (loved, movie, :, face_blowing_a_kiss, :)
8                        (walk, walk, walking, walked)
9    (running, eating, Tears, eye, bad, habit, swim...
Name: review, dtype: object

In [25]:
# get all processed r
# Definition of unique words (tokens) in dataframe

token_counts = Counter()
for word in x_testt:
    token_counts.update(word)



In [26]:
print('Dictionary size:', len(token_counts))

Dictionary size: 2520


In [None]:


sorted_by_freg_tuples = sorted (token_counts.items(), key=lambda x: x[1], reverse = True)
ordered_dict = OrderedDict(sorted_by_freg_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token('<pad>', 0)
vocab.insert_token('<unk>', 1)
vocab.set_default_index(1)
# print([vocab[token] for token in ['Hot','run'] ])

In [30]:
print(type(x_testt))
print(type(df))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [28]:
all_text2 = x_testt[11841].tolist()

# get all processed reviews

# merge into single variable, separated by whitespaces
all_text2 = ' '.join(all_text2)     
# obtain list of words
words = all_text2.split()

# check our list
words[:10]

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'tolist'

In [None]:
VOCABULARY_SIZE = 20000
TEXT = torchtext.legacy.data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')