In [None]:
""" This notebook prepares text data for NLP modeling.
 It covers text cleaning (removing HTML tags, URLs and punctuation, converting emojis to text, and more), stemming, lemmatization and tokenization.   """

import pandas as pd
import re                                              # Import Regular Expression (remove HTML tags)
import string                                          # Import Punctuation 
from textblob import TextBlob                          # Import this Library to Handle the Spelling Issue
import nltk
from nltk.corpus import stopwords                      #  NLTK library to remove Stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import emoji                                           # for translating symbol to text
import spacy                                           # for tokenization
from nltk.stem.porter import PorterStemmer             # for stemming
#nltk.download('all')                                   # uncomment on the first run to download the NLTK data that the NLTK functions need
from sklearn.model_selection import train_test_split
from chat_words import chat_word                       # for translate slang of charts to text
from autocorrect import Speller                        # for Spelling Correction

In [2]:
# Location of the IMDB reviews CSV -- insert the path to your data here.
DATA_PATH = r'C:\Users\Admin\WORK\Project_CV\Model_NLP_sentiment\data\IMDB Dataset.csv'
df = pd.read_csv(DATA_PATH)

In [13]:
df.head(2)                                                # preview the first two rows to check that the data loaded

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [28]:
# Corpus (review texts) and target (sentiment labels)
x, y = df["review"], df["sentiment"]

# Hold out a very small test split (test_size=0.0005); the fixed seed keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.0005,
    random_state=0,
)

# Show the size of each split, train first
print(X_train.shape, X_test.shape, sep="\n")

(49975,)
(25,)


In [24]:
# Choose items for preprocessing: True or False
# These module-level switches are read by preprocessing() every time it is called.

lower = True                           # Lowercase the text
remove_html = True                     # Remove HTML tags
remove_url = True                       # Remove URLs
remove_punc = True                     # Remove punctuation
change_chat = True                     # Expand chat abbreviations found in chat_word into full text
spell_cor = True                       # Apply spelling correction
remove_stopword = True                 # Remove English stop words
remove_emoji = True                  # Convert emojis to their text names (emoji.demojize); they are not deleted
use_stemm = False                       # Apply stemming (PorterStemmer)
use_lemm = True                        # Apply lemmatization (WordNetLemmatizer)
use_token = True                        # Tokenize with spaCy; preprocessing() then returns the spaCy result instead of a str

In [None]:
# Function for preprocessing

# Patterns and the punctuation table are built once here instead of on every call.
_HTML_TAG_RE = re.compile('<.*?>')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Expensive helpers (spell checker, stop-word set, stemmer, lemmatizer, spaCy
# model) are created lazily on first use and then reused for every row.
_RESOURCE_CACHE = {}


def _get_resource(name, factory):
    """Return the cached resource `name`, building it with `factory()` on first use."""
    if name not in _RESOURCE_CACHE:
        _RESOURCE_CACHE[name] = factory()
    return _RESOURCE_CACHE[name]


def preprocessing(text):
    """Clean one piece of text according to the module-level preprocessing flags.

    Each step runs only when its flag is truthy, always in this order:
    lowercasing (`lower`), HTML tag removal (`remove_html`), URL removal
    (`remove_url`), punctuation removal (`remove_punc`), chat-abbreviation
    expansion (`change_chat`), spelling correction (`spell_cor`), stop-word
    removal (`remove_stopword`), emoji-to-text conversion (`remove_emoji`),
    stemming (`use_stemm`), lemmatization (`use_lemm`) and spaCy
    tokenization (`use_token`).

    Parameters
    ----------
    text : str
        Raw text of a single review.

    Returns
    -------
    str or spaCy object
        The cleaned string. When `use_token` is enabled, the result of
        running the 'en_core_web_sm' spaCy pipeline on that string is
        returned instead.
    """
    if lower:
        text = text.lower()

    if remove_html:
        # Replace each tag with a space (not ''), so that words separated only
        # by markup such as "end.<br /><br />Next" are not glued together.
        text = _HTML_TAG_RE.sub(' ', text)

    if remove_url:
        text = _URL_RE.sub('', text)

    if remove_punc:
        text = text.translate(_PUNCT_TABLE)

    if change_chat:
        # chat_word is looked up with the upper-cased token.
        text = " ".join(
            chat_word[word.upper()] if word.upper() in chat_word else word
            for word in text.split()
        )

    if spell_cor:
        spell = _get_resource('speller', lambda: Speller(lang='en'))
        text = spell(text)

    if remove_stopword:
        # A set gives constant-time membership tests. Stop words are dropped
        # outright; this step no longer relies on a list created by the
        # chat-word step, so it works when change_chat is False.
        stop_set = _get_resource(
            'stopwords', lambda: frozenset(stopwords.words('english'))
        )
        text = " ".join(word for word in text.split() if word not in stop_set)

    if remove_emoji:
        # Emojis are replaced by their text names rather than deleted.
        text = emoji.demojize(text)

    if use_stemm:
        stemmer = _get_resource('stemmer', lambda: PorterStemmer())
        text = " ".join(stemmer.stem(word) for word in text.split())

    if use_lemm:
        lemmatizer = _get_resource('lemmatizer', lambda: WordNetLemmatizer())
        words = nltk.word_tokenize(text)
        text = ' '.join(lemmatizer.lemmatize(word) for word in words)

    if use_token:
        # Loading the model is slow, so it happens once, not once per row.
        # Install it with: python -m spacy download en_core_web_sm
        nlp = _get_resource('spacy_nlp', lambda: spacy.load('en_core_web_sm'))
        text = nlp(text)

    return text

In [None]:
# Apply preprocessing() to every review of the held-out split.
# The equivalent call for the training split is kept commented out; enable it to
# preprocess X_train as well.
# NOTE(review): presumably disabled because of its runtime on the full training split -- confirm.

# X_trainn = X_train.apply(preprocessing)
X_testt = X_test.apply(preprocessing)

In [None]:
# Sanity-check data for preprocessing(): short hand-written samples covering
# mentions, HTML, URLs, chat abbreviations, misspellings, punctuation, emojis,
# and inflected word forms.
sample_reviews = [
    "@lapcat need to send 'em to my accountant tomorrow. oddly, i wasn't even referring to my taxes. those are supporting evidence, though. ",
    "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>",
    'Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1',
    'IMHO he is the best',
    'FYI Islamabad is the capital of Pakistan',
    'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner',
    # One sample written as two adjacent literals (implicit concatenation).
    (
        'probably my all-time favorite movie, a story of selflessness,'
        ' sacrifice and dedication to a noble cause'
    ),
    "Loved the movie. It was 😘",
    "walk walks walking walked",
    "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun.",
]
check_data = {"review": sample_reviews}

# Wrap the samples in a one-column DataFrame
df_check = pd.DataFrame(check_data)

# Take the 'review' column as a Series and display it
prov = df_check['review']
prov

0    @lapcat need to send 'em to my accountant tomo...
1    <html><body><p> Movie 1</p><p> Actor - Aamir K...
2    Check out my notebook https://www.kaggle.com/c...
3                                  IMHO he is the best
4             FYI Islamabad is the capital of Pakistan
5    ceertain conditionas duriing seveal ggeneratio...
6    probably my all-time favorite movie, a story o...
7                            Loved the movie. It was 😘
8                            walk walks walking walked
9    He was running and eating at same time. He has...
Name: review, dtype: object

In [None]:
# Run preprocessing() on the hand-written samples and display the result for inspection.
# NOTE(review): the saved output shows stemmed tokens (e.g. "movi", "capit") although
# use_stemm is False in the flags cell -- re-run this cell so the output matches the flags.
prov_check = prov.apply(preprocessing)
prov_check

0    (lancet, need, send, em, account, tomorrow, od...
1    (htmlbodyp, movi, 1pp, actor, amir, khan, clic...
2                                    (check, notebook)
3            (in, my, honest, /, humbl, opinion, best)
4      (for, your, inform, islamabad, capit, pakistan)
5       (certain, condit, sever, gener, modifi, mater)
6    (probabl, alltim, favorit, movi, stori, selfle...
7              (love, movi, :, face_blowing_a_kiss, :)
8                             (walk, walk, walk, walk)
9    (run, eat, tear, eye, bad, habit, swim, play, ...
Name: review, dtype: object