In [6]:
import pandas as pd
import warnings
import pickle
warnings.filterwarnings("ignore")
import xgboost as xgb
from sklearn.model_selection import train_test_split
#For NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# regular expressions and string constants (used for text cleaning)
import re, string
#data analysis (arrays, built-in functions)
import numpy as np
from itertools import chain
# Fetch the NLTK resources this notebook relies on. The value of the last
# call is what the cell displays as its output.
nltk.download('punkt')  # tokenizer models
nltk.download('wordnet')  # WordNet corpus (lemmatization)
nltk.download('stopwords')  # stop-word lists
nltk.download('averaged_perceptron_tagger')  # part-of-speech tagger

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

#### utils

In [7]:
def preprocess(text):
    """Normalize raw text into lowercase words separated by single spaces.

    Steps, in order: lowercase and trim; replace HTML-like tags with a space;
    replace ASCII punctuation with a space; delete any remaining character
    that is neither a word character nor whitespace; replace digits with a
    space; collapse whitespace runs and trim both ends.

    Parameters
    ----------
    text : str
        Raw text, possibly containing markup such as ``<br />``.

    Returns
    -------
    str
        Cleaned text with no leading/trailing whitespace and no digits,
        punctuation or tags.
    """
    text = text.lower().strip()
    # Tags become a space (not an empty string) so that the words on either
    # side of e.g. "<br />" are not glued together into one token.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # No separate "[12]"-style reference removal is needed: the brackets are
    # already gone after the punctuation step and digits are removed below.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', ' ', text)
    # Final collapse + strip so digit removal cannot leave stray edge spaces.
    return re.sub(r'\s+', ' ', text).strip()


# STOPWORD REMOVAL
def stopword(string):
    """Remove English stop words from a whitespace-separated string.

    Parameters
    ----------
    string : str
        Text whose tokens are separated by whitespace.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Matching against the
        stop-word list is exact (case-sensitive).
    """
    # Load the stop-word list once per call and keep it in a set; evaluating
    # stopwords.words('english') in the comprehension condition would reload
    # the list for every single token.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(tok for tok in string.split() if tok not in english_stopwords)
#LEMMATIZATION
# Shared WordNet lemmatizer instance; lemmatizer() below calls
# wl.lemmatize(...) on every token.
wl = WordNetLemmatizer()

# Helper that maps NLTK part-of-speech (POS) tags to WordNet POS constants
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the first character of ``tag`` is considered: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Every other tag (including an
    empty one) falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)
# Lemmatize each token of the sentence using its part-of-speech tag
def lemmatizer(string):
    """Lemmatize every token of ``string`` and return them space-joined.

    The text is tokenized and POS-tagged with NLTK; each token is then
    lemmatized with the WordNet POS derived from its tag.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        # Convert the tagger's tag to the POS constant the lemmatizer expects.
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

#tokenization
def tokenize(sentence):
    """Split ``sentence`` into a list of tokens with NLTK's word tokenizer."""
    tokens = nltk.word_tokenize(sentence)
    return tokens


def flatten_chain(matrix):
  """Flatten one level of nesting: an iterable of iterables -> a flat list."""
  flat = []
  for row in matrix:
    flat.extend(row)
  return flat


#Convert to bag of words
def bag_of_words(tokenized_sentence, words):
    """Encode a tokenized sentence as a binary presence vector over ``words``.

    Parameters
    ----------
    tokenized_sentence : iterable of hashable tokens
        Tokens of one sentence (duplicates are irrelevant).
    words : sequence
        Vocabulary; position ``i`` of the result corresponds to ``words[i]``.

    Returns
    -------
    numpy.ndarray
        float32 array of length ``len(words)`` holding 1 where the vocabulary
        word occurs in the sentence and 0 elsewhere.
    """
    # Set membership is O(1); testing against the token list directly would
    # rescan the whole sentence for every vocabulary word.
    present = set(tokenized_sentence)
    # initialize bag with 0 for each word
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in present:
            bag[idx] = 1

    return bag

# Chain all preprocessing steps into a single pipeline
def finalpreprocess(string):
    """Run the full text pipeline and return the resulting token list.

    Order: preprocess -> stopword -> lemmatizer -> tokenize.
    """
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    lemmatized = lemmatizer(without_stopwords)
    return tokenize(lemmatized)

#### Data Preprocessing

In [8]:
# Load the labelled reviews into a DataFrame; later cells read its `review`
# and `sentiment` columns.
# NOTE(review): the preview also shows an "Unnamed: 0" column, which looks
# like a row index written out with the CSV -- confirm, then consider
# index_col=0.
data = pd.read_csv("imdb_10K_sentimnets_reviews.csv")
# Preview the first rows (rendered as this cell's output)
data.head()

Unnamed: 0,review,sentiment
0,"Okay, I know this does'nt project India in a g...",1
1,Despite John Travolta's statements in intervie...,0
2,"I am a kung fu fan, but not a Woo fan. I have ...",1
3,He seems to be a control freak. I have heard h...,0
4,"Admittedly, there are some scenes in this movi...",1


In [9]:
# Vocabulary = every distinct token produced by the preprocessing pipeline.
# It is sorted because the iteration order of a set of strings differs from
# one interpreter run to the next; a fixed order keeps feature column i tied
# to the same word whenever this cell is re-run.
vocab_box = sorted(set(flatten_chain(data.review.apply(finalpreprocess))))
# Persist the vocabulary so inference can rebuild identical feature vectors
with open('vocab.pkl', 'wb') as f:
  pickle.dump(vocab_box, f)


In [10]:
# Clean every review into a token list, then encode each token list as a
# binary bag-of-words vector over vocab_box
data["processed_reviews"] = data["review"].apply(finalpreprocess)
data["bag_of_words"] = data["processed_reviews"].apply(bag_of_words, words=vocab_box)

#### Training

In [11]:
# Stack the per-review vectors into one 2-D feature frame in a single step;
# expanding them row by row with .apply(pd.Series) builds one Series per
# review and is far slower for the same result.
features = pd.DataFrame(np.vstack(data["bag_of_words"].tolist()), index=data.index)
# Fixed seed so the split, and therefore the reported accuracy, is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    features, data["sentiment"], test_size=0.2, random_state=42
)
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)
#save model (the with-block closes the file handle once the dump finishes)
with open("symantic_model.pkl", "wb") as model_file:
  pickle.dump(xgb_model, model_file)
#accuracy on the held-out 20%; left as the last expression so the cell shows it
accuracy = xgb_model.score(x_test, y_test)
accuracy

#### Inferences

In [15]:
#inferences
# Reload the saved artifacts; the with-blocks close the file handles.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook, never pickles from an untrusted source.
with open('vocab.pkl', 'rb') as vocab_file:
  vocab_box = pickle.load(vocab_file)
with open('symantic_model.pkl', 'rb') as model_file:
  model = pickle.load(model_file)
# Same preprocessing + encoding as in training, reshaped to one sample row
sentence = bag_of_words(finalpreprocess("kamran is a good person"), vocab_box).reshape(1,-1)
pred = model.predict(sentence)[0]
# Label 1 is reported as positive, anything else as negative
if pred == 1:
  print("positive")
else:
  print("negative")

positive
