In [None]:
import numpy as np 
import pandas as pd 
import re
import sklearn
from decimal import Decimal
import nltk
from sklearn.feature_extraction import DictVectorizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D,Bidirectional
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob, Word
from langdetect import detect
from gensim.models import Word2Vec
from sentic import SenticPhrase
import warnings
# Install a global "ignore" filter with no category/message/module restriction,
# so every warning raised after this point in the session is suppressed.
# NOTE(review): this also hides deprecation warnings emitted by the libraries
# imported above — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


In [None]:
# Reading and cleaning the data, building per-review feature vectors and the
# train/test split consumed by the classifier cells below.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# ---- tunables -------------------------------------------------------------
# NOTE(review): machine-specific absolute path; point this at your own copy.
REVIEWS_CSV = r"C:\Users\jayap\Desktop\Revs\reviews.csv"
MAX_ROWS = 7000             # number of CSV rows to load
POLARITY_THRESHOLD = 0.5    # TextBlob polarity above this => label 1, else 0
SENTIC_DIM = 5              # width of the sentic slice of each feature vector
                            # (assumes sentic values + polarity — TODO confirm)

data = pd.read_csv(REVIEWS_CSV, keep_default_na=False, nrows=MAX_ROWS)
data = data["comments"]
data = data.apply(lambda x: x.lower())

# Three parallel lists: entry i of each one describes the same kept review.
data_new = []       # lower-cased English review text
tokens = []         # word tokens of that review
polarity_set = []   # binary label derived from TextBlob polarity
sentic_vecs = {}    # not used in this cell; kept for any later cell that reads it
skipped_reviews = 0
for review in data:
    try:
        if detect(review) != 'en':
            continue
        label = 1 if TextBlob(review).polarity > POLARITY_THRESHOLD else 0
        review_tokens = nltk.word_tokenize(review)
    except Exception:
        # Best-effort preprocessing: drop reviews that any step cannot handle,
        # but count them and let KeyboardInterrupt/SystemExit propagate.
        skipped_reviews += 1
        continue
    # Append only after every step succeeded so the three lists stay aligned.
    data_new.append(review)
    polarity_set.append(label)
    tokens.append(review_tokens)
print("Reviews skipped because preprocessing failed:", skipped_reviews)

# (word, tag) pairs per review, and the bare tag sequence per review.
pos_tags = [nltk.pos_tag(review_tokens) for review_tokens in tokens]
tags = [[pos for _, pos in tagged] for tagged in pos_tags]

# Creating the model and setting values for the various parameters
num_features = 100      # Word vector dimensionality
num_tag_features = 45   # POS-tag vector dimensionality
min_word_count = 1      # Minimum word count
num_workers = 4         # Number of parallel threads
context = 5             # Context window size
downsampling = 1e-3     # (0.001) Downsample setting for frequent words

# NOTE(review): `size=` is the keyword used by gensim < 4 (renamed to
# `vector_size` in gensim 4) — confirm against the installed version.
model = Word2Vec(tokens,
                 workers=num_workers,
                 size=num_features,
                 min_count=min_word_count,
                 window=context,
                 sample=downsampling,
                 negative=1)
tag_model = Word2Vec(tags,
                     workers=num_workers,
                     min_count=min_word_count,
                     size=num_tag_features,
                     negative=1)

print("Vocabulary")
print(model)

print("POS tag vocab")
print(tag_model)

# One feature vector per review: for every token concatenate
# [word embedding | POS-tag embedding | sentic values], then average over the
# review's tokens so train_vecs has exactly one row per entry of polarity_set.
train_vecs = []
sentic_dict = DictVectorizer(sparse=False)
for review, tagged in zip(data_new, pos_tags):
    sp = SenticPhrase(review)
    token_vecs = []
    for word, pos in tagged:
        sentic = {}
        sentic.update(sp.get_sentics(word))
        sentic.update({'polarity': sp.get_polarity(word)})
        sent = sentic_dict.fit_transform(sentic)
        if sent.size > 1:
            # Keep the values numeric so the concatenated vector is a float array.
            senti = np.asarray(sent[0], dtype=float)
        else:
            # Only one (or no) value came back for this word: use an all-zero slice.
            senti = np.zeros(SENTIC_DIM)
        token_vecs.append(np.concatenate([model.wv[word], tag_model.wv[pos], senti]))
    if token_vecs:
        train_vecs.append(np.mean(token_vecs, axis=0))
    else:
        # A review without tokens still needs a row to stay aligned with its label.
        train_vecs.append(np.zeros(num_features + num_tag_features + SENTIC_DIM))

# From here on `tokens` is the set of distinct words seen across all kept reviews.
tokens = set(w for s in tokens for w in s)
max_features = len(tokens)

# converting to array type
train_vecs = np.asarray(train_vecs)
labels = np.asarray(polarity_set)

X_train, X_test, Y_train, Y_test = train_test_split(
    train_vecs, labels, test_size=0.25, random_state=42)
# Fit the scaler on the training rows only, then reuse it for the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [None]:
# Feed-forward binary classifier: four 50-unit ReLU layers followed by a single
# sigmoid output, trained with Adam on binary cross-entropy.
# The input width is read from X_train (built in the previous cell) instead of
# being hard-coded, so it follows any change to the feature construction.
hidden_layer_kwargs = dict(units=50, kernel_initializer='uniform', activation='relu')

classifier = Sequential([
    Dense(input_dim=X_train.shape[1], **hidden_layer_kwargs),
    Dense(**hidden_layer_kwargs),
    Dense(**hidden_layer_kwargs),
    Dense(**hidden_layer_kwargs),
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the classifier on the scaled training split, threshold the predicted
# probabilities at 0.5, and report the confusion matrix plus the test-set size.
from sklearn.metrics import confusion_matrix

classifier.fit(X_train, Y_train, batch_size=10, epochs=100)

# Boolean predictions: True where the sigmoid output exceeds 0.5.
y_pred = classifier.predict(X_test) > 0.5

cm = confusion_matrix(Y_test, y_pred)
print(cm)
print(len(X_test))