In [33]:
import re
import csv 
import math
import random
import warnings

import numpy as np
import pandas as pd

from tqdm import tqdm
from google.colab import files

!pip install bert-for-tf2
!pip install sentencepiece 

import bert
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

from tensorflow.keras import layers 
from tensorflow.keras.datasets import imdb
from keras.layers.embeddings import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Embedding, Flatten, GlobalMaxPool1D, Dense, Dropout
from keras.layers.convolutional import MaxPooling1D, Conv1D



 # Challenge
 
The Challenge will be to identify the sentiment in the text in the imdb_reviews. The data imdb_reviews for the training has to be taken from tensorflow tfds

In [3]:
warnings.filterwarnings('ignore')

pd.set_option('display.max_colwidth', None)

# Load traindata 

In [4]:
vocab_size = None # We take all the words 
index_from = 3

(X_train, y_train), (X_valid, y_valid) = imdb.load_data(num_words=vocab_size,
                                                      index_from=index_from)

## Exploring traindata

In [5]:
# IMDB contains 25,000 movie reviews
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape 

((25000,), (25000,), (25000,), (25000,))

In [6]:
 # The sequences have different sizes
 len(X_train[0]),  len(X_train[2]), len(X_valid[34])

(218, 141, 133)

In [7]:
 # The review has been encoded as a sequence of integers
 X_train[0][:5] 

[1, 14, 22, 16, 43]

In [8]:
# Classes 0: negative review, 1: positive review.
np.unique(y_train)

array([0, 1])

In [9]:
# Restore original text from Keras’s imdb dataset

# imdb.get_word_index: dictionary that provides the word of the corresponding integer
# word_to_index      : dictionary that links each word to a unique integer

word_to_index = {k: (v + index_from) for k, v in imdb.get_word_index().items()}
word_to_index["<PAD>"]    = 0 # O is usually the padding character.
word_to_index["<START>"]  = 1 # Start of the sequence 
word_to_index["<UNK>"]    = 2 # Unknown word
word_to_index["<UNUSED>"] = 3

# Reversing the dictionary. 
index_to_word = dict(map(reversed, word_to_index.items()))

df_train = pd.DataFrame(data=list(zip(y_train, X_train)), columns=["labels", "vectors"])

df_train['reviews'] = df_train.vectors.apply(lambda sent: ' '.join([index_to_word[indx] \
                                                                 for indx in sent if indx in index_to_word.keys()]))

# Shuffle the data
df_train = df_train.sample(frac=1)

df_train.head(1)


Unnamed: 0,labels,vectors,reviews
12769,0,"[1, 6, 801, 11138, 2284, 10119, 11513, 267, 40, 6, 1665, 200, 17590, 15224, 11, 28238, 5, 430, 742, 214, 60, 19, 84, 59, 764, 28, 11135, 41, 19, 4, 339, 7, 35, 154, 2368, 2915, 15, 59, 659, 11, 4, 162, 313, 59, 5, 41, 1633, 189, 19102, 4115, 328, 4, 64, 2602, 52, 155, 44, 14, 4995, 818, 4, 33835, 184, 5006, 78, 262, 54, 10119, 22053, 19, 4, 3558, 2915, 5, 50, 26, 57, 2629, 42, 836, 8, 30, 69, 14, 22, 5346, 10031, 1346, 18, 342, 2289, 257, 1076, 53, 2526, 74, 4, 236, 84, 267, ...]",<START> a typical goth chick rainbow harvest looking like a cross between winona ryder in beetlejuice and boy george gets even with people she feels have wronged her with the help of an old haunted mirror that she finds in the new house she and her mom horror mainstay karen black the only remotely good thing about this travesty buy the acting's pretty laughably bad especially when rainbow interacts with the aforementioned mirror and there are no scares or suspense to be had this film inexplicably spawned thus for 3 sequels each slightly more atrocious than the last people looking for a similarly themed but far superior cinematic endeavor would be well advised to just search out the episode of friday the 13th the series where a geeky girl finds an old cursed compact mirror that packs more chills in it's scant 40 minutes than this whole franchise has provided across it's 4 films br br my grade d br br eye candy charlie spradling provides the obligatory t a


In [10]:
# List of unique words in our corpus
list_unique_words = list(df_train.reviews.str.split(' ', expand=True).stack().unique())
print(f"Vocabulary: {len(list_unique_words)}")

vocab_size = len(list_unique_words) + 100 if vocab_size is None else vocab_size

vocab_size, len(imdb.get_word_index())

Vocabulary: 88585


(88685, 88584)

In [11]:
# Non-alphanumeric characters & digits

non_alphanumetique, degits = set(), set()

for review in df_train.reviews:
    non_alphanumetique |= set(re.findall(r'\W', review))   
    degits |= set(re.findall(r'\d+', review))   

print(non_alphanumetique)
print(degits)

{'\x91', '…', '\x96', '\x97', '–', '§', '¨', '®', '‘', '\xa0', '\xad', '>', '´', '\uf0b7', '¦', '«', '¤', '°', '\x9e', '\x8e', '\x8d', ' ', '·', '\x08', '’', '“', '\x80', "'", '¡', '\x84', '»', '\x85', '£', '₤', '\x10', '¢', '¿', '\x9a', '<', '”', '\x95'}
{'99', '1000000', '666', '153', '1813', '1201', '00817', '142', '1929', '62', '1958', '1940', '350', '1861', '1415', '52', '087', '6000', '1000', '139', '73', '502', '5250', '24', '26', '21849907', '280', '998', '1937', '1894', '203', '428', '262', '230', '94', '5', '53', '1832', '78', '02', '225', '25', '2019', '223', '32', '1900', '1994', '61', '409', '1939', '31', '1146', '974', '1971', '2050', '1986', '1820', '1927', '1945', '171', '330', '1948', '75', '442', '2009', '1987', '5000', '3', '34', '214', '75054', '64', '735', '1992', '231', '2', '21849889', '4000', '123', '2010', '65', '1408', '8230', '1547', '420', '103', '571', '00015', '197', '63', '1809', '19', '83', '5400', '56', '1897', '1889', '15', '020410', '1', '1989', '7600

## Create the dataset for the trainset

In [12]:
# pad dataset to a maximum review length in words
# we will fill the unused word slots with zeros
max_words = 600
X_train = sequence.pad_sequences(df_train.vectors, value=word_to_index["<PAD>"], maxlen=max_words)
X_valid = sequence.pad_sequences(X_valid, value=word_to_index["<PAD>"], maxlen=max_words)

## Load testdata

In [14]:
def preprocessing(df, usecols=[], nb=5, verbose=True):

    if verbose: 
        print(f"{df.columns}")
        print(f"DF before:\n{df.head(nb)}\n")
    
    non_charac, digits = set(), set()
    for x in df[usecols[0]]:
        non_charac |= set(re.findall(r'\W', x))   
        digits     |= set(re.findall(r'\d+', x))       

    # Remove the Non-alphanumeric
    if verbose: 
        print(f"Remove all these non-alphanumeric characters :\n{non_charac}")
    df['preprocessed_text'] = df[usecols[0]].apply(lambda x: re.sub(r'\W', ' ', x))
    
    # Lower case
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: x.lower())

    # Start
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: re.sub(r'^b ', '<START> ', x))
       
    # Remove the extra spaces at the middle, the beginning and the end of the text    
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: re.sub(r'\s+', ' ', x))        
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: re.sub(r'(^\s+|\s+$)', ' ', x)) 

    df['vectors'] = df['preprocessed_text'].apply(lambda sentence: [word_to_index[word] for word in sentence.split() \
                                                                    if word in word_to_index.keys()] ) 
    
    if verbose: 
        print(f"\n{df.columns}")
        print(f"DF after:\n{df.head(nb)}\n")
    
    return df

In [15]:
df_test_raw = pd.read_csv("/content/testSentimentDataforValidation.csv")

df_test = preprocessing(df_test_raw.copy(), usecols=['sentences'], nb=1, verbose=True)

Index(['Id', 'sentences'], dtype='object')
DF before:
   Id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [16]:
# pad dataset to a maximum review length in words
# we will fill the unused word slots with zeros
max_words = 600
X_test = sequence.pad_sequences(df_test.vectors, value=word_to_index["<PAD>"], maxlen=max_words)

## Build the model

In [17]:
# create the model

model = Sequential()

model.add(Embedding(vocab_size, 32, input_length=max_words))
model.add(Conv1D(64, 2, padding='same', activation='relu'))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 600, 32)           2837920   
_________________________________________________________________
module_wrapper (ModuleWrappe (None, 600, 64)           4160      
_________________________________________________________________
module_wrapper_1 (ModuleWrap (None, 600, 32)           6176      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 250)               8250      
_________________________________________________________________
dropout (Dropout)            (None, 250)               0

# Fitting the model

In [18]:
# Fit the model

model.fit(X_train, df_train.labels, validation_data=(X_valid, y_valid), epochs=10, batch_size=128, verbose=1)

# Final evaluation of the model
scores = model.evaluate(X_valid, y_valid, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 87.59%


# Testing the data

In [19]:
# Final evaluation of the model
predictions = model.predict(X_test, verbose=0)

In [20]:
print(f"Nb negative reviews: {(np.floor(predictions * 2) == 0).sum()}")
print(f"Nb positif reviews:{(np.floor(predictions * 2) == 1).sum()}")

Nb negative reviews: 5368
Nb positif reviews:4360


In [21]:
df_test["predictions"] = np.floor(predictions * 2)


In [82]:
(np.floor(predictions * 2) == 0).sum(), (np.floor(predictions * 2) == 1).sum()

(5127, 2416)

In [37]:
df_test.head(1)

Unnamed: 0,Id,sentences,preprocessed_text,vectors,predictions
0,1,"b""A blackly comic tale of a down-trodden priest, Nazarin showcases the economy that Luis Bunuel was able to achieve in being able to tell a deeply humanist fable with a minimum of fuss. As an output from his Mexican era of film making, it was an invaluable talent to possess, with little money and extremely tight schedules. Nazarin, however, surpasses many of Bunuel's previous Mexican films in terms of the acting (Francisco Rabal is excellent), narrative and theme.<br /><br />The theme, interestingly, is something that was explored again in Viridiana, made three years later in Spain. It concerns the individual's struggle for humanity and altruism amongst a society that rejects any notion of virtue. Father Nazarin, however, is portrayed more sympathetically than Sister Viridiana. Whereas the latter seems to choose charity because she wishes to atone for her (perceived) sins, Nazarin's whole existence and reason for being seems to be to help others, whether they (or we) like it or not. The film's last scenes, in which he casts doubt on his behaviour and, in a split second, has to choose between the life he has been leading or the conventional life that is expected of a priest, are so emotional because they concern his moral integrity and we are never quite sure whether it remains intact or not.<br /><br />This is a remarkable film and I would urge anyone interested in classic cinema to seek it out. It is one of Bunuel's most moving films, and encapsulates many of his obsessions: frustrated desire, mad love, religious hypocrisy etc. In my view 'Nazarin' is second only to 'The Exterminating Angel', in terms of his Mexican movies, and is certainly near the top of the list of Bunuel's total filmic output.""",<START> a blackly comic tale of a down trodden priest nazarin showcases the economy that luis bunuel was able to achieve in being able to tell a deeply humanist fable with a minimum of fuss as an output from his mexican era of film making it was an invaluable talent to possess with little money and extremely tight schedules nazarin however surpasses many of bunuel s previous mexican films in terms of the acting francisco rabal is excellent narrative and theme br br the theme interestingly is something that was explored again in viridiana made three years later in spain it concerns the individual s struggle for humanity and altruism amongst a society that rejects any notion of virtue father nazarin however is portrayed more sympathetically than sister viridiana whereas the latter seems to choose charity because she wishes to atone for her perceived sins nazarin s whole existence and reason for being seems to be to help others whether they or we like it or not the film s last scenes in which he casts doubt on his behaviour and in a split second has to choose between the life he has been leading or the conventional life that is expected of a priest are so emotional because they concern his moral integrity and we are never quite sure whether it remains intact or not br br this is a remarkable film and i would urge anyone interested in classic cinema to seek it out it is one of bunuel s most moving films and encapsulates many of his obsessions frustrated desire mad love religious hypocrisy etc in my view nazarin is second only to the exterminating angel in terms of his mexican movies and is certainly near the top of the list of bunuel s total filmic output,"[1, 6, 34929, 700, 787, 7, 6, 180, 22005, 2325, 8766, 4, 8238, 15, 6045, 14111, 16, 502, 8, 2715, 11, 112, 502, 8, 376, 6, 1685, 22568, 9108, 19, 6, 4900, 7, 8602, 17, 35, 10564, 39, 27, 2662, 999, 7, 22, 231, 12, 16, 35, 17206, 676, 8, 6491, 19, 117, 278, 5, 576, 2706, 25786, 190, 9471, 111, 7, 14111, 590, 960, 2662, 108, 11, 1303, 7, 4, 116, 3698, 9, 321, 1321, 5, 756, 10, 10, 4, 756, 5693, 9, 142, 15, 16, 4087, 174, 11, 93, 289, 153, 303, 11, 5001, 12, 3277, 4, 2267, ...]",1.0


## Save the model



In [32]:
df_test[['Id', 'predictions']].to_csv('results.csv') 

files.download('results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>