This notebook was created and run in Google Colab.

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus, then keep the English list in `stopw`; the
# preprocessing cell further down filters review tokens against it.
nltk.download('stopwords')
stopw = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
from tensorflow import keras
import numpy as np

In [40]:
import tensorflow_hub as hub

In [10]:

# Load the three raw review corpora from CSV files under /content/drive.
# NOTE(review): no `drive.mount(...)` call is visible in this notebook — presumably
# Google Drive was mounted through the Colab UI; confirm before a fresh "Run all".
movie_rev_df = pd.read_csv('/content/drive/MyDrive/kaggle/IMDB Dataset.csv')
sentiment_df = pd.read_csv('/content/drive/MyDrive/kaggle/sentiment_analysis.csv')
book_rev_df = pd.read_csv('/content/drive/MyDrive/kaggle/Books_rating.csv')

In [30]:
# Show the column names of each raw frame to see which ones hold the text and the label/score.
movie_rev_df.columns, sentiment_df.columns, book_rev_df.columns

(Index(['review', 'sentiment'], dtype='object'),
 Index(['text', 'sentiment'], dtype='object'),
 Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
        'review/score', 'review/time', 'review/summary', 'review/text'],
       dtype='object'))

In [13]:
# Keep only the two columns used later. The name is rebound, so any other columns
# of the raw frame are only recoverable by re-reading the CSV.
sentiment_df = sentiment_df[['text', 'sentiment']]

In [33]:
def format_data(df, random_state=123):
  """Return a class-balanced, shuffled frame with only `text` and `label` columns.

  Rows with ``label == 1`` are treated as positive and rows with ``n_label == 1``
  as negative; rows matching neither are dropped. The larger class is randomly
  down-sampled to the size of the smaller one so the result is split evenly
  between the two classes. The shapes before and after balancing are printed.

  Args:
    df: DataFrame with `text`, `label` and `n_label` columns.
    random_state: Seed used for both the down-sampling and the final shuffle, so
      repeated calls on the same input return the same rows in the same order.

  Returns:
    DataFrame with columns `text` and `label`; original index labels are kept.

  Raises:
    KeyError: If any of the required columns is missing.
  """
  missing = {'text', 'label', 'n_label'} - set(df.columns)
  if missing:
    raise KeyError(f"format_data: missing required column(s): {sorted(missing)}")

  df_pos = df.loc[df['label'] == 1]
  df_neg = df.loc[df['n_label'] == 1]
  print(df_pos.shape, df_neg.shape)

  # Only the larger class is sampled; the smaller one is kept whole.
  n_per_class = min(len(df_pos), len(df_neg))
  if len(df_pos) > n_per_class:
    df_pos = df_pos.sample(n=n_per_class, random_state=random_state)
  elif len(df_neg) > n_per_class:
    df_neg = df_neg.sample(n=n_per_class, random_state=random_state)
  print(df_pos.shape, df_neg.shape)

  df_labeled = pd.concat([df_pos, df_neg])[['text', 'label']]
  # Seeded shuffle: without a seed the row order changed on every run.
  return df_labeled.sample(frac=1, random_state=random_state)

In [34]:
# Movie reviews: carry over the text and sentiment columns and derive 0/1 flags for
# the positive and negative classes, then balance the two classes with format_data.
m_df = pd.DataFrame({
  'text': movie_rev_df['review'],
  'sentiment': movie_rev_df['sentiment'],
  'label': movie_rev_df['sentiment'].eq('positive').astype(int),
  'n_label': movie_rev_df['sentiment'].eq('negative').astype(int),
})

m_df_labeled = format_data(m_df)
m_df_labeled.head()

(25000, 4) (25000, 4)
(25000, 4) (25000, 4)


Unnamed: 0,text,label
12357,It is fitting that the title character in Sydn...,0
45555,"OK, so the following review is more of a synop...",1
15079,It is quite rare that a movie comes along that...,0
9321,this a great Disney flick.it is the story of a...,1
12778,Both Disney and Bill Paxton did a fine job in ...,1


In [35]:
# Same labelling for the sentiment frame: 0/1 flags for 'positive' and 'negative'
# rows, then balance the two classes with format_data.
s_df = pd.DataFrame({
  'text': sentiment_df['text'],
  'sentiment': sentiment_df['sentiment'],
  'label': sentiment_df['sentiment'].eq('positive').astype(int),
  'n_label': sentiment_df['sentiment'].eq('negative').astype(int),
})

s_df_labeled = format_data(s_df)
s_df_labeled.head()

(166, 4) (134, 4)
(134, 4) (134, 4)


Unnamed: 0,text,label
63,"Always love everyone, love all animals. Most i...",1
349,"90 degrees, gross skies, and thunderstorms...p...",1
459,In Arch. Drawing. Checking out MVCC`s CAD degr...,1
180,Happy birthday! Just woke up on this side of ...,1
226,JONAS BROTHERS - Live to party. ...,1


In [36]:
# Book reviews: a score of 5 is flagged as positive and a score of 1 as negative;
# rows with a missing review text are removed first.
# The frame is built in one chain so the flag columns are added to a fresh object
# rather than assigned onto a boolean-filtered slice, which can trigger pandas'
# SettingWithCopyWarning.
b_df = (
  book_rev_df[['review/score', 'review/text']]
  .rename(columns={'review/score': 'rating', 'review/text': 'text'})
  .dropna(subset=['text'])
  .assign(
    label=lambda d: (d['rating'] == 5).astype(int),
    n_label=lambda d: (d['rating'] == 1).astype(int),
  )
)

b_df_labeled = format_data(b_df)
b_df_labeled.head()


(1807335, 4) (201688, 4)
(201688, 4) (201688, 4)


Unnamed: 0,text,label
2733436,Whatever advice you find here that is differen...,0
258480,This book has many important points that I am ...,1
404934,The Anatomy of Melancholy may be a worthwhile ...,0
1048947,I put off reading this book for quite a long t...,1
2055014,It is a story that will keep you on the edge o...,1


In [37]:
# Stack the three balanced frames into one corpus. Each frame keeps its own index
# labels (no ignore_index), so index values may repeat across sources.
data = pd.concat([b_df_labeled, m_df_labeled, s_df_labeled])

In [42]:
# Shuffle so rows from the three sources are interleaved. The fixed seed keeps the
# row order stable between runs; an unseeded shuffle here would change which rows
# the seeded train/test split selects.
data = data.sample(frac=1, random_state=123)
data.head()

Unnamed: 0,text,label
1672403,I own this deck and Kisma's two Faery-Faith bo...,1
1735864,I've always purchased the newest Grisham offer...,0
308532,"Homeopathy has, for much of the prior century,...",1
2480409,"Call It Sleep, the first novel of Henry Roth, ...",1
1877587,"The book Hell's Angels, by Hunter S. Thompson,...",1


In [43]:
#preprocessing
# Build the lookup structures once: a set gives constant-time stop-word checks
# (the list was scanned for every token), and the pattern is compiled a single time.
_stop_set = set(stopw)
_punct_re = re.compile(r"[,.:;@#?/!&$]+")

def clean_text(text):
  """Normalise one review string.

  Steps, in order: drop whitespace-separated tokens that contain a digit, replace
  runs of the characters , . : ; @ # ? / ! & $ with a single space, lower-case,
  then drop English stop words and re-join the remaining tokens with single spaces.
  """
  text = ' '.join(tok for tok in text.split() if not any(ch.isdigit() for ch in tok))
  text = _punct_re.sub(' ', text).lower()
  return ' '.join(tok for tok in text.split() if tok not in _stop_set)

# One pass over the column instead of four.
data['text'] = data['text'].apply(clean_text)

In [44]:
# Hold out 40% of the rows as the test split; random_state is fixed at 111.
train, test = train_test_split(data, test_size = 0.4, random_state = 111)

In [45]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Wrap the `text`/`label` columns of a frame in a batched, prefetched tf.data pipeline.

  Args:
    dataframe: DataFrame with a `text` column (inputs) and a `label` column (targets).
      It is not modified.
    shuffle: When true, shuffle with a buffer as large as the frame.
    batch_size: Number of examples per batch.

  Returns:
    A `tf.data.Dataset` of `(text, label)` batches.
  """
  features = dataframe['text']
  targets = dataframe['label']
  dataset = tf.data.Dataset.from_tensor_slices((features, targets))
  if shuffle:
    dataset = dataset.shuffle(buffer_size=len(dataframe))
  return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [46]:
# Training batches are shuffled. The held-out split is only scored, so it skips the
# shuffle and its full-size buffer; row order does not change the aggregated
# loss/accuracy.
train_data = df_to_dataset(train)
test_data = df_to_dataset(test, shuffle=False)

Creating Model

In [47]:
# Text-embedding layer loaded from the Kaggle-hosted model at the URL below; it takes
# scalar string inputs (input_shape=[], dtype=tf.string).
# NOTE(review): no `trainable` argument is passed, so hub.KerasLayer's default applies
# — confirm whether the embedding is meant to stay frozen during training.
hub_layer = hub.KerasLayer("https://kaggle.com/models/google/nnlm/frameworks/TensorFlow2/variations/en-dim128-with-normalization/versions/1",
                           input_shape=[], dtype=tf.string)

In [48]:
# Classifier stacked on `hub_layer`: two 16-unit ReLU layers, each followed by
# dropout at rate 0.3, then a single sigmoid unit for the binary label.
model = tf.keras.Sequential([
  hub_layer,
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dropout(0.3),
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dropout(0.3),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [49]:
# Binary cross-entropy matches the single sigmoid output; Adam runs at a learning rate of 0.003.
model.compile(optimizer = keras.optimizers.Adam(learning_rate=0.003), loss= tf.losses.BinaryCrossentropy() , metrics=['accuracy'])

In [50]:
# Score the model before any training as a baseline; returns [loss, accuracy].
model.evaluate(train_data)



[0.6944547891616821, 0.5009809732437134]

In [51]:
# Train for 5 epochs and keep the per-epoch metrics in `history`.
# NOTE(review): the test split doubles as validation data here, so no separate
# untouched hold-out set remains — confirm this is intended.
history = model.fit(train_data, epochs = 5, validation_data=test_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Write the trained model to a `.keras` file in the current working directory.
model.save('posneghub_3_legacy_Adam.keras')