# Train a sentiment analysis model with Layer

[![Open in Layer](https://development.layer.co/assets/badge.svg)](https://development.layer.co/layer/sentiment-analysis) [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/layerai/examples/blob/main/sentiment-analysis/sentiment_analysis.ipynb) [![Layer Examples Github](https://badgen.net/badge/icon/github?icon=github&label)](https://github.com/layerai/examples/tree/main/sentiment-analysis)

In this project we train a sentiment analysis model using recurrent neural networks (RNNs) in TensorFlow.

In [None]:
!pip install layer --upgrade

In [None]:
!pip install -U ipython

In [None]:
!layer --version

Layer, version 0.9.340226


In [None]:
import os

# Kaggle API credentials, read from the environment by the `kaggle` package.
# Replace the placeholders with your own values, obtained from
# https://www.kaggle.com/username/account
os.environ.update({
    "KAGGLE_USERNAME": "KAGGLE_USERNAME",
    "KAGGLE_KEY": "KAGGLE_KEY",
})

In [None]:
import kaggle

In [None]:
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
import zipfile

# Unpack the downloaded Kaggle archive into a folder named after the dataset.
with zipfile.ZipFile('imdb-dataset-of-50k-movie-reviews.zip', 'r') as archive:
    archive.extractall('imdb-dataset-of-50k-movie-reviews')

In [None]:
import layer
layer.login()

In [None]:
# Initialize the Layer project used by this notebook
layer.init("sentiment-analysis")

In [None]:
from layer.decorators.assertions import assert_true, assert_valid_values, assert_not_null, assert_unique
from layer.decorators import dataset, model,resources, fabric, pip_requirements
from layer.client import Dataset

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numpy import array
import tensorflow as tf
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


In [None]:
data_file = 'imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'

In [None]:
test = pd.read_csv(data_file)

In [None]:
test.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
@dataset('imdb-dataset-of-50k-movie-reviews')
@resources(data_file)
def read_movies():
    """Load the raw IMDB reviews CSV at ``data_file`` and return it as a DataFrame.

    The CSV path is also passed to ``@resources`` and the result is
    registered as the ``imdb-dataset-of-50k-movie-reviews`` dataset.
    """
    return pd.read_csv(data_file)

In [None]:
# Read and save the reviews using Layer infra
layer.run([read_movies])

Output()

Run(project_name='sentiment-analysis')

In [None]:
# Per-process cache so the NLTK corpus is located/downloaded and read only
# once, instead of once per review when the function is used with `.apply`.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, fetching them once if needed."""
    if "english" not in _STOP_WORDS_CACHE:
        # Imported lazily so callers that pass their own stop words never need NLTK.
        import nltk
        from nltk.corpus import stopwords

        # Also search the working directory, without growing the path on every call.
        if "." not in nltk.data.path:
            nltk.data.path.append(".")
        try:
            words = stopwords.words('english')
        except LookupError:
            # Corpus is not installed yet: download it once and retry.
            nltk.download('stopwords')
            words = stopwords.words('english')
        _STOP_WORDS_CACHE["english"] = frozenset(words)
    return _STOP_WORDS_CACHE["english"]


def remove_stop_words(review, stop_words=None):
    """Return ``review`` with stop words removed.

    The review is split on whitespace, words found in ``stop_words`` are
    dropped, and the remaining words are joined with single spaces.
    Matching is case-sensitive and exact (punctuation attached to a word is
    part of that word).

    Args:
        review: The review text.
        stop_words: Optional iterable of words to drop. When omitted, NLTK's
            English stop-word list is used (downloaded on first use if it is
            not available locally).

    Returns:
        The cleaned review as a single string; empty if no words remain.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Hash-based membership keeps the per-word check O(1).
        stop_words = set(stop_words)
    return ' '.join(word for word in review.split() if word not in stop_words)

In [None]:
@dataset('clean_imdb-dataset-of-50k-movie-reviews')
@pip_requirements(packages=["nltk","scikit-learn"])
def clean_reviews():
    """Build the cleaned reviews dataset from the raw IMDB dataset.

    Stop words are stripped from every review with ``remove_stop_words`` and
    the ``sentiment`` column is replaced by its ``LabelEncoder`` encoding.

    Returns:
        A DataFrame with the cleaned ``review`` column and the encoded
        ``sentiment`` column.
    """
    reviews = layer.get_dataset(
        'layer/sentiment-analysis/datasets/imdb-dataset-of-50k-movie-reviews'
    ).to_pandas()
    reviews['review'] = reviews['review'].apply(remove_stop_words)
    encoded_sentiment = LabelEncoder().fit_transform(reviews["sentiment"])
    return reviews.assign(sentiment=encoded_sentiment)

In [None]:
# Clean the reviews and save the result using Layer infra
layer.run([clean_reviews])

Output()

Run(project_name='sentiment-analysis')

In [None]:
@fabric("f-medium")
@pip_requirements(packages=["tensorflow","keras"])
@model(name='imdb_data_tokenizer')
def save_tokenizer():
    """Fit a Keras ``Tokenizer`` on the training split of the cleaned reviews.

    The cleaned dataset is split 80/20 with ``random_state=0`` and only the
    training documents are used to fit the tokenizer. Words not seen during
    fitting map to the ``<OOV>`` token.

    Returns:
        The fitted tokenizer.
    """
    from numpy import array
    from sklearn.model_selection import train_test_split
    from tensorflow.keras.preprocessing.text import Tokenizer

    reviews_df = layer.get_dataset(
        'layer/sentiment-analysis/datasets/clean_imdb-dataset-of-50k-movie-reviews'
    ).to_pandas()
    sentiments = array(reviews_df['sentiment'])
    # The labels are passed through the split unchanged; only the training
    # documents are used below.
    train_docs, _test_docs, _train_labels, _test_labels = train_test_split(
        reviews_df['review'], sentiments, test_size=0.20, random_state=0
    )
    fitted_tokenizer = Tokenizer(oov_token="<OOV>")
    fitted_tokenizer.fit_on_texts(train_docs)
    return fitted_tokenizer

In [None]:
layer.run([save_tokenizer])

In [None]:
# # Run the clean_reviews function locally and save the output to Layer infra
# nltk.download('stopwords')
# reviews = clean_reviews()

In [None]:
# reviews.head()

In [None]:
@fabric("f-gpu-small")
@pip_requirements(packages=["tensorflow","keras"])
@model(name='tensorflow-sentiment-analysis')
def train():
    """Train and return a bidirectional-LSTM sentiment classifier.

    Loads the cleaned reviews dataset and the fitted ``imdb_data_tokenizer``
    from Layer, converts the reviews to integer sequences padded/truncated
    to a fixed length, trains the network, and logs training and validation
    loss/accuracy to Layer.

    Returns:
        The trained Keras ``Sequential`` model.
    """
    # Imported locally, as save_tokenizer does, so the function does not
    # depend on names that happen to be imported at notebook level.
    import tensorflow as tf
    from numpy import array
    from sklearn.model_selection import train_test_split
    from tensorflow.keras.layers import Bidirectional, Dense, Embedding, LSTM
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # Single source of truth for the sequence length: used both for padding
    # and as the Embedding input length.
    max_length = 512
    batch_size = 32
    epochs = 1

    df = layer.get_dataset('layer/sentiment-analysis/datasets/clean_imdb-dataset-of-50k-movie-reviews').to_pandas()
    docs = df['review']
    labels = array(df['sentiment'])
    # Same test_size and random_state as the split in save_tokenizer.
    X_train, X_test, y_train, y_test = train_test_split(docs, labels, test_size=0.20, random_state=0)

    tokenizer = layer.get_model('layer/sentiment-analysis/models/imdb_data_tokenizer').get_train()

    def to_padded(texts):
        """Convert texts to integer sequences, post-padded/truncated to max_length."""
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

    X_train_padded = to_padded(X_train)
    X_test_padded = to_padded(X_test)

    tf.random.set_seed(0)
    training_data = tf.data.Dataset.from_tensor_slices((X_train_padded, y_train)).batch(batch_size)
    validation_data = tf.data.Dataset.from_tensor_slices((X_test_padded, y_test)).batch(batch_size)

    # NOTE(review): early stopping watches the training loss even though
    # validation data is supplied; confirm that 'val_loss' is not intended.
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

    # +1 on top of the tokenizer's word index size for the Embedding vocabulary.
    vocab_size = len(tokenizer.word_index) + 1
    model = Sequential([
        Embedding(vocab_size, 64, input_length=max_length),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(64)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    model.fit(training_data, epochs=epochs, verbose=1, validation_data=validation_data, callbacks=[callback])

    loss, accuracy = model.evaluate(training_data, verbose=1)
    layer.log({"accuracy":accuracy})
    layer.log({"loss":loss})
    print('Training Accuracy is {}'.format(accuracy*100))

    loss, accuracy = model.evaluate(validation_data)
    layer.log({"validation_accuracy":accuracy})
    layer.log({"validation_loss":loss})
    print('Testing Accuracy is {} '.format(accuracy*100))
    return model

In [None]:
# Train on Layer infra
layer.run([train])

In [None]:
# Train on your own infra
train()

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import array
import numpy as np
import layer

# Classify a single review with the tokenizer and classifier stored in Layer.
# NOTE: the training data had stop words removed in clean_reviews; this
# review is fed to the model as-is.
review = "That was such a horrible movie, I hated it."
tokenizer = layer.get_model('layer/sentiment-analysis/models/imdb_data_tokenizer').get_train()
# Bound to `sentiment_model` rather than `model`, so the `model` decorator
# imported from layer.decorators is not overwritten.
sentiment_model = layer.get_model('layer/sentiment-analysis/models/tensorflow-sentiment-analysis')
classifier = sentiment_model.get_train()

# texts_to_sequences expects a list of texts. A bare string would be iterated
# character by character, so the review is wrapped in a one-element list.
X_test_sequences = tokenizer.texts_to_sequences([review])
padding_type = "post"
truncation_type = "post"
max_length = 512  # must match the sequence length used in train()
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding=padding_type,
                              truncating=truncation_type)
# Batch of one padded sequence for predict().
test_data = np.expand_dims(X_test_padded[0], axis=0)
prediction = classifier.predict(test_data)
# The classifier ends in a single sigmoid unit; scores above 0.5 are
# reported as positive.
if prediction[0][0] > 0.5:
    print("Is positive")
else:
    print("Is negative")