### D213 - Advanced Data Analytics - PA2

### Background Info:

**Build a neural network designed to learn word usage and context using NLP techniques.**

_You will provide visualizations and a report, as well as build your network in an interactive development environment._

**A1 _Question: Using historical product reviews, what effect will Natural Language Processing and Neural Networks have on the ability to accurately predict future consumer sentiment?_**

--- 
### Import Libraries
---

In [241]:
from platform import python_version
print('The python version used is: %s' % python_version())

The python version used is: 3.7.13


In [242]:
# Standard libraries

import numpy as np # Array manipulation
import pandas as pd # DataFrame manipulation

# Plotting
import seaborn as sns # for plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Local File Manipulation
import sys
import os # System and OS commands
import re # regex
import gzip # for reading compressed files

# Warnings
import warnings
warnings.filterwarnings('ignore')
# Warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# Timer
%time
%timeit

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [243]:
# Analytics Library Imports

from scipy import signal 
import sklearn # Predictive Analytics
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import model_selection
from sklearn.model_selection import train_test_split

In [244]:
# TensorFlow and Keras

import tensorflow as tf # ML / Deep Learning
import keras # Deep Learning API
from keras import preprocessing

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk # Natural Language Processing
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jasonewillis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasonewillis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jasonewillis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jasonewillis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [245]:
# %lsmagic

In [246]:
# Verify Input File Locations
print(os.listdir(path='./Data/sentiment_labelled_sentences/'))
# Output: ['.DS_Store', 'combo.txt', 'amazon_cells_labelled.txt', 'readme.txt', 'yelp_labelled.txt', 'imdb_labelled.txt']

['.DS_Store', 'combo.txt', 'amazon_cells_labelled.txt', 'readme.txt', '.ipynb_checkpoints', 'yelp_labelled.txt', 'imdb_labelled.txt']


---
### Load Data From UCI *.txt Files to DFs
---

In [247]:
# load data file
df_amz = pd.read_csv('./Data/sentiment_labelled_sentences/amazon_cells_labelled.txt', delimiter='\t', header=None)
df_amz.columns = ['review', 'sentiment']

df_imdb = pd.read_csv('./Data/sentiment_labelled_sentences/imdb_labelled.txt', delimiter='\t', header=None)
df_imdb.columns = ['review', 'sentiment']

df_yelp = pd.read_csv('./Data/sentiment_labelled_sentences/yelp_labelled.txt', delimiter='\t', header=None)
df_yelp.columns = ['review', 'sentiment']

In [248]:
# Concatinate and Read in Combo File to DF

df_concat = pd.concat([df_amz, df_imdb, df_yelp], axis=0)
df = df_concat.copy()

---
### Data Cleaning & Exploratory Data Analysis
---

In [249]:
print("******"*5)
print("* DataFrame Shape: ", df.shape)
print("******"*5)
pd.set_option('display.max_colwidth', 5000) # set max width
df.head(-10)

******************************
* DataFrame Shape:  (2748, 2)
******************************


Unnamed: 0,review,sentiment
0,So there is no way for me to plug it in here in the US unless I go by a converter.,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,0
4,The mic is great.,1
...,...,...
985,The problem I have is that they charge $11.99 for a sandwich that is no bigger than a Subway sub (which offers better and more amount of vegetables).,0
986,Shrimp- When I unwrapped it (I live only 1/2 a mile from Brushfire) it was literally ice cold.,0
987,"It lacked flavor, seemed undercooked, and dry.",0
988,It really is impressive that the place hasn't closed down.,0


In [250]:
# Character Count
commentary = df['review']
list_of_chars = []
for comment in commentary:
    for character in comment:
        if character not in list_of_chars:
            list_of_chars.append(character)
num_of_chars = len(list_of_chars)
print("Number of Characters: ", num_of_chars)
print(list_of_chars) # Notice the presence of unusual characters (B1a)

Number of Characters:  91
['S', 'o', ' ', 't', 'h', 'e', 'r', 'i', 's', 'n', 'w', 'a', 'y', 'f', 'm', 'p', 'l', 'u', 'g', 'U', 'I', 'b', 'c', 'v', '.', 'G', 'd', ',', 'E', 'x', 'j', 'T', '4', '5', 'M', 'A', 'J', 'O', 'R', 'P', 'B', 'L', '!', 'z', 'N', 'W', 'q', 'H', '+', 'V', '"', 'Y', 'D', 'F', 'k', "'", 'K', 'C', '/', '7', '3', '6', '8', '0', '2', '?', 'Z', '-', '1', ':', ')', '(', 'Q', '&', '$', '*', ';', 'X', '%', '9', '#', '[', ']', '\x96', '\t', '\n', 'é', '\x85', 'å', '\x97', 'ê']


In [251]:
# B2 Tokenization
#tokenize = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
#tokenize.fit_on_text(X_train)
#word_index = tokenizer.word_index
#print(word_index)

In [252]:
# Remove Stop Words
review_df = df.copy()
description_list = []
stop_word_list = stopwords.words('english')
for description in review_df.review:
    description = re.sub("[^a-zA-Z]","", description)

# Convert to Lower Case
description.lower()

# Tokenization
description=nltk.word_tokenize(description)

# Perform Lemmatization
lemma = nltk.WordNetLemmatizer()
description = [lemma.lemmatize(word) for word in description]

# Removing stopwords
description = [word for word in description if not word in stop_word_list]
description = "".join(description)
description_list.append(description)

description_list = []
stop_word_list = stopwords

In [253]:
# Vocab Size (B1b)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_df['review'])
print('Vocab Size: ', len(tokenizer.word_index)+1)

Vocab Size:  5272


In [254]:
# Use Keras Tokenizer to Filter Data and Quantify Vocab Size
keras_token = Tokenizer(filters='\t\n})~!|#*=&><:;+.{/,(?$^%_-`@)[\\]', oov_token = '[UNK]', lower = True)

#Fit Commentary Dict Based on Frequency
keras_token.fit_on_texts(commentary)

word_count = keras_token.word_index # Create Dict Var
print('Size of data vocab: ', len(word_count)+1)
list(word_count.items())[2015:2022] # Random Sample


Size of data vocab:  5332


[('typical', 2016),
 ('sci', 2017),
 ('revealing', 2018),
 ('africa', 2019),
 ('process', 2020),
 ('shakespear', 2021),
 ('macbeth', 2022)]

In [255]:
max_sequence_embedding = int(round(np.sqrt(np.sqrt(vocab_size)),0))
max_sequence_embedding

11

In [256]:
# Convert Sentiment Scores
sentiment_label = df.sentiment.factorize()
sentiment_label

(array([0, 1, 1, ..., 0, 0, 0]), Int64Index([0, 1], dtype='int64'))

In [257]:
# Apply Tokenizer and Pad to max length
reviews = df.review.values
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(reviews)
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(encoded_docs, maxlen=200) # max length of 200

In [258]:
print(tokenizer.word_index)



In [259]:
# Reviews Encoded and Padded
print(reviews[0])
print(encoded_docs[0])

So there is no way for me to plug it in here in the US unless I go by a converter.
[27, 50, 5, 58, 118, 12, 72, 7, 371, 6, 11, 66, 11, 1, 188, 579, 3, 77, 62, 4, 2267]


In [260]:
print(padded_sequences[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0   27   50    5
   58  118   12   72    7  371    6   11   66   11    1  188  579    3
   77 

In [261]:
commentary_length = []
for char_len in commentary:
    commentary_length.append(len(char_len.split(' ')))

# Set max, min, median vars
commentary_max = np.max(commentary_length)
commentary_min = np.min(commentary_length)
commentary_median = np.median(commentary_length)

print("The minimum length or our sequences is: ", commentary_min)
print("The median length or our sequences is: ", commentary_median)
print("The maximum length or our sequences is: ", commentary_max)

The minimum length or our sequences is:  1
The median length or our sequences is:  11.0
The maximum length or our sequences is:  1393


In [262]:
vocab_size = 15000
oov_tok = "<oov>"
embedding_dim = 16
max_length = 50
trunc_type='post'
padding_type='post'

In [263]:
X = padded_sequences
y = np.array(df[['sentiment']])
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state = 40,
                                                    test_size = 0.20,  
                                                    stratify = y)

X_train, X_validation, y_train, y_validation = train_test_split(X_train, 
                                                    y_train, 
                                                    random_state = 40,
                                                    test_size = 0.20,  
                                                    stratify = y_train)

In [264]:
# Verify Shapes
print(X_train.shape, X_test.shape, X_validation.shape)

(1758, 200) (550, 200) (440, 200)


In [265]:
# Export padded train/test data
pd.DataFrame(X_train).to_csv('X_training_df.csv')
pd.DataFrame(X_test).to_csv('X_testing_df.csv')
pd.DataFrame(y_train).to_csv('y_training_df.csv')
pd.DataFrame(y_test).to_csv('y_testing_df.csv')
pd.DataFrame(X_validation).to_csv('X_validation_df.csv')
pd.DataFrame(y_validation).to_csv('y_validation_df.csv')

In [266]:
# Train  --48:00
sequences_train = tokenizer.texts_to_sequences(X_train)
padded_train = pad_sequences(sequences_train, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Test
sequences_test = tokenizer.texts_to_sequences(X_test)
padded_test = pad_sequences(sequences_test, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Display the Padded Sequence
np.set_printoptions(threshold=sys.maxsize)
padded_train[1]

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [None]:
# B4 - Sentiment Analysis
activation = 'softmax'
loss = 'categorical_crossentropy'
optimzer = 'adam' #rmsprop

num_epochs = 20

# Define early_stopping_monitor
early_stopping_monitor=EarlyStopping(patience=2)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(2, activation=activation)
])

model.compile(loss=loss, optimizer=optimzer, metrics=['accuracy'])

In [None]:
df.columns

In [None]:
# View Data Distro
sns.countplot(data=df, x='sentiment', palette='viridis').set(title='Sentiment Data Distribution');

## Clean Data

In [None]:
# Convert Sentiment Field to Int
df['sentiment'] = df['sentiment'].astype(int)
df.info()

In [None]:
# Drop any null columns
df = df.dropna()

# Any Null Values?
df.isnull().any()

### Check for Missing Values

In [None]:
# Mapping to view missing data...none present.
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis');

In [None]:
# Convert non-string values (if any)
for i in range (0, len(df)-1):
    if type(df.iloc[i] ['review'])!= str:
        count+=1
        df.iloc[i] ['review'] = str(df.iloc[i] ['review'])
        print("Converted Review(s): ", count)

In [None]:
# Tally Review Field Counts
df['sentiment'].value_counts() # pos/neg ~approx equal

## Train, Test, and Split

In [None]:
# Split for Training and Testing

X_train = df_stationary.loc[:'2020-09-30'] # Get all but the last 90 days for training
X_test = df_stationary['2020-10-01':] # Get last 90 days of data to test

print('Shape of X_train: ', X_train.shape)
print('Shape of X_test: ', X_test.shape)

## C5 - Prepared Dataset

In [None]:
# Export stationary data
pd.DataFrame(df_stationary).to_csv("df_cleaned_stationary.csv")

### Standard Error Metric: Train, Test and Actual Data

In [None]:
# Print mean absolute error
mae = np.mean(np.abs(a_model.resid))
mae_train = np.mean(np.abs(a_model_train.resid))
mae_test = np.mean(np.abs(a_model_test.resid))

print("Actual - Mean Absolute Error Data: ", mae)
print("Actual - Mean Absolute Error Training Data: ", mae_train)
print("Actual - Mean Absolute Error Test Data: ", mae_test)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse = sqrt(mean_squared_error(pred,X_test['Revenue']))
print("RMSE of test data: ", rmse)


A visualization that shows a true out-of-sample forecast over the test-set horizon, 
as well as the test-set actuals, is not readily evident. 

Please provide a chart that compares out-of-sample predictions to actuals.
out-of-sample = future dates

In [None]:
# Save model
#joblib.dump(model, "time_series_model.pkl")

# Terminal: nbconvert --to pdf D213_PA1.ipynb 

## End 