# NLP: Recommendations and Sentiment Analysis


# Part 0: Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Part 1: Generate Recommendations from LDA Transformation

In this part we will transform a set of product descriptions using TfIdf and LDA topic modeling to generate product recommendations based on similarity in LDA space. 

## Load data and transform text using TfIDF

In [3]:
#Load the Data

# The dataset we'll be working with is a set of product descriptions from the JCPenney department store.

# Load product information from ../data/jcpenney-products_subset.csv.zip
df_jcp = pd.read_csv('../data/jcpenney-products_subset.csv.zip')
print(df_jcp.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name_title   5000 non-null   object
 1   description  5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB
None


In [4]:
# Print an Example

print(df_jcp['name_title'][0])
print('-'*50) 
print(df_jcp['description'][0])

Invicta® Sl Rally Mens Black Leather Strap Chronograph Watch 16012
--------------------------------------------------
A timepiece you can enjoy every day of the week, this sports car-inspired chronograph watch packs plenty of information into an easy-to-read dial.   Brand: Invicta Dial Color: Black Strap: Black leather Clasp: Buckle Movement: Quartz Water Resistance: 100m Case Width: 48mm Case Thickness: 13.5mm Bracelet Dimensions: 210mm long; 22mm wide Model No.: 16012 Special Features: Stopwatch; 3 multifunction sub dials   Jewelry photos are enlarged to show detail.


In [5]:
# Transform Descriptions using TfIdf

# In order to pass our product descriptions to the LDA model, we first
#   need to vectorize from strings to fixed length vectors of floats.
# To do this we will transform our documents into a TfIdf representation.

from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_df=0.1)
X_tfidf = tfidf.fit_transform(df_jcp['description'])

# Print the shape of X_tfidf 
print(X_tfidf.shape)

(5000, 5678)


In [6]:
# Show The Terms Extracted From Row 0
print(tfidf.inverse_transform(X_tfidf[0]))

[array(['jewelry photos', 'features stopwatch', 'special features',
       'model no', 'wide model', '22mm wide', 'long 22mm',
       'bracelet dimensions', 'case thickness', 'case width',
       'resistance 100m', 'water resistance', 'quartz water',
       'movement quartz', 'buckle movement', 'clasp buckle',
       'leather clasp', 'black leather', 'strap black', 'black strap',
       'color black', 'dial color', 'to read', 'easy to', 'an easy',
       'plenty of', 'of the', 'day of', 'every day', 'you can', 'sub',
       'stopwatch', 'special', 'no', 'model', 'wide', '22mm',
       'dimensions', 'bracelet', '5mm', '13', 'thickness', 'width',
       'case', '100m', 'resistance', 'water', 'quartz', 'movement',
       'buckle', 'clasp', 'leather', 'strap', 'black', 'color', 'brand',
       'dial', 'read', 'into', 'plenty', 'watch', 'chronograph',
       'inspired', 'car', 'sports', 'week', 'day', 'every', 'enjoy',
       'can'], dtype='<U24')]


In [7]:
#Format Bigrams and Print Sample of Extracted Vocabulary 

# The learned vocabulary can be retrieved from tfidf as a list using .get_feature_names_out()
# Store the extracted vocabulary as vocab
vocab = tfidf.get_feature_names_out()

# Sklearn joins bigrams with a space character.
# To make our output easier to read, replace the spaces in each term in vocab (a list of strings) with an underscore.
vocab = [term.replace(' ', '_') for term in vocab]

# Print the last 5 terms in the vocab
#  The first term printed should be 'zipper_pocket'
print(vocab[-5:])

['zipper_pocket', 'zipper_pockets', 'zippered', 'zirconia', 'zone']


## Transform product descriptions into topics and print sample terms from topics


In [8]:
# Perform Topic Modeling with LDA

# Now that we have our vectorized data, we can use Latent Direchlet Allocation to learn per-document topic distributions and per-topic term distributions.
# Though the documents are likely composed of more, we'll model our dataset using 20 topics for ease of printing.

# Import LatentDirichletAllocation from sklearn
from sklearn.decomposition import LatentDirichletAllocation

# Instantiate a LatentDirichletAllocation model
lda = LatentDirichletAllocation(n_components=20, random_state=512, n_jobs=-1)

# Run fit_transform on lda using X_tfidf.
X_lda = lda.fit_transform(X_tfidf)
X_lda.shape

(5000, 20)

In [9]:
# Get Assigned Topics for Product at df_jcp row 0

# Get the assigned topic proportions for the document at row 0 of X_lda
theta_0 = np.round(X_lda[0], 2)
print(f'{theta_0 = :}\n')

# LDA will assign a small weight (or proability) to each topic for a document
n_topics_assigned_0 = sum(theta_0 > 0.01)
print(f'{n_topics_assigned_0 = :}\n')

# Indices of the assigned topics, sorted descending by the values in theta_0
assigned_topics_0 = np.argsort(theta_0)[::-1][:n_topics_assigned_0]
print(f'{assigned_topics_0 = :}')

theta_0 = [0.01 0.76 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
 0.14 0.01 0.01 0.01 0.01 0.01]

n_topics_assigned_0 = 2

assigned_topics_0 = [ 1 14]


In [10]:
#Print Top Topic Terms

# To get a sense of what each topic is composed of, we can print the most likely terms for each topic.

# To make indexing easier, first convert vocab from a list to np.array()
vocab = np.array(vocab)

# assert that vocab is the correct datatype
assert type(vocab) is np.ndarray, "vocab needs to be converted to a numpy array"

# For each topic print f'Topic #{topic_idx:2d} : ' followed by the top 5 most likely terms in that topic.
for topic_idx, topic in enumerate(lda.components_):
    top_terms_idx = topic.argsort()[-5:][::-1]
    top_terms = vocab[top_terms_idx]
    print(f'Topic #{topic_idx:2d} : {" ".join(top_terms)}')

Topic # 0 : upper sole rubber rubber_sole synthetic
Topic # 1 : dial case watch strap bracelet
Topic # 2 : inseam waist zip pants shorts
Topic # 3 : trunks side_pocket swim_trunks tongue_and take_your
Topic # 4 : sleeveless machine_wash dress machine shoulder
Topic # 5 : star_wars wars 16_piece photo star
Topic # 6 : safe dishwasher dishwasher_safe set glass
Topic # 7 : must_be garment must be_returned returned
Topic # 8 : king comforter wipe set shams
Topic # 9 : sold what clean_imported rod skin
Topic #10 : nylon nylon_spandex bra straps hand_wash
Topic #11 : ci bamboo count thread_count cotton_cover
Topic #12 : short shirt collar short_sleeves cotton_washable
Topic #13 : sterling jewelry_photos silver may stones
Topic #14 : steel stainless stainless_steel cooking large
Topic #15 : leather split coil mattress grain
Topic #16 : moisture wicking moisture_wicking fabric tee
Topic #17 : tone gold_tone silver_tone tone_metal metal_gold
Topic #18 : resistant rug slip synthetic yes
Topic #1

In [11]:
# Looking at the description column of row 0, the assigned_topics_0 and 
# the top terms per topic above, our LDA model seems to have generated
# topics that make sense given descriptions of department store goods, 
# with some a better fit than others.

## Generate recommendations using topics

In [12]:
#  Generate Similarity Matrix

# We'll use Content-Based Filtering to make recommendations based on a query product.

# Import cosine_distances (not cosine_similarity) from sklearn.metrics.pairwise
from sklearn.metrics.pairwise import cosine_distances

# Use cosine_distances to generate similarity scores on our X_lda data
distances = cosine_distances(X_lda)
print(distances.shape)

(5000, 5000)


In [13]:
# Find Recommended Products

# Let's test our proposed recommendation engine using the product at row 0 in df_jcp.

# Print the names for the top 10 most similar products to this query.
similar = np.argsort(distances[0])
names = df_jcp['name_title'].values[similar[:10]]
print(names)

['Invicta® Sl Rally Mens Black Leather Strap Chronograph Watch 16012'
 'Bulova® Mens Black & Rose Gold-Tone Chronograph Sport Watch 98B104'
 'Armitron® Now® Womens Two-Tone Crisscross Bangle Watch'
 'Elgin® Mens Gold-Tone Skeleton Watch'
 'Marvel Spiderman Tween Black Leather Strap Watch'
 'Womens Nylon Strap Digital Sport Watch'
 'Timex® Easy Reader Womens Black Strap Watch 2H341'
 'Disney Womens Snow White Rose-Tone Black Enamel Watch'
 'Citizen® Eco-Drive® Womens Stainless Steel Watch EW1250-54A'
 'Zunammy® Mens Black Silicone Strap Sport Watch']


# Part 2: Sentiment Analysis Using Pipelines

Here we will train a model to classify positive vs negative sentiment on a set of pet supply product reviews using sklearn Pipelines.

In [14]:
#  Load the Data

# The dataset we'll be working with is a set of product reviews
#   of pet supply items on Amazon.
# This data is taken from https://nijianmo.github.io/amazon/index.html
#   "Justifying recommendations using distantly-labeled reviews and fined-grained aspects"
#   Jianmo Ni, Jiacheng Li, Julian McAuley
#   Empirical Methods in Natural Language Processing (EMNLP), 2019

# Load product reviews from ../data/amazon-petsupply-reviews_subset.csv.zips
# Use pandas read_csv function with the default parameters as in part 1.
# Store the resulting dataframe as df_amzn.
df_amzn = pd.read_csv('../data/amazon-petsupply-reviews_subset.csv.zip')


# print a summary of df_amzn using .info()
# there should be 10000 rows with 2 columns
df_amzn.info()

# print blank line
print() 

# print the review in the first row of the dataframe as an example
print(df_amzn['review'][0])

# print the rating in the first row of the dataframe as an example
print(df_amzn['rating'][0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  10000 non-null  object
 1   rating  10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB

My cats are considerably more happy with this toy...and I don't have to leave the sofa to use it, given the long wand length. yay laziness!!
5


In [15]:
#Transform Target

# The ratings are originally in a 5 point scale
# We'll turn this into a binary classification task to approximate positive vs negative sentiment

# Print the proportions of values seen in the rating column
print(df_amzn['rating'].value_counts(normalize=True).round(2))

# Create a new binary target by setting
#  rows where rating is 5 to True
#  rows where rating is not 5 to False
# Store in y
y = df_amzn['rating'] == 5

# print a blank line
print()

# Print the proportions of values seen in y
#  using value_counts() with normalize=True
# True here means a rating of 5 (eg positive)
# False means a rating less than 5 (eg negative)
print(y.value_counts(normalize=True).round(2))

5    0.66
4    0.14
3    0.09
1    0.06
2    0.05
Name: rating, dtype: float64

True     0.66
False    0.34
Name: rating, dtype: float64


In [16]:
#Train-test split

# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Split df_amzn.review and y into a train and test set
# Store as reviews_train,reviews_test,y_train,y_test
reviews_train, reviews_test, y_train, y_test = train_test_split(
    df_amzn['review'], y, test_size=0.2, stratify=y, random_state=512
)


# print the proportion of values seen in y_train
#  to confirm that the class distributions are the same
print(y_train.value_counts(normalize=True).round(2))

True     0.66
False    0.34
Name: rating, dtype: float64


In [17]:
# Create a Pipeline of TfIdf transformation and Classification

# import Pipeline and GradientBoostingClassifier from sklearn
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

# Create a pipeline with two steps: 
#  TfIdfVectorizer with min_df=5 and max_df=.5 named 'tfidf'
#  GradientBoostingClassifier with 20 trees named 'gbc'
pipe_gbc = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=5, max_df=0.5)),
    ('gbc', GradientBoostingClassifier(n_estimators=20))
])

# Print out the pipeline
# You should see both steps: tfidf and gbc
print(pipe_gbc)

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5, min_df=5)),
                ('gbc', GradientBoostingClassifier(n_estimators=20))])


In [18]:
#  Perform Grid Search on pipe_gbc

# import GridSearchCV from sklearn
from sklearn.model_selection import GridSearchCV

# Create a parameter grid to test using:
#   unigrams or unigrams + bigrams in the tfidf step
#   max_depth of 2 or 10 in the gbc step
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'gbc__max_depth': [2, 10]
}

# Instantiate GridSearchCV to evaluate pipe_gbc on the values in param_grid
#   use cv=2 and n_jobs=-1 to reduce run time
# Fit on the training set of reviews_train,y_train
# Store as gs_pipe_gbc
gs_pipe_gbc = GridSearchCV(pipe_gbc, param_grid, cv=2, n_jobs=-1)
gs_pipe_gbc.fit(reviews_train, y_train)

# Print the best parameter settings in gs_pipe_gbc found by grid search
print(gs_pipe_gbc.best_params_)

# Print the best cv score found by grid search, with a precision of 2
print(round(gs_pipe_gbc.best_score_, 2))

{'gbc__max_depth': 10, 'tfidf__ngram_range': (1, 2)}
0.75


In [19]:
#Evaluate on the test set

# Calculate the test set (reviews_test,y_test) score using the trained gs_pipe_gbc 
test_score = gs_pipe_gbc.score(reviews_test, y_test)
print(round(test_score, 2))

0.76


In [20]:
# Evaluate on example reviews

# Generate predictions for these two sentences using the fit gs_pipe_gbc:
predictions = gs_pipe_gbc.predict([
    'This is a great product.',
    'This product is not great.'
])
print(predictions)

[ True False]
