# Hugging Face Transformers

## 0. Read in Data

In [1]:
import pandas as pd

# show the full review text instead of truncating wide columns
pd.set_option('display.max_colwidth', None)

# look at a subset of the reviews: nrows stops parsing after the first N_REVIEWS
# data rows instead of loading the whole sheet and then discarding most of it
N_REVIEWS = 30
df = pd.read_excel('../Data/Popchip_Reviews_Sentiment.xlsx', nrows=N_REVIEWS)
df.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Sentiment_VADER
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.9244
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.7269


In [2]:
# confirm the number of reviews
df.shape

(30, 7)

## 1. Sentiment Analysis

### a. Simple Example

In [3]:
# sentiment analysis with hugging face
from transformers import pipeline

sentiment_analyzer = pipeline("sentiment-analysis", # set the task to sentiment analysis
                              model="distilbert/distilbert-base-uncased-finetuned-sst-2-english", # specify the default distilbert model
                              device=-1) # use the computer's cpu

# example sentences used throughout the notebook: an upbeat saying, a factual statement and a complaint
# (the emoji is written as a named escape so it cannot be garbled when this file is re-encoded)
text1 = 'When life gives you lemons, make lemonade! \N{SLIGHTLY SMILING FACE}'
text2 = 'A dozen lemons will make a gallon of lemonade.'
text3 = 'I didn\'t like the taste of that lemonade at all.'

Device set to use cpu


In [4]:
sentiment_analyzer(text1)

[{'label': 'POSITIVE', 'score': 0.996239423751831}]

In [5]:
sentiment_analyzer(text2)

[{'label': 'POSITIVE', 'score': 0.7781568765640259}]

In [6]:
sentiment_analyzer(text3)

[{'label': 'NEGATIVE', 'score': 0.9955589771270752}]

### b. Practical Example

In [7]:
# calculate the sentiment scores
# adding truncation here to truncate text before analyzing sentiment 
# because at least 1 of the Text values has a length greater than the max sequence length of 512
sentiment_analyzer = pipeline("sentiment-analysis",
                              model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
                              device=-1,
                              truncation=True) 

sentiment_scores = df['Text'].apply(sentiment_analyzer)
sentiment_scores[:5]

Device set to use cpu


0    [{'label': 'POSITIVE', 'score': 0.9935212731361389}]
1     [{'label': 'POSITIVE', 'score': 0.999605119228363}]
2    [{'label': 'NEGATIVE', 'score': 0.6984901428222656}]
3    [{'label': 'NEGATIVE', 'score': 0.9996308088302612}]
4    [{'label': 'POSITIVE', 'score': 0.9991814494132996}]
Name: Text, dtype: object

In [8]:
%%time
# %%time (above) is a Jupyter cell magic that times the whole cell; it must be the first line of the cell

# add a timer and hide all non-critical warnings
from transformers import pipeline, logging

logging.set_verbosity_error()

sentiment_analyzer = pipeline("sentiment-analysis",
                              model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
                              device=-1,
                              truncation=True)

sentiment_scores = df['Text'].apply(sentiment_analyzer)
sentiment_scores[:5]

CPU times: total: 17.5 s
Wall time: 3.22 s


0    [{'label': 'POSITIVE', 'score': 0.9935212731361389}]
1     [{'label': 'POSITIVE', 'score': 0.999605119228363}]
2    [{'label': 'NEGATIVE', 'score': 0.6984901428222656}]
3    [{'label': 'NEGATIVE', 'score': 0.9996308088302612}]
4    [{'label': 'POSITIVE', 'score': 0.9991814494132996}]
Name: Text, dtype: object

In [9]:
import torch

# pick the best available backend (NVIDIA GPU, then Apple MPS, then CPU)
# and remember which message to report for it
if torch.cuda.is_available():
    backend, banner = "cuda", f"Using GPU: {torch.cuda.get_device_name(0)}"
elif torch.backends.mps.is_available():
    backend, banner = "mps", "Using Apple MPS"
else:
    backend, banner = "cpu", "Using CPU"

# `device` is what the later pipeline cells pass to `pipeline(..., device=device)`
device = torch.device(backend)
print(banner)

Using GPU: NVIDIA GeForce GTX 1650 with Max-Q Design


In [11]:
%%time

# utilize mac's silicon chip (gpu)
# sentiment_analyzer = pipeline("sentiment-analysis",
#                               model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
#                               device='mps', # update from -1 to mps
#                               truncation=True)

# utilize laptop's NVIDIA GPU silicon chip (gpu)
sentiment_analyzer = pipeline("sentiment-analysis",
                              model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
                              device=device, # update from -1 to cuda
                              truncation=True)

sentiment_scores = df['Text'].apply(sentiment_analyzer)
sentiment_scores[:5]

CPU times: total: 1.08 s
Wall time: 1.39 s


0    [{'label': 'POSITIVE', 'score': 0.9935213923454285}]
1     [{'label': 'POSITIVE', 'score': 0.999605119228363}]
2    [{'label': 'NEGATIVE', 'score': 0.6984850764274597}]
3    [{'label': 'NEGATIVE', 'score': 0.9996308088302612}]
4    [{'label': 'POSITIVE', 'score': 0.9991814494132996}]
Name: Text, dtype: object

In [12]:
# extract the label for a single review
sentiment_scores[0][0]['label']

'POSITIVE'

In [13]:
# extract the score for a single review
sentiment_scores[0][0]['score']

0.9935213923454285

In [14]:
# extract the label and score and create a sentiment score for all reviews
# (each pipeline result is a one-element list: [{'label': ..., 'score': ...}])
df['Label_HF'] = sentiment_scores.apply(lambda x: x[0]['label'])
df['Score_HF'] = sentiment_scores.apply(lambda x: x[0]['score'])

# signed score: keep the score where the label is POSITIVE, flip its sign otherwise
# (vectorized on the two columns, so no row-by-row df.apply(..., axis=1) is needed)
df['Sentiment_HF'] = df['Score_HF'].where(df['Label_HF'] == 'POSITIVE', -df['Score_HF'])

In [15]:
# view the calculations
df[['Rating', 'Text', 'Sentiment_VADER', 'Label_HF', 'Score_HF', 'Sentiment_HF']].head()

Unnamed: 0,Rating,Text,Sentiment_VADER,Label_HF,Score_HF,Sentiment_HF
0,5,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.9244,POSITIVE,0.993521,0.993521
1,5,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.7269,POSITIVE,0.999605,0.999605
2,5,"I just love these chips! I was always a big fan of potato chips, but haven't had one since I discovered popchips. They are great for dipping or all alone. I am constantly re-ordering them. One note however-if you are on a low salt diet these chips are probably not for you. They are high in sodium. We go through a case every two months. If you love them it pays to join the subscribe and save program through Amazon. You save money and stay supplied!",0.979,NEGATIVE,0.698485,-0.698485
3,3,"These tasted like potatoe stix, that we got in grade school with our lunches usually on pizza day. They were the bomb then, not so much now. Won't buy again unless I get them for cheap or free.",0.8689,NEGATIVE,0.999631,-0.999631
4,5,"These chips are great! They look almost like a flattened rice cake, but taste so much better, more like a potato chip. The bbq flavor is delicious. They are very low in fat and full of flavor. It is easy to eat an entire bag of these!",0.9613,POSITIVE,0.999181,0.999181


In [16]:
# view the most positive review
df.sort_values('Sentiment_HF', ascending=False).head(1).Text

28    These Pop Chips are incredible. They taste so much better than baked chips and the quantity you get for 2 points is so much more. I buy the variety case and love them all!
Name: Text, dtype: object

In [17]:
# view the most negative review
df.sort_values('Sentiment_HF', ascending=True).head(1).Text

3    These tasted like potatoe stix, that we got in grade school with our lunches usually on pizza day.  They were the bomb then, not so much now.  Won't buy again unless I get them for cheap or free.
Name: Text, dtype: object

### c. Speed Up Code

In [18]:
%%time

# no optimizations
from transformers import pipeline

sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=-1, # running on CPU
    truncation=True
)

sentiment_scores = df['Text'].apply(sentiment_analyzer)
sentiment_scores[:5]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

CPU times: total: 25.2 s
Wall time: 27 s


0    [{'label': 'POSITIVE', 'score': 0.9935212731361389}]
1     [{'label': 'POSITIVE', 'score': 0.999605119228363}]
2    [{'label': 'NEGATIVE', 'score': 0.6984901428222656}]
3    [{'label': 'NEGATIVE', 'score': 0.9996308088302612}]
4    [{'label': 'POSITIVE', 'score': 0.9991814494132996}]
Name: Text, dtype: object

In [19]:
%%time

# four things to try if you can't use GPU
from transformers import pipeline

sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english", # 1. smaller model
    device=-1, # running on CPU
    truncation=True,
    use_fast=True # 2. faster tokenization
)

import torch
torch.set_num_threads(1)  # 3. specify multi-threading

with torch.no_grad(): # 4. disable gradients
    sentiment_scores = df['Text'].apply(sentiment_analyzer)
sentiment_scores[:5]

CPU times: total: 6.78 s
Wall time: 7.08 s


0    [{'label': 'POSITIVE', 'score': 0.9935212731361389}]
1     [{'label': 'POSITIVE', 'score': 0.999605119228363}]
2    [{'label': 'NEGATIVE', 'score': 0.6984868049621582}]
3    [{'label': 'NEGATIVE', 'score': 0.9996308088302612}]
4    [{'label': 'POSITIVE', 'score': 0.9991814494132996}]
Name: Text, dtype: object

## 2. Named Entity Recognition

### a. Simple Example

In [None]:
# view warning options
logging.set_verbosity_warning() # view more warnings
logging.set_verbosity_error() # view fewer warnings

In [None]:
# ner with hugging face
ner_analyzer = pipeline("ner",
                        model="dbmdz/bert-large-cased-finetuned-conll03-english",
                        device=-1,
                        aggregation_strategy='SIMPLE')

text4 = "I ordered an Arnold Palmer at Applebee's in Springfield."

In [None]:
ner_analyzer(text4)

In [None]:
# try a different model
ner_analyzer2 = pipeline("ner",
                        model="dslim/bert-base-NER",
                        device=-1,
                        aggregation_strategy='SIMPLE')

In [None]:
ner_analyzer2(text4)

### b. Practical Example

In [None]:
# find the named entities in each review
# reuse the `device` detected earlier (cuda / mps / cpu) rather than hard-coding 'mps',
# which is only available on Apple-silicon builds of PyTorch
ner_analyzer = pipeline("ner",
                        model="dbmdz/bert-large-cased-finetuned-conll03-english",
                        device=device,
                        aggregation_strategy='SIMPLE')

In [None]:
# apply to one review
ner_analyzer(df.Text[1])

In [None]:
# extract the words
[entity['word'] for entity in ner_analyzer(df.Text[1])]

In [None]:
# apply to all reviews
df['Named_Entities'] = df['Text'].apply(lambda x: [entity['word'] for entity in ner_analyzer(x)])

In [None]:
# view the named entities
df[['Text', 'Named_Entities']].head()

In [None]:
# create a unique list of named entities
# de-duplicate with a set, then sort: a bare set has no stable order,
# so the preview below could otherwise change from one kernel restart to the next
named_entities = sorted(set(df.Named_Entities.explode().dropna().tolist()))
named_entities[:10]

In [None]:
# view the number of named entities found
len(named_entities)

In [None]:
# exclude subwords from the list
named_entities_clean = [entity for entity in named_entities if '#' not in entity]
sorted(named_entities_clean)

In [None]:
# view the number of named entities left after removing subwords
len(named_entities_clean)

## 3. Zero-Shot Classification

### a. Simple Example

In [None]:
# zero-shot classification with hugging face
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli",
                      device=-1)

In [None]:
text1, text4

In [None]:
classifier(text1, candidate_labels = ['quote', 'food & drinks', 'technology'])

In [None]:
classifier(text4, candidate_labels = ['quote', 'food & drinks', 'technology'])

### b. Practical Example

In [None]:
# remember our topics from the machine learning section: 'order', 'taste & texture', 'good', 'flavor', 'health'
# reuse the `device` detected earlier (cuda / mps / cpu) rather than hard-coding 'mps',
# which is only available on Apple-silicon builds of PyTorch
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli",
                      device=device)

In [None]:
# try on one review
classifier(df.Text[0], ['order', 'taste & texture', 'good', 'flavor', 'health'])

In [None]:
# try on another review
classifier(df.Text[1], ['order', 'taste & texture', 'good', 'flavor', 'health'])

In [None]:
# extract just the top label
classifier(df.Text[1], ['order', 'taste & texture', 'good', 'flavor', 'health'])['labels'][0]

In [None]:
# apply to all reviews
df['Category'] = df.Text.apply(lambda x: classifier(x, ['order', 'taste & texture', 'good', 'flavor', 'health'])['labels'][0])

In [None]:
# view the category labels
df[['Text', 'Category']].head()

## 4. Text Summarization

### a. Simple Example

In [None]:
# text summarization with hugging face
summarizer = pipeline("summarization",
                      model="facebook/bart-large-cnn",
                      device=-1)

text5 = """
            The lemon tree produces a pointed oval yellow fruit. Botanically this is a hesperidium, 
            a modified berry with a tough, leathery rind. The rind is divided into an outer colored layer or zest, 
            which is aromatic with essential oils, and an inner layer of white spongy pith. 
            Inside are multiple carpels arranged as radial segments. The seeds develop inside the carpels. 
            The space inside each segment is a locule filled with juice vesicles. 
            Lemons contain many phytochemicals, including polyphenols, terpenes, and tannins.[3] 
            Their juice contains slightly more citric acid than lime juice (about 47 g/L), 
            nearly twice as much as grapefruit juice, and about five times as much as orange juice.[4]
        """

In [None]:
# try it with the default parameters
summarizer(text5)

In [None]:
# specify the parameters
summarizer(text5, min_length=20, max_length=50)

In [None]:
# rerunning with the same parameters gives the same summary (no sampling is involved)
summarizer(text5, min_length=20, max_length=50)

In [None]:
# will get more random results
summarizer(text5, min_length=20, max_length=50, do_sample=True)

In [None]:
# extract just the text portion
summarizer(text5, min_length=20, max_length=50)[0]['summary_text']

### b. Practical Example

In [None]:
# load pipelines
# both run on the `device` detected earlier (cuda / mps / cpu) instead of a hard-coded 'mps',
# and the sentiment model uses the same namespaced id as the earlier cells so it is not downloaded again
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english", device=device, truncation=True)

In [None]:
# step 1: summarize reviews
df['Summary'] = df['Text'].apply(lambda x: summarizer(x, min_length=20, max_length=50)[0]['summary_text'])
df[['Text', 'Summary']].head(2)

In [None]:
# step 2: find sentiment scores
sentiment_scores2 = df.Summary.apply(sentiment_analyzer)
sentiment_scores2[:5]

In [None]:
# extract label and score and create a sentiment score
# (each pipeline result is a one-element list: [{'label': ..., 'score': ...}])
df['Label_HF2'] = sentiment_scores2.apply(lambda x: x[0]['label'])
df['Score_HF2'] = sentiment_scores2.apply(lambda x: x[0]['score'])
# signed score: keep the score where the label is POSITIVE, flip its sign otherwise (vectorized)
df['Sentiment_HF2'] = df['Score_HF2'].where(df['Label_HF2'] == 'POSITIVE', -df['Score_HF2'])

In [None]:
# view the calculations
df[['Text', 'Label_HF2', 'Score_HF2', 'Sentiment_HF2']].head()

In [None]:
# compare the sentiment scores
df[['Text', 'Sentiment_VADER', 'Sentiment_HF', 'Sentiment_HF2']].head()

## 5. Text Generation

In [None]:
# text generation with hugging face
generator = pipeline("text-generation", model="gpt2", max_length=20, device=-1)

prompt = "On a hot summer day, I love to drink cold lemonade because"

In [None]:
# set general parameters
generator(prompt, max_length=50, num_return_sequences=1, do_sample=False)

In [None]:
# get a more random output
generator(prompt, max_length=50, num_return_sequences=1, do_sample=True)

In [None]:
# run it again: with sampling enabled, each call can return a different continuation
generator(prompt, max_length=50, num_return_sequences=1, do_sample=True)

## 6. Document Similarity with Embeddings

### a. Simple Example

In [None]:
# feature extraction with hugging face
feature_extractor = pipeline("feature-extraction",
                             model="sentence-transformers/all-MiniLM-L6-v2",
                             device=-1)

In [None]:
# view the text
text1

In [None]:
# view the embedding
feature_extractor(text1)[0][0][:10]

In [None]:
# view the shape
len(feature_extractor(text1)[0][0])

### b. Practical Example

#### Step 1: Extract Embeddings

In [None]:
# modify the column width
pd.set_option('display.max_colwidth', 50)

# read in the movies data
movies = pd.read_csv('../Data/movie_reviews.csv')
movies.head(2)

In [None]:
# extract the embedding representation for each review
# reuse the `device` detected earlier (cuda / mps / cpu) rather than hard-coding 'mps',
# which is only available on Apple-silicon builds of PyTorch
feature_extractor = pipeline("feature-extraction",
                             model="sentence-transformers/all-MiniLM-L6-v2",
                             device=device)

# one embedding per movie description; [0][0] keeps the first vector of the first input
# NOTE(review): presumably this is the first token's vector; the model card describes
# mean pooling over tokens for sentence embeddings -- confirm first-token is intended
embeddings = movies['movie_info'].apply(lambda x: feature_extractor(x)[0][0])
embeddings.head(2)

#### Step 2: Specify the Captain Marvel Embedding

In [None]:
# view one movie - Captain Marvel
movies[movies.movie_title == 'Captain Marvel']

In [None]:
# save the embedding for that movie
import numpy as np

# look the row label up by title instead of hard-coding it, so the cell keeps working
# if the movies file is re-ordered (the first match is used if the title appears more than once)
cm_index = movies.index[movies.movie_title == 'Captain Marvel'][0]
embedding_cm = np.array(embeddings[cm_index]).reshape(1, -1)
embedding_cm.shape

#### Step 3: Specify the Embeddings for All Movies

In [None]:
# save the embeddings for all movies
embeddings_movies = np.vstack(embeddings)
embeddings_movies.shape

#### Step 4: Calculate Cosine Similarity Scores

In [None]:
# calculate the cosine similarity scores
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores_cm = cosine_similarity(embedding_cm, embeddings_movies)
similarity_scores_cm_series = pd.Series(similarity_scores_cm.flatten(), name='similarity_score')
similarity_scores_cm_series.head()

In [None]:
# combine movie titles, descriptions and scores
similarity_scores_cm_df = pd.concat([movies[['movie_title', 'movie_info']], similarity_scores_cm_series], axis=1)
similarity_scores_cm_df.head()

In [None]:
# view the top 5 most similar movies
similarity_scores_cm_df.sort_values('similarity_score', ascending=False).head()

#### DEMO: Create a function to find the most similar movie

In [None]:
# step 1: specify our feature extraction model
# runs on the `device` detected earlier (cuda / mps / cpu) instead of a hard-coded 'mps'
feature_extractor = pipeline('feature-extraction',
                     model='sentence-transformers/all-MiniLM-L6-v2',
                     device=device)

In [None]:
# step 2: create a movies x embeddings array (166 x 384)
embeddings = movies.movie_info.apply(lambda row: feature_extractor(row)[0][0])
embeddings_movies = np.vstack(embeddings)
embeddings_movies.shape

In [None]:
# step 3: create a get_similar_movies function with the inputs: embeddings, movie_index, movie_details, top_n
def get_similar_movies(embeddings, movie_index, movie_details, top_n=3):
    """Rank movies by cosine similarity to one reference movie.

    Parameters
    ----------
    embeddings : 2-D array-like
        One embedding row per movie.
    movie_index : int
        Position of the reference movie's row in `embeddings`.
    movie_details : pandas.DataFrame or pandas.Series
        Descriptive columns to show next to the scores; its rows must be in
        the same order as the rows of `embeddings`. Any index is accepted.
    top_n : int, default 3
        Number of recommendations to return in addition to the reference row.

    Returns
    -------
    pandas.DataFrame
        `movie_details` plus a 'similarity_score' column, sorted from most to
        least similar and cut to the first `top_n + 1` rows. The reference
        movie is compared with itself too, so it normally appears first.

    Raises
    ------
    ValueError
        If `movie_details` and `embeddings` do not have the same number of rows.
    """
    if len(movie_details) != len(embeddings):
        raise ValueError(
            f"movie_details has {len(movie_details)} rows but embeddings has {len(embeddings)}"
        )

    # create movie embedding for movie_index (reshaped to a single-row 2-D array)
    m_embedding = np.array(embeddings[movie_index]).reshape(1, -1)

    # calculate similarity scores
    similarity_scores = cosine_similarity(m_embedding, embeddings)

    # label the scores with movie_details' own index: concat(axis=1) aligns on index labels,
    # so a fresh 0..n-1 index would misalign (and NaN-pad) any details frame that is not 0..n-1
    similarity_scores_series = pd.Series(similarity_scores.flatten(),
                                         index=movie_details.index,
                                         name='similarity_score')

    # bring in movie info
    movies_similarity_scores_df = pd.concat([movie_details, similarity_scores_series], axis=1)

    # display movie recs
    return movies_similarity_scores_df.sort_values('similarity_score', ascending=False).iloc[0:top_n+1]

In [None]:
# modify the column width
pd.set_option('display.max_colwidth', None)

In [None]:
# find movies similar to Captain Marvel
get_similar_movies(embeddings_movies, 25, movies[['movie_title', 'movie_info']])

In [None]:
# find movies similar to The LEGO Movie 2
get_similar_movies(embeddings_movies, 131, movies[['movie_title', 'movie_info', 'rating']], top_n=5)