
# Wine in a Million

### Authors: __[Zephyr Headley](https://github.com/jzheadley)__ and __[John Naylor](https://jonaylor.xyz)__

[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/jonaylor89/WineInAMillion/blob/main/notebooks/Wine%20In%20A%20Million.ipynb)



In [146]:
!pip install sentence_transformers
!pip install nvidia-ml-py3

import os
import tarfile
import json
import time
import pandas as pd
import boto3
import joblib
import sagemaker
from time import gmtime, strftime
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from sentence_transformers import SentenceTransformer
from sagemaker.sklearn import SKLearnModel
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker.inputs import TrainingInput
from sklearn.neighbors import NearestNeighbors
from sagemaker.pipeline import PipelineModel

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation

from tqdm.notebook import tqdm

tqdm.pandas()

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

print(f"SageMaker SDK Version: {sagemaker.__version__}")

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...


SageMaker SDK Version: 2.68.0


[nltk_data]   Package wordnet is already up-to-date!


<IPython.core.display.Javascript object>

In [38]:
!pip install nb_black
%load_ext nb_black


Collecting nb_black
  Downloading nb_black-1.0.7.tar.gz (4.8 kB)
Collecting black>='19.3'
  Downloading black-21.12b0-py3-none-any.whl (156 kB)
[K     |████████████████████████████████| 156 kB 21.8 MB/s eta 0:00:01
Collecting pathspec<1,>=0.9.0
  Downloading pathspec-0.9.0-py2.py3-none-any.whl (31 kB)
Collecting platformdirs>=2
  Downloading platformdirs-2.4.0-py3-none-any.whl (14 kB)
Collecting tomli<2.0.0,>=0.2.6
  Downloading tomli-1.2.2-py3-none-any.whl (12 kB)
Collecting mypy-extensions>=0.4.3
  Downloading mypy_extensions-0.4.3-py2.py3-none-any.whl (4.5 kB)
Building wheels for collected packages: nb-black
  Building wheel for nb-black (setup.py) ... [?25ldone
[?25h  Created wheel for nb-black: filename=nb_black-1.0.7-py3-none-any.whl size=5320 sha256=5fc093aa7775b446bbc8b90edab8c4224b89506eac641ad27a3b1df73168b3fe
  Stored in directory: /home/ec2-user/.cache/pip/wheels/b8/d1/fe/2f4f49a959887ffe9ebdf841c1a221a5b4eb047a1ca09b50a9
Successfully built nb-black
Installing collected 

<IPython.core.display.Javascript object>

In [18]:
role = get_execution_role()

# bucket = "<S3_BUCKET>"
# prefix = "<S3_KEY_PREFIX>"
# filename = "<DATASET_FILENAME>"

bucket = "wineinamillion"
prefix = "data/"
filename = "winemag-data-130k-v2.csv"

assert bucket != "<S3_BUCKET>"
assert prefix != "<S3_KEY_PREFIX>"
assert filename != "<DATASET_FILENAME>"

raw_data_location = f"s3://{bucket}/{prefix}raw/{filename}"

In [26]:
# https://www.analyticsvidhya.com/blog/2021/04/how-to-download-kaggle-datasets-using-jupyter-notebook/
!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/zynicide/wine-reviews")
inputs = boto3.resource("s3").Bucket(bucket).upload_file(f"wine-reviews/{filename}", f"{prefix}raw/{filename}")

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
Skipping, found downloaded files in "./wine-reviews" (use force=True to force download)


In [4]:
df = pd.read_csv(raw_data_location)
df.describe()

Unnamed: 0.1,Unnamed: 0,points,price
count,129971.0,129971.0,120975.0
mean,64985.0,88.447138,35.363389
std,37519.540256,3.03973,41.022218
min,0.0,80.0,4.0
25%,32492.5,86.0,17.0
50%,64985.0,88.0,25.0
75%,97477.5,91.0,42.0
max,129970.0,100.0,3300.0


In [5]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [6]:
print(df["description"][0])

Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.


# Preprocess Dataframe & Clean Data

In [8]:
def clean_data(desc):
    words = stopwords.words('english')
    lower = " ".join([w for w in desc.lower().split() if not w in words])
    punct = ''.join(ch for ch in lower if ch not in punctuation)
    wordnet_lemmatizer = WordNetLemmatizer()

    word_tokens = nltk.word_tokenize(punct)
    lemmatized_word = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]

    word_joined = " ".join(lemmatized_word)
    
    return word_joined
    

df['clean_desc'] = df["description"].apply(clean_data)

print(df['clean_desc'].head(5))

0    aroma include tropical fruit broom brimstone d...
1    ripe fruity wine smooth still structured firm ...
2    tart snappy flavor lime flesh rind dominate gr...
3    pineapple rind lemon pith orange blossom start...
4    much like regular bottling 2012 come across ra...
Name: clean_desc, dtype: object


In [19]:
# Upload the preprocessed dataset to S3
df.to_csv("cleaned_dataset.csv")
clean_data_location = f"{prefix}clean/cleaned_dataset.csv"
inputs = boto3.resource("s3").Bucket(bucket).upload_file('cleaned_dataset.csv', clean_data_location)


# Sentence-BERT Embeddings


In [20]:
# Save the model to disk which we will host at sagemaker

model_name = 'sentence-transformers/all-MiniLM-L6-v2'

saved_model_dir = 'transformer'
if not os.path.isdir(saved_model_dir):
    os.makedirs(saved_model_dir)

model = SentenceTransformer(model_name)
model.save(saved_model_dir)

embeddings = model.encode(df["clean_desc"][0])
print(len(embeddings))

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

384


In [27]:
# Zip the model .gz format and upload to s3

export_dir = 'transformer'
with tarfile.open('model.tar.gz', mode='w:gz') as archive:
    archive.add(export_dir, recursive=True)


#Upload the model to S3
boto3.Session().resource("s3").Bucket(bucket).upload_file('model.tar.gz', 'model/transformer/model.tar.gz')

In [29]:
# Generates embeddings from the model
embeddings = []
for i in tqdm(range(len(df["clean_desc"])-100,len(df["clean_desc"]))):
    vector = model.encode([df["clean_desc"][i]])
    embeddings.append(vector)
    
embeddings_flattened = list(map(lambda x:x[0], embeddings))
embeddings_df = pd.DataFrame(embeddings_flattened)

  0%|          | 0/100 [00:00<?, ?it/s]

In [30]:
#write embeddings to csv
embeddings_df.to_csv('embeddings.csv.gz', compression='gzip')

In [44]:
# Upload the embeddings to S3
embeddings_location = (
    boto3.Session()
    .resource("s3")
    .Bucket(bucket)
    .upload_file("embeddings.csv.gz", "model/embeddings/embeddings.csv.gz")
)

print(embeddings_location)

None


<IPython.core.display.Javascript object>

In [34]:
# need to have the embeddings as a tarball in s3
embeddings_location = f"s3://{bucket}/embeddings.csv.gz"
df = pd.read_csv(embeddings_location)
df.head(1)

Unnamed: 0,embeddings.csv,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,0.0,0.007083,-0.015033,0.067645,0.063208,0.013292,0.045818,0.020863,-0.091562,0.027603,...,-0.043422,0.018685,-0.000122,-0.059448,0.07432,-0.023519,0.072006,-0.009756,-0.012147,-0.010488


# Create Embedding Model

In [35]:
embeddings_model = PyTorchModel(
    model_data=f"s3://{bucket}/model/transformer/model.tar.gz",
    role = role, 
    entry_point ='encode_inference.py',
    source_dir = './src', 
    framework_version = '1.9.0',
    py_version = 'py38',
    sagemaker_session=sagemaker.Session(),
)

### Test Embedding Model

In [36]:
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
embeddings_endpoint_name = "embeddings-model-ep-" + timestamp_prefix

embedding_predictor = embeddings_model.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
    endpoint_name=embeddings_endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

-----------!

In [235]:
# test_payload = {'data': 'sweet wine with a hint of tartness'}
# test_features = embeddings_predictor.predict(test_payload)
# test_embedding = json.loads(test_features)

# len(test_embedding)

test_embedding = embedding_predictor.predict(
    {"data": "sweet wine with a hint of tartness"}
)
print(len(test_embedding["embeddings"]))

ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint embeddings-model-ep-2021-12-10-17-27-26 of account 985074551727 not found.

<IPython.core.display.Javascript object>



# Nearest Neighbors Model "Training"


In [156]:
# This is only a caching sort of step.  Instead of regenerating on subsequent runs, this can be run to pull the intermediary data from s3
embeddings_df = pd.read_csv(f"s3://{bucket}/model/embeddings/embeddings.csv.gz")

<IPython.core.display.Javascript object>

In [158]:
embeddings_df = embeddings_df[:-1]

embeddings_df.tail(5)
del embeddings_df["embeddings.csv"]
embeddings_df.tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
129966,-0.014324,-0.083236,0.021317,0.05589,-0.014183,0.088444,0.044363,-0.069194,0.002302,-0.02766,...,0.053791,0.031073,0.052278,-0.01938,0.022829,0.036835,0.019275,-0.022067,0.055997,-0.046584
129967,0.067471,-0.044725,-2.8e-05,0.059453,0.028468,0.070333,-0.028098,0.062695,-0.033304,-0.010947,...,-0.026852,0.003229,0.039433,-0.010654,-0.005636,-0.012233,0.169666,-0.054893,0.075262,0.021987
129968,0.024301,-0.016125,0.075865,0.050819,0.00611,0.016331,-0.041206,-0.076532,-0.063111,-0.045451,...,0.01901,0.086116,0.029834,0.000319,0.012753,0.001713,0.011256,-0.01998,0.018332,-0.11134
129969,0.015386,-0.082658,-0.027251,0.032489,0.007341,-0.017528,-0.017251,-0.020987,-0.010176,-0.079221,...,-0.004047,0.056436,0.01452,-0.080968,0.071581,-0.031942,0.101251,-0.043256,0.036289,-0.010412
129970,0.053372,-0.087436,0.053435,0.033393,0.010193,0.037103,0.038221,0.028853,-0.075111,-0.023183,...,-0.007885,-0.02344,0.000868,0.031423,-0.03184,-0.060474,0.09548,0.002652,0.040922,-0.04554


<IPython.core.display.Javascript object>

In [159]:
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(embeddings_df)

joblib.dump(neigh, "model.joblib")
with tarfile.open("model.joblib.tar.gz", mode="w:gz") as archive:
    archive.add("model.joblib")

inputs = (
    boto3.resource("s3")
    .Bucket(bucket)
    .upload_file("model.joblib.tar.gz", "model/nn/model.joblib.tar.gz")
)

<IPython.core.display.Javascript object>

# Nearest Neighbors Model Creation

In [225]:
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

nn_endpoint_name = "nn-model-ep-" + timestamp_prefix

nn_model = SKLearnModel(
    model_data=f"s3://{bucket}/model/nn/model.joblib.tar.gz",
    role=role,
    entry_point="src/nn_inference.py",
    framework_version="0.23-1",
    sagemaker_session=sagemaker.Session(),
)

<IPython.core.display.Javascript object>

In [226]:
nn_predictor = nn_model.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
    endpoint_name=nn_endpoint_name,
)

---------!

<IPython.core.display.Javascript object>

In [227]:
predictor = Predictor(
    endpoint_name=nn_endpoint_name,
    sagemaker_session=sagemaker.Session(),
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

prediction = predictor.predict(
    {"embeddings": test_embedding["embeddings"], "kneighbors": 5}
)
print(prediction)
# zipped = list(
#     zip(
#         prediction["recommendations"]["neighbors"][0],
#         prediction["recommendations"]["distance"][0],
#     )
# )


{'recommendations': [[5792, 0.6971731262020007], [99509, 0.7127081025413139], [87946, 0.7221756314556704], [126268, 0.7293226724388949], [51012, 0.7328417131487005]]}


<IPython.core.display.Javascript object>


# Inference Pipeline


In [229]:
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "inference-pipeline-ep-" + timestamp_prefix
pipeline_model = PipelineModel(
    role=role, 
    models=[
        embeddings_model, 
        nn_model
    ],
    sagemaker_session=sagemaker.Session(),
)


<IPython.core.display.Javascript object>

In [None]:
inference_pipeline = pipeline_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

------------

In [233]:
pipeline_predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker.Session(),
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

<IPython.core.display.Javascript object>

# Test Pipeline



In [234]:
test_payload = json.dumps({"data": "sweet wine with a hint of tartness"})
test_response = pipeline_predictor.predict(data=test_payload)
# test_recommendations = json.loads(test_response)

print(test_recommentations)

ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint inference-pipeline-ep-2021-12-10-23-25-18 of account 985074551727 not found.

<IPython.core.display.Javascript object>

# Clean Up

In [None]:
# Delete model
embeddings_model.delete_model()
nn_model.delete_model()
pipeline_predictor.delete_model()

# Delete endpoint and endpoint configuration
embeddings_predictor.delete_predictor()
nn_predictor.delete_predictor()
pipeline_predictor.delete_predictor()