
# Wine in a Million

### Authors: __[Zephyr Headley](https://github.com/jzheadley)__ and __[John Naylor](https://jonaylor.xyz)__

[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/jonaylor89/WineInAMillion/blob/main/notebooks/Wine%20In%20A%20Million.ipynb)



In [None]:
!pip install sentence_transformers
!pip install nvidia-ml-py3

!pip install nb_black
%load_ext nb_black

In [None]:

import os
import tarfile
import json
import time
import pandas as pd
import boto3
import joblib
import sagemaker
from time import gmtime, strftime
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from sentence_transformers import SentenceTransformer
from sagemaker.sklearn import SKLearnModel
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker.inputs import TrainingInput
from sklearn.neighbors import NearestNeighbors
from sagemaker.pipeline import PipelineModel

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation

from tqdm.notebook import tqdm

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# NLTK data used by the preprocessing step (stop words, tokenizer, lemmatizer).
# "punkt_tab" and "omw-1.4" are needed by newer NLTK releases for
# `word_tokenize` and the WordNet lemmatizer; downloading them as well keeps
# the notebook runnable regardless of the installed NLTK version.
for nltk_resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

print(f"SageMaker SDK Version: {sagemaker.__version__}")

In [None]:
# Execution role handed to every SageMaker model created below.
role = get_execution_role()

# Template values -- swap in your own S3 location if you are not using ours:
#   bucket = "<S3_BUCKET>"
#   prefix = "<S3_KEY_PREFIX>"
#   filename = "<DATASET_FILENAME>"
bucket, prefix, filename = "wineinamillion", "data/", "winemag-data-130k-v2.csv"

# Refuse to continue while any setting still holds its template placeholder.
for configured_value, placeholder in (
    (bucket, "<S3_BUCKET>"),
    (prefix, "<S3_KEY_PREFIX>"),
    (filename, "<DATASET_FILENAME>"),
):
    assert configured_value != placeholder

# Full S3 URI of the raw dataset.
raw_data_location = f"s3://{bucket}/{prefix}raw/{filename}"

In [None]:
# https://www.analyticsvidhya.com/blog/2021/04/how-to-download-kaggle-datasets-using-jupyter-notebook/
!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/zynicide/wine-reviews")
inputs = boto3.resource("s3").Bucket(bucket).upload_file(f"wine-reviews/{filename}", f"{prefix}raw/{filename}")

In [None]:
# Load the raw reviews straight from S3 and show summary statistics.
df = pd.read_csv(raw_data_location)
df.describe()

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Show the raw text of the review labelled 0.
print(df.loc[0, "description"])

# Preprocess Dataframe & Clean Data

In [None]:
# Lazily-built resources shared by every clean_data call.
_clean_data_resources = {}


def _get_clean_data_resources():
    """Build the stop-word set, punctuation table and lemmatizer once and reuse them."""
    if not _clean_data_resources:
        # A frozenset gives O(1) membership tests; the original scanned a
        # freshly built list for every word of every description.
        _clean_data_resources["stop_words"] = frozenset(stopwords.words("english"))
        _clean_data_resources["punct_table"] = str.maketrans("", "", punctuation)
        _clean_data_resources["lemmatizer"] = WordNetLemmatizer()
    return _clean_data_resources


def clean_data(desc):
    """Normalize one review description.

    The text is lower-cased, English stop words are removed, characters in
    ``string.punctuation`` are stripped, and the remainder is tokenized with
    NLTK and lemmatized with the WordNet lemmatizer.

    Args:
        desc: Raw description text.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    resources = _get_clean_data_resources()
    stop_words = resources["stop_words"]

    kept_words = [w for w in desc.lower().split() if w not in stop_words]
    without_punct = " ".join(kept_words).translate(resources["punct_table"])

    lemmatizer = resources["lemmatizer"]
    tokens = nltk.word_tokenize(without_punct)
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)
    

# Store a cleaned copy of every description in a new column, then preview it.
df["clean_desc"] = df["description"].map(clean_data)

print(df["clean_desc"].head(5))

In [None]:
# Write the preprocessed frame to a local CSV and copy it to S3 under "clean/".
clean_data_location = f"{prefix}clean/cleaned_dataset.csv"
df.to_csv("cleaned_dataset.csv")
inputs = (
    boto3.resource("s3")
    .Bucket(bucket)
    .upload_file("cleaned_dataset.csv", clean_data_location)
)


# Sentence-BERT Embeddings


In [None]:
# Fetch the sentence encoder and keep a local copy that will later be hosted
# on SageMaker.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
saved_model_dir = "transformer"

os.makedirs(saved_model_dir, exist_ok=True)

model = SentenceTransformer(model_name)
model.save(saved_model_dir)

# Sanity check: encode the first cleaned description and print the length of
# the result.
embeddings = model.encode(df["clean_desc"][0])
print(len(embeddings))

In [None]:
# Bundle the saved model directory into a gzipped tarball ...
export_dir = "transformer"
with tarfile.open("model.tar.gz", "w:gz") as archive:
    archive.add(export_dir, recursive=True)

# ... and upload the tarball to S3.
(
    boto3.Session()
    .resource("s3")
    .Bucket(bucket)
    .upload_file("model.tar.gz", "model/transformer/model.tar.gz")
)

In [None]:
# Generate embeddings for the cleaned descriptions.
# Only the last EMBEDDING_SAMPLE_SIZE rows are embedded (same subset as before).
EMBEDDING_SAMPLE_SIZE = 100

# Positional slicing also works when the frame has fewer rows than the sample
# size, where a negative label lookup would raise a KeyError.
descriptions_to_embed = df["clean_desc"].iloc[-EMBEDDING_SAMPLE_SIZE:].tolist()

# One batched call instead of one model invocation per row; the result has one
# row per description.
embeddings = model.encode(descriptions_to_embed, show_progress_bar=True)

embeddings_flattened = list(embeddings)
embeddings_df = pd.DataFrame(embeddings_flattened)

In [None]:
# Write the embeddings to a gzip-compressed CSV in the working directory.
# NOTE(review): the DataFrame index is written too (pandas default), so it comes
# back as an extra leading column when the file is read again.
embeddings_df.to_csv('embeddings.csv.gz', compression='gzip')

In [None]:
# Upload the embeddings to S3
embeddings_key = "model/embeddings/embeddings.csv.gz"
boto3.Session().resource("s3").Bucket(bucket).upload_file(
    "embeddings.csv.gz", embeddings_key
)

# upload_file returns None, so build the S3 URI explicitly instead of
# capturing (and printing) the call's return value.
embeddings_location = f"s3://{bucket}/{embeddings_key}"

print(embeddings_location)

In [None]:
# Read the embeddings back from S3 as a sanity check.
# The key matches the location the embeddings are uploaded to
# ("model/embeddings/"), and the result gets its own name so the wine-review
# frame `df` is not overwritten.
embeddings_location = f"s3://{bucket}/model/embeddings/embeddings.csv.gz"
embeddings_preview = pd.read_csv(embeddings_location)
embeddings_preview.head(1)

# Create Embedding Model

In [None]:
# SageMaker model built from the uploaded transformer tarball; the inference
# entry point is encode_inference.py inside ./src.
embeddings_model = PyTorchModel(
    entry_point="encode_inference.py",
    source_dir="./src",
    model_data=f"s3://{bucket}/model/transformer/model.tar.gz",
    framework_version="1.9.0",
    py_version="py38",
    role=role,
    sagemaker_session=sagemaker.Session(),
)

### Test Embedding Model

In [None]:
# Deploy the embedding model to a single-instance endpoint whose name carries
# the current UTC timestamp; requests and responses are JSON.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
embeddings_endpoint_name = f"embeddings-model-ep-{timestamp_prefix}"

embedding_predictor = embeddings_model.deploy(
    endpoint_name=embeddings_endpoint_name,
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
# Smoke-test the endpoint: send one free-text description and print how many
# values come back under the "embeddings" key.
test_embedding = embedding_predictor.predict(
    {"data": "sweet wine with a hint of tartness"}
)
embedding_length = len(test_embedding["embeddings"])
print(embedding_length)



# Nearest Neighbors Model "Training"


In [None]:
# Caching step: on later runs, instead of regenerating the embeddings, run this
# cell to pull the intermediate data back from S3.
embeddings_df = pd.read_csv(f"s3://{bucket}/model/embeddings/embeddings.csv.gz")

In [None]:
# Keep only the embedding columns and complete rows before fitting.
# A frame read back from the CSV carries the saved index as "Unnamed: 0";
# presumably an older tarball-based upload produced an "embeddings.csv" column
# plus a trailing junk row instead (TODO confirm). Dropping whichever of these
# is present avoids a KeyError on a missing column and no longer throws away a
# valid last row.
non_feature_columns = [
    column
    for column in ("Unnamed: 0", "embeddings.csv")
    if column in embeddings_df.columns
]
embeddings_df = embeddings_df.drop(columns=non_feature_columns).dropna()

embeddings_df.tail(5)

In [None]:
# Fit a 5-neighbour index on the embeddings (fit returns the estimator itself).
neigh = NearestNeighbors(n_neighbors=5).fit(embeddings_df)

# Serialize the fitted estimator and wrap it in a gzipped tarball.
joblib.dump(neigh, "model.joblib")
with tarfile.open("model.joblib.tar.gz", "w:gz") as archive:
    archive.add("model.joblib")

# Upload the tarball to the location the SKLearn model is created from.
inputs = boto3.resource("s3").Bucket(bucket).upload_file(
    "model.joblib.tar.gz", "model/nn/model.joblib.tar.gz"
)

# Nearest Neighbors Model Creation

In [None]:
# Timestamped endpoint name for the nearest-neighbours model.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
nn_endpoint_name = f"nn-model-ep-{timestamp_prefix}"

# SageMaker model built from the uploaded joblib tarball, served by
# src/nn_inference.py.
nn_model = SKLearnModel(
    entry_point="src/nn_inference.py",
    model_data=f"s3://{bucket}/model/nn/model.joblib.tar.gz",
    framework_version="0.23-1",
    role=role,
    sagemaker_session=sagemaker.Session(),
)

In [None]:
# Deploy the nearest-neighbours model on one ml.m4.xlarge instance.
nn_predictor = nn_model.deploy(
    endpoint_name=nn_endpoint_name,
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
)

In [None]:
# Query the nearest-neighbours endpoint through a JSON predictor, reusing the
# embedding returned by the embeddings endpoint.
predictor = Predictor(
    endpoint_name=nn_endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
    sagemaker_session=sagemaker.Session(),
)

nn_request = {"embeddings": test_embedding["embeddings"], "kneighbors": 5}
prediction = predictor.predict(nn_request)
print(prediction)
# NOTE(review): an earlier draft zipped prediction["recommendations"]["neighbors"][0]
# with prediction["recommendations"]["distance"][0]; confirm the response schema
# against src/nn_inference.py before relying on it.



# Inference Pipeline


In [None]:
# Chain the two models into one inference pipeline: embeddings first, then
# nearest neighbours.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = f"inference-pipeline-ep-{timestamp_prefix}"

pipeline_stages = [embeddings_model, nn_model]
pipeline_model = PipelineModel(
    models=pipeline_stages,
    role=role,
    sagemaker_session=sagemaker.Session(),
)


In [None]:
# Deploy the pipeline to a single-instance endpoint with JSON in and out.
inference_pipeline = pipeline_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
# JSON client bound to the pipeline endpoint by name.
pipeline_predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
    sagemaker_session=sagemaker.Session(),
)

# Test Pipeline



In [None]:
# Send one description through the whole pipeline and show the result.
# The predictor's JSONSerializer encodes the payload itself, so pass the dict
# directly; pre-encoding it with json.dumps would JSON-encode it a second time
# and the endpoint would receive a string instead of an object.
test_payload = {"data": "sweet wine with a hint of tartness"}
test_response = pipeline_predictor.predict(data=test_payload)

# The JSONDeserializer has already parsed the response body. The original
# printed an undefined, misspelled name here, which raised a NameError.
print(test_response)

# Clean Up

In [None]:
# Delete model
embeddings_model.delete_model()
nn_model.delete_model()
pipeline_predictor.delete_model()

# Delete endpoint and endpoint configuration
# The embeddings endpoint's predictor is `embedding_predictor` (singular);
# the original referred to an undefined `embeddings_predictor`, which raised a
# NameError and left the remaining endpoints running.
embedding_predictor.delete_predictor()
nn_predictor.delete_predictor()
pipeline_predictor.delete_predictor()