
# Wine in a Million

### Authors: __[Zephyr Headley](https://github.com/jzheadley)__ and __[John Naylor](https://jonaylor.xyz)__


In [None]:
!pip install sentence_transformers
!pip install nvidia-ml-py3

import json
import os
import tarfile
import time

import boto3
import joblib
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.pipeline import PipelineModel
from sagemaker.predictor import RealTimePredictor
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.serializers import CSVSerializer
from sagemaker.sklearn import SKLearnModel
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation 

from tqdm.notebook import tqdm 
# Hook tqdm into pandas (adds the `progress_apply` family of methods).
tqdm.pandas()

# NLTK resources needed by the text-cleaning step further down.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# IAM role that SageMaker will assume for the models created below.
role = get_execution_role()

print('SageMaker SDK Version: ' + sagemaker.__version__)

In [None]:
# Template placeholders -- replace the values below with your own S3 location:
#   bucket   = "<S3_BUCKET>"
#   prefix   = "<S3_KEY_PREFIX>"
#   filename = "<DATASET_FILENAME>"
bucket = "winemag-data-wineinamillion-23452"
prefix = "data/raw/"
filename = "winemag-data-130k-v2.csv"

# Fail fast if any setting was left as an un-filled template placeholder.
assert all(
    actual != placeholder
    for actual, placeholder in (
        (bucket, "<S3_BUCKET>"),
        (prefix, "<S3_KEY_PREFIX>"),
        (filename, "<DATASET_FILENAME>"),
    )
)

# Full S3 URI of the raw dataset.
raw_data_location = f"s3://{bucket}/{prefix}{filename}"

In [None]:
# Load the raw dataset from S3, then display summary statistics for it.
df = pd.read_csv(filepath_or_buffer=raw_data_location)
df.describe()

In [None]:
# Preview the first five rows (five is the default for `head`).
df.head()

In [None]:
# Show the raw text of the description stored under index label 0.
print(df.loc[0, "description"])

# Preprocess Dataframe & Clean Data

In [None]:
def clean_data(desc):
    """Normalize one description string before it is embedded.

    Steps, in order: lowercase and split on whitespace, drop English stop
    words, strip every character found in ``string.punctuation``, tokenize
    with ``nltk.word_tokenize`` and lemmatize each token with WordNet.

    Parameters
    ----------
    desc : str
        Raw description text. Non-string input (for example the ``NaN``
        pandas uses for a missing value) yields ``""`` instead of raising.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    if not isinstance(desc, str):
        return ""

    # Build the stop-word set and the lemmatizer once and cache them on the
    # function object: this function is applied to every row, and rebuilding
    # a stop-word *list* per call made each membership test a linear scan.
    resources = getattr(clean_data, "_resources", None)
    if resources is None:
        resources = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        clean_data._resources = resources
    stop_words, lemmatizer = resources

    # Stop words are removed *before* punctuation is stripped, so a word with
    # punctuation attached (e.g. "the,") is not treated as a stop word.
    kept = " ".join(w for w in desc.lower().split() if w not in stop_words)
    no_punct = ''.join(ch for ch in kept if ch not in punctuation)

    tokens = nltk.word_tokenize(no_punct)
    return " ".join(lemmatizer.lemmatize(tok) for tok in tokens)
    

# Normalize every description into a new `clean_desc` column.
# (No import is needed here: the only scikit-learn name this notebook uses,
# `NearestNeighbors`, is imported in the first cell. `NearestNeighbor`,
# singular, does not exist in sklearn.neighbors and raises ImportError.)
df['clean_desc'] = df["description"].apply(clean_data)

print(df['clean_desc'].head(5))

In [None]:
# Write the preprocessed dataset to a local CSV and push that file to S3.
df.to_csv("cleaned_dataset.csv")

# Note: `upload_file` returns None, so `inputs` carries no information.
inputs = (
    boto3.resource("s3")
    .Bucket(bucket)
    .upload_file(Filename='cleaned_dataset.csv', Key="data/clean/dataset.csv")
)


# Sentence-BERT Embeddings


In [None]:
# Fetch the pre-trained sentence encoder and save it to disk so it can be
# packaged and hosted on SageMaker.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
saved_model_dir = 'transformer'

# `exist_ok=True` makes this a no-op when the directory already exists.
os.makedirs(saved_model_dir, exist_ok=True)

model = SentenceTransformer(model_name)
model.save(saved_model_dir)

# Sanity check: embed a single cleaned description and print the vector length.
embeddings = model.encode(df.loc[0, "clean_desc"])
print(len(embeddings))

In [None]:
# Package the saved model directory as a gzipped tarball.
export_dir = 'transformer'
with tarfile.open('model.tar.gz', mode='w:gz') as archive:
    archive.add(export_dir)  # directories are added recursively by default

# Upload the tarball to S3.
(
    boto3.Session()
    .resource("s3")
    .Bucket(bucket)
    .upload_file(Filename='model.tar.gz', Key='model/transformer/model.tar.gz')
)

In [None]:
# Create Initial Embeddings (this can take a while)
def train():
    """Embed every cleaned description, save the vectors as JSON, upload to S3.

    Reads the notebook globals ``df`` (its ``clean_desc`` column), ``model``
    (the sentence encoder) and ``bucket``. Writes ``embeddings.json`` to the
    current working directory as ``{"embeddings": [[...], ...]}`` with one
    flat vector per description, in row order.

    Returns
    -------
    str
        S3 URI of the uploaded ``embeddings.json``.
    """
    embeddings_key = 'model/embeddings/embeddings.json'

    # Encode the whole corpus in a single call so the encoder can batch the
    # work (and draw its own progress bar) instead of one call per row.
    vectors = model.encode(list(df["clean_desc"]), show_progress_bar=True)

    # NumPy arrays are not JSON serializable; convert to nested Python lists.
    embeddings = vectors.tolist()

    # WARNING/TODO : NN doesn't accept json as input so this'll need to be changed
    with open("embeddings.json", "w") as f:
        # No indentation: pretty-printing would put every float on its own line.
        json.dump({"embeddings": embeddings}, f)

    # Upload the embeddings to S3. `upload_file` returns None, so the
    # location is built explicitly from the bucket and key.
    boto3.Session().resource("s3").Bucket(bucket).upload_file("embeddings.json", embeddings_key)
    embeddings_location = f"s3://{bucket}/{embeddings_key}"

    print(embeddings_location)
    return embeddings_location

In [None]:
# Create Embedding Model

class StringPredictor(RealTimePredictor):
    """Predictor whose base class is configured with ``content_type='text/plain'``.

    NOTE(review): ``RealTimePredictor`` and its ``content_type`` argument look
    like SageMaker SDK v1 API -- confirm the installed SDK version provides them.
    """

    def __init__(self, endpoint_name, sagemaker_session):
        super().__init__(
            endpoint_name,
            sagemaker_session,
            content_type='text/plain',
        )
           

# Wrap the already-packaged transformer archive in a deployable model.
# `PyTorchModel` is used rather than the `PyTorch` estimator: an estimator
# describes a training job (it has no `model_data`/`predictor_cls` arguments
# and `create_model()` needs a finished training job), whereas here the model
# artifact already exists in S3.
embeddings_model = PyTorchModel(
    model_data=f"s3://{bucket}/model/transformer/model.tar.gz",
    role=role,
    entry_point='train_deploy.py',
    source_dir='./src',
    framework_version='1.3.1',
    py_version='py3',  # required alongside framework_version by SDK v2
    predictor_cls=StringPredictor,
)


In [None]:
# Test Embedding Model
# embeddings_predictor = embeddings_model.deploy(instance_type='ml.m5.large', initial_instance_count=1)

# test_payload = 'sweet wine with a hint of tartness'
# test_features = embeddings_predictor.predict(test_payload)
# test_embedding = json.loads(test_features)

# len(test_embedding)



# Nearest Neighbors Model "Training"


In [None]:
# Read the embeddings table from S3.
# NOTE(review): nothing in this notebook writes embeddings.csv.tar.gz
# (train() targets embeddings.json) -- confirm where this file is produced.
embeddings_df = pd.read_csv(
    filepath_or_buffer=f"s3://{bucket}/model/embeddings/embeddings.csv.tar.gz"
)

In [None]:
# Drop the final row, then preview the end of the frame.
# NOTE(review): this rebinds `embeddings_df` to a slice of itself, so every
# re-run of the cell removes one more row -- restart from the load cell first.
embeddings_df = embeddings_df.iloc[:-1]
embeddings_df.tail()


In [None]:

# Fit the nearest-neighbours index on the embedding rows (`fit` returns the
# estimator itself, so construction and fitting can be chained).
neigh = NearestNeighbors(n_neighbors=2).fit(embeddings_df)

# Serialize the fitted index and wrap it in a gzipped tarball.
joblib.dump(neigh, "model.joblib")
with tarfile.open('model.joblib.tar.gz', mode='w:gz') as archive:
    archive.add("model.joblib")

# Push the tarball to S3. `upload_file` returns None, so `inputs` is None.
inputs = (
    boto3.resource("s3")
    .Bucket(bucket)
    .upload_file(Filename='model.joblib.tar.gz', Key="model/nn/model.joblib.tar.gz")
)

# Nearest Neighbors Model Creation

In [None]:
# Describe the scikit-learn model: the artifact uploaded to S3 plus the
# inference script that serves it.
nn_model = SKLearnModel(
    entry_point="src/nn_inference.py",
    framework_version="0.20.0",
    role=role,
    model_data=f"s3://{bucket}/model/nn/model.joblib.tar.gz",
)

In [None]:
# Deploy the nearest-neighbours model on a single ml.m4.xlarge instance.
nn_predictor = nn_model.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

In [None]:
# Smoke test: query the endpoint with an all-zero, 384-element vector.
nn_predictor.predict({"embeddings": [0.0] * 384})


# Inference Pipeline


In [None]:

# One UTC timestamp shared by the model name and the endpoint name.
# (`strftime` lives in the `time` module; only `import time` is in scope.)
timestamp_prefix = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

model_name = "inference-pipeline-" + timestamp_prefix
endpoint_name = "inference-pipeline-ep-" + timestamp_prefix

# Chain the two models defined earlier in the notebook: the embedding model
# (`embeddings_model`) first, then the nearest-neighbours model.
pipeline_model = PipelineModel(
    name=model_name,
    role=role,
    models=[
        embeddings_model,
        nn_model,
    ],
)

pipeline_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c4.xlarge",
    endpoint_name=endpoint_name,
)

# Test Pipeline



In [None]:
# pipeline_model.predict("something sweet and without any bitterness")

# Clean Up

In [None]:
# Delete Endpoints 