
# Wine in a Million

### Authors: __[Zephyr Headley](https://github.com/jzheadley)__ and __[John Naylor](https://jonaylor.xyz)__


In [None]:
!pip install sentence_transformers

import functools
import json
import os
import tarfile
import time
from string import punctuation

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.pipeline import PipelineModel
from sagemaker.predictor import RealTimePredictor
from sagemaker.pytorch import PyTorch, PyTorchModel
from sentence_transformers import SentenceTransformer

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the text-cleaning step further down.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Execution role that the SageMaker objects in this notebook are created with.
role = get_execution_role()

print(f'SageMaker SDK Version: {sagemaker.__version__}')

In [None]:
# Template values -- replace the concrete settings below with your own.
# bucket = "<S3_BUCKET>"
# prefix = "<S3_KEY_PREFIX>"
# filename = "<DATASET_FILENAME>"

bucket = "winemag-data-wineinamillion-23452"
prefix = "data/raw/"
filename = "winemag-data-130k-v2.csv"

# Guard against running the notebook with the template placeholders left in.
assert bucket != "<S3_BUCKET>"
assert prefix != "<S3_KEY_PREFIX>"
assert filename != "<DATASET_FILENAME>"

# `prefix` already ends with "/", so it is concatenated directly with the
# file name to form the full S3 URI of the raw dataset.
raw_data_location = f"s3://{bucket}/{prefix}{filename}"

In [None]:
# Load the raw dataset from the S3 location configured above into a DataFrame,
# then display pandas' summary statistics for it.
df = pd.read_csv(raw_data_location)
df.describe()

In [None]:
# Preview the first five rows of the raw dataset.
df.head(5)

In [None]:
# Print the description text of the row labelled 0 as a sample of the raw input.
print(df["description"][0])

# Preprocess Dataframe & Clean Data

In [None]:
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans("", "", punctuation)


@functools.lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@functools.lru_cache(maxsize=1)
def _lemmatizer():
    """Return one shared WordNetLemmatizer instead of building one per row."""
    return WordNetLemmatizer()


def clean_data(desc):
    """Normalise a single description string.

    Steps, in order: lowercase and split on whitespace, drop English stop
    words, strip punctuation characters, tokenize with NLTK and lemmatize
    each token.

    Args:
        desc: The raw description text. Anything that is not a ``str``
            (for example the NaN pandas uses for a missing cell) is treated
            as an empty description.

    Returns:
        str: The lemmatized tokens joined by single spaces; ``""`` when
        ``desc`` is not a string.
    """
    if not isinstance(desc, str):
        return ""

    stop_words = _english_stopwords()
    # NOTE: stop words are filtered before punctuation is stripped, so a token
    # with attached punctuation (e.g. "the,") is not recognised as a stop word.
    # This ordering is kept as-is so the output matches the original pipeline.
    kept = " ".join(w for w in desc.lower().split() if w not in stop_words)
    no_punct = kept.translate(_PUNCTUATION_TABLE)

    lemmatize = _lemmatizer().lemmatize
    return " ".join(lemmatize(token) for token in nltk.word_tokenize(no_punct))
    


# Store a cleaned copy of every description alongside the raw text and show
# the first few results.
df["clean_desc"] = df["description"].map(clean_data)

print(df["clean_desc"].head())

In [None]:
# Write the preprocessed dataset to a local CSV, then upload it to S3.
df.to_csv("cleaned_dataset.csv")

# S3 keys always use "/" separators, so the key is spelled out rather than
# assembled with os.path.join (which would follow the local OS convention).
clean_dataset_key = "data/clean/dataset.csv"

# The chain is wrapped in parentheses so it can span several lines.
(
    boto3.Session()
    .resource("s3")
    .Bucket(bucket)
    .Object(clean_dataset_key)
    .upload_file("cleaned_dataset.csv")
)

# upload_file returns None, so record the object's S3 URI explicitly.
inputs = f"s3://{bucket}/{clean_dataset_key}"


# Sentence-BERT Embeddings


In [None]:
# Download the pretrained sentence-transformer and save it to a local
# directory; that directory is what later gets packaged for hosting.

model_name = 'sentence-transformers/all-MiniLM-L6-v2'
saved_model_dir = 'transformer'

# Create the target directory if it is not there yet.
os.makedirs(saved_model_dir, exist_ok=True)

model = SentenceTransformer(model_name)
model.save(saved_model_dir)

# Sanity check: embed the first cleaned description and print the vector length.
embeddings = model.encode(df["clean_desc"][0])
print(len(embeddings))

In [None]:
# Package the saved model directory as a gzipped tarball and upload it to S3.

export_dir = 'transformer'
model_archive = 'model.tar.gz'
with tarfile.open(model_archive, mode='w:gz') as archive:
    # NOTE(review): this stores the files under a top-level "transformer/"
    # folder inside the archive -- confirm the inference script expects that.
    archive.add(export_dir, recursive=True)


# Upload the model to S3
model_key = "model/transformer/model.tar.gz"

# Upload the archive that was just written (it lives in the working
# directory, not inside export_dir).
(
    boto3.Session()
    .resource("s3")
    .Bucket(bucket)
    .Object(model_key)
    .upload_file(model_archive)
)

# upload_file returns None, so record the object's S3 URI explicitly.
transformer_location = f"s3://{bucket}/{model_key}"
transformer_location

In [None]:
# Create Initial Embeddings (this can take a while)
def train():
    """Embed every cleaned description, save the vectors as JSON and upload them.

    Reads the module-level ``df`` (column ``clean_desc``), ``model`` and
    ``bucket``. Writes ``embeddings.json`` to the working directory.

    Returns:
        str: S3 URI of the uploaded ``embeddings.json``.
    """
    # Encode the whole column in one call. The built-in progress bar replaces
    # the tqdm loop, which relied on a module this notebook never imports.
    vectors = model.encode(df["clean_desc"].tolist(), show_progress_bar=True)

    # WARNING/TODO : KNN doesn't accept json as input so this'll need to be changed
    # numpy arrays are not JSON serialisable, so convert to nested lists first.
    json_str = {"embeddings": vectors.tolist()}
    with open("embeddings.json", "w", encoding="utf-8") as f:
        json.dump(json_str, f, indent=6)

    # Upload the embeddings to S3
    embeddings_key = "model/embeddings/embeddings.json"
    (
        boto3.Session()
        .resource("s3")
        .Bucket(bucket)
        .Object(embeddings_key)
        .upload_file("embeddings.json")
    )

    # upload_file returns None, so build the object's S3 URI explicitly.
    embeddings_location = f"s3://{bucket}/{embeddings_key}"

    print(embeddings_location)
    return embeddings_location

In [None]:
# Create Embedding Model

class StringPredictor(RealTimePredictor):
    """RealTimePredictor preconfigured with the ``text/plain`` content type."""

    def __init__(self, endpoint_name, sagemaker_session):
        super().__init__(
            endpoint_name,
            sagemaker_session,
            content_type='text/plain',
        )
           

# The artifact to host is an already-saved model tarball, so it is wrapped in
# a PyTorchModel (which accepts model_data and predictor_cls) instead of a
# PyTorch training estimator. model_data points at the uploaded model archive
# rather than at the cleaned dataset CSV.
embeddings_model = PyTorchModel(
    model_data=transformer_location,
    role=role,
    entry_point='train_deploy.py',
    source_dir='./src',
    framework_version='1.3.1',
    py_version='py3',
    predictor_cls=StringPredictor,
)

# embeddings_predictor = embeddings_model.deploy(instance_type='ml.m5.large', initial_instance_count=1)

In [None]:
# Test Embedding Model
# Requires `embeddings_predictor`, i.e. the predictor returned by the deploy
# call that is commented out in the previous cell; run that deploy first.

test_payload = 'sweet wine with a hint of tartness'
test_features = embeddings_predictor.predict(test_payload)
# NOTE(review): assumes the endpoint answers with a JSON-encoded vector --
# confirm against the inference script.
test_embedding = json.loads(test_features)

len(test_embedding)



# KNN Section


In [None]:
# Configure the k-NN estimator: hyperparameters, output location, container
# image and instance settings.

hyperparams = dict(
    feature_dim=384,
    k=5,
    sample_size=200000,
    predictor_type="classifier",
)
output_path = f"s3://{bucket}/knn/output"

# Resolve the k-NN container image for the current session's region.
knn_image = get_image_uri(boto3.Session().region_name, "knn")

knn = sagemaker.estimator.Estimator(
    knn_image,
    get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
)
knn.set_hyperparameters(**hyperparams)

In [None]:
# Train a model. fit_input contains the locations of the train and test data

# The URI must include the bucket; without it "model" would be read as the
# bucket name.
# NOTE(review): the training data is still a JSON file, and a TODO elsewhere in
# this notebook says k-NN does not accept JSON -- confirm the input format.
s3_train_data = f"s3://{bucket}/model/embeddings/embeddings.json"
s3_test_data = None

fit_input = {"train": s3_train_data}
# The "test" channel is optional and only added when a location is configured.
if s3_test_data is not None:
    fit_input["test"] = s3_test_data

knn.fit(fit_input)

knn_model = knn.create_model()

In [None]:
# Deploy KNN Predictor

# instance_type = "ml.m4.xlarge"
# model_name = "knn_%s" % instance_type
# endpoint_name = "knn-ml-m4-xlarge-%s" % (str(time.time()).replace(".", "-"))

# print("setting up the endpoint..")

#knn_predictor = knn.deploy(
#    initial_instance_count=1, 
#    instance_type=instance_type, 
#    endpoint_name=endpoint_name
# )
# knn_predictor.serializer = CSVSerializer()
# knn_predictor.deserializer = JSONDeserializer()

In [None]:
# Test KNN Model
# Requires `knn_predictor`, i.e. the predictor created by the (commented-out)
# KNN deploy cell above, configured to return parsed JSON.
# NOTE(review): `test_labels` is not defined anywhere in this notebook, and
# `test_features` is only assigned from the embedding endpoint's response --
# both need to hold the evaluation vectors / labels before this cell can run.
batches = np.array_split(test_features, 100)
print(f"data split into 100 batches, of size {batches[0].shape[0]}.")

# obtain an np array with the predictions for the entire test set
start_time = time.time()
predictions = []
for batch in batches:
    result = knn_predictor.predict(batch, initial_args={"ContentType": "text/csv"})
    cur_predictions = np.array(
        [prediction["predicted_label"] for prediction in result["predictions"]]
    )
    predictions.append(cur_predictions)
predictions = np.concatenate(predictions)
run_time = time.time() - start_time

test_size = test_labels.shape[0]
# Count matches in numpy rather than iterating the boolean array in Python.
num_correct = np.count_nonzero(predictions == test_labels)
accuracy = num_correct / float(test_size)
print("time required for predicting %d data point: %.2f seconds" % (test_size, run_time))
print("accuracy of model: %.1f%%" % (accuracy * 100))


# Inference Pipeline


In [None]:

# Timestamp (UTC) used to give the pipeline model and its endpoint unique names.
timestamp_prefix = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

model_name = "inference-pipeline-" + timestamp_prefix
endpoint_name = "inference-pipeline-ep-" + timestamp_prefix

# Chain the two models: the embedding model first, then the k-NN model.
# `embeddings_model` is the name the embedding model is created under earlier
# in this notebook.
pipeline_model = PipelineModel(
    name=model_name,
    role=role,
    models=[
        embeddings_model,
        knn_model,
    ],
)

pipeline_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c4.xlarge",
    endpoint_name=endpoint_name,
)

# Test Pipeline



In [None]:
# pipeline_model.predict("something sweet and without any bitterness")