In [None]:
### This notebook loads a model from Hugging Face and creates a SQL function that accepts TEXT and returns the scored result
###


#### To use this notebook, you need to create an External Access Integration (EAI) for Hugging Face

#
#-- create network rule and eai
#CREATE OR REPLACE NETWORK RULE hf_network_rule
#  MODE = EGRESS
#  TYPE = HOST_PORT
#  VALUE_LIST = ('huggingface.co','cdn-lfs-us-1.huggingface.co');
#
#CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION hf_access_integration
#  ALLOWED_NETWORK_RULES = (hf_network_rule)
#  ENABLED = true;
#
#  
#ALTER NOTEBOOK HUGGIN_NOTEBOOK
#SET EXTERNAL_ACCESS_INTEGRATIONS=(hf_access_integration);


### You should also have a stage (@lyrics_input) containing a docx file for profanity detection.
# The code can be adjusted to use a table or a text file as the model input instead.


# Import python packages
import streamlit as st
import pandas as pd
import warnings
import io
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.functions import col
from snowflake.ml.registry import Registry
from snowflake.ml.model.model_signature import FeatureSpec, DataType, ModelSignature
import os
# Point the Hugging Face home/cache directory at /tmp. This is set before
# `transformers` is imported below -- presumably so the library picks up a
# writable location in the notebook runtime (TODO confirm for your environment).
os.environ['HF_HOME'] = '/tmp'
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Hugging Face library used to pull open source models
from transformers import pipeline

from datetime import datetime
# Capture the current local time and format it as "YYYY/MM/DD HH:MM:SS".
now = datetime.now()
dt_string = now.strftime("%Y/%m/%d %H:%M:%S")
# Bare expression: as the last statement of the cell, its value is shown as the cell output.
dt_string



In [None]:
# Set variables for the Hugging Face model we want to work with --- open source models will be IMPORTED and hosted in Snowflake.
# The choice of model is VERY flexible: it can be llama-3, a specialized model, etc.
# This model returns a confidence level that the text is offensive.
# You can easily change this to a model that provides more detail about why it's offensive.

# Hugging Face model to import. Alternative candidates are listed here;
# assign one of them to model_name instead to switch models.
#   'martin-ha/toxic-comment-model'
#   'JungleLee/bert-toxic-comment-classification'
#   'citizenlab/distilbert-base-multilingual-cased-toxicity'
#   'pykeio/lite-toxic-comment-classification'
model_name = 'parsawar/profanity_model2.0'

# Derive the name the model is saved under: '-' and '/' become '_',
# '.' is dropped, and '_custom' is appended.
save_model_name = model_name.translate(str.maketrans({'-': '_', '/': '_', '.': ''})) + '_custom'


In [None]:
# Obtain the Snowflake session that is already active for this notebook
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Print the role / warehouse / database / schema the session is currently using
for _label, _getter in (
    ('Role:     ', session.get_current_role),
    ('Warehouse:', session.get_current_warehouse),
    ('Database: ', session.get_current_database),
    ('Schema:   ', session.get_current_schema),
):
    print(_label, _getter())


In [None]:
# Snowflake ML registry object used to save and load models,
# bound to the session's current database and schema
registry_database = session.get_current_database()
registry_schema = session.get_current_schema()
reg = Registry(
    session=session,
    database_name=registry_database,
    schema_name=registry_schema,
)


In [None]:
# Get the model from Hugging Face as a text-classification pipeline.
# Make sure it fits into a Snowflake warehouse and does not require a GPU,
# because we are using regular warehouses today (typically a Snowpark
# warehouse, because models are big).
# Otherwise the model must be deployed in Snowpark Container Services.
pipe = pipeline(task="text-classification", model=model_name)

In [None]:
# OPTION 1
# Customize the model signature: one STRING input named TEXT,
# and a STRING label plus a DOUBLE score as outputs.
signature_inputs = [
    FeatureSpec(dtype=DataType.STRING, name='TEXT'),
    # FeatureSpec(dtype=DataType.BOOLEAN, name='aggregate'),
]
signature_outputs = [
    FeatureSpec(dtype=DataType.STRING, name='label'),
    FeatureSpec(dtype=DataType.DOUBLE, name='score'),
]
model_sig = ModelSignature(inputs=signature_inputs, outputs=signature_outputs)

# Register/save the pipeline to Snowflake under save_model_name, attaching the
# signature above to the 'predict' method.
snow_model_custom = reg.log_model(
    pipe,
    model_name=save_model_name,
    conda_dependencies=['tokenizers', 'transformers'],
    signatures={'predict': model_sig},
)



In [None]:
# CHECK IT WORKED:
# run the model we just loaded/registered on a couple of sample sentences

# Build a small test DataFrame with a single TEXT column
df = session.create_dataframe([
    {'TEXT': sample_text}
    for sample_text in (
        'You are awesome bro!',
        'You are a terrible person and so smelly!',
    )
])

# Score the test rows, cache the result, then display it
sentiment_values = snow_model_custom.run(df).cache_result()
sentiment_values.show(n=15, max_width=1000)

In [None]:
# Persist the scored rows by appending them to a results table
results_writer = sentiment_values.write
results_writer.save_as_table("MY_SENTIMENT_RESULTS", mode="append")

In [None]:
# Show the name the model was saved under; this is the identifier to use when calling the model from SQL.
print(save_model_name)

In [None]:
-- Call the registered model's predict method directly from SQL
SELECT
    parsawar_profanity_model20_custom!predict('You are awesome') AS scored_result