In [1]:
# Librerías básicas
import pandas as pd
import numpy as np

# Librerías para importar tablas de BigQuery
from google.cloud import bigquery

client = bigquery.Client()

# Librerías para importar funciones que estructuran texto y predicen outputs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

# Dont show warnings 
import warnings
warnings.filterwarnings("ignore")

In [None]:
# query to get info from cdp 
demographic_sql = """
SELECT
  td_id,
  abi_firstname,
  LOWER(TRIM(REGEXP_REPLACE(NORMALIZE(abi_gender, nfd), r"\pM", ''))) AS abi_gender,
FROM
  abi-martech-global.maz_col_cdp_inbound.L2_attributes
WHERE
  classification_category IN ('gold',
    'diamond')
  AND abi_gender IS NOT NULL
  AND abi_firstname IS NOT null
"""

In [None]:
# train model 
df = client.query(demographic_sql).to_dataframe()
# Tomar solo géneros validos y codificar variable
df['abi_gender1']=np.where(df.abi_gender.isin(['male']),'Male',
                       np.where(df.abi_gender.isin(['female']),'Female',
                                np.where(df.abi_gender.isin(['other','otro','o']),'Other',df.abi_gender)))

df = df[df['abi_gender1'].isin(['Female', 'Male'])]
df['abi_gender1'].replace({'Female':0,'Male':1},inplace=True)

# Tratamiento de texto
df['abi_firstname']=df['abi_firstname'].str.replace('\d+', '')    
df['abi_firstname']=df['abi_firstname'].str.replace(r'\b(\w{1,3})\b', '')
df['abi_firstname']=df['abi_firstname'].str.lower()
df['abi_firstname']=df['abi_firstname'].str.normalize('NFKD').str.encode('ascii',errors='ignore').str.decode('utf-8')

# Dataset con consumidores que tienen género femenino o masculino y que tienen nombre
df1=df[df.abi_gender1.isin([0,1])]
df1['abi_gender1']= pd.to_numeric(df1['abi_gender1'])
df1=df1[df1.abi_firstname.notnull()]
df1=df1[df1.abi_firstname!='']
df1=df1[df1.abi_firstname!=' ']

# Transformar texto
Xfeatures =df1['abi_firstname']
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)
y = df1.abi_gender1.values

# Asignar train y test y correr modelo
X_train, X_test, y_train, y_test, Xfeatures_train, Xfeatures_test = train_test_split(X, y, Xfeatures, test_size=0.20, random_state=42)
clf = MultinomialNB()
clf.fit(X_train,y_train)
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")
y_pred=(clf.predict(X_test))
print(confusion_matrix(y_test, y_pred))

In [4]:
import joblib
from google.cloud import storage
client = storage.Client()
# save model
bucket_name = 'abi-martech-maz-col-local'
model_file_path = 'gender_model/modelo_entrenado.joblib'
joblib.dump(clf, 'modelo_entrenado.joblib')
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(model_file_path)
blob.upload_from_filename('modelo_entrenado.joblib')
#guardar vectorizador
model_file_path = 'gender_model/vectorizador.joblib'
joblib.dump(cv, 'vectorizador.joblib')
blob = bucket.blob(model_file_path)
blob.upload_from_filename('vectorizador.joblib')

NameError: name 'clf' is not defined

In [5]:
def if_tbl_exists(client, table_ref):
    from google.cloud.exceptions import NotFound
    try:
        client.get_table(table_ref)
        return True
    except NotFound:
        return False

def score_model(): 
    # Cargar el modelo desde el archivo
    import joblib
    import pandas as pd
    import numpy as np
    import warnings
    from google.cloud import bigquery
    from google.cloud import storage

    client = storage.Client()
    warnings.filterwarnings("ignore")

    # Cargar el modelo del bucket
    bucket_name = 'abi-martech-maz-col-local'
    model_file_path = 'gender_model/modelo_entrenado.joblib'

    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(model_file_path)
    # Carga directamente el modelo desde el objeto blob
    with blob.open("rb") as file:
        modelo_cargado = joblib.load(file)

    # Cargar vectorizador
    model_file_path = 'gender_model/vectorizador.joblib'
    blob = bucket.blob(model_file_path)
    # Carga directamente el modelo desde el objeto blob
    with blob.open("rb") as file:
        vectorizador = joblib.load(file)
    
    client = bigquery.Client()
    table_id = 'abi-martech-maz-col.maz_col_sandbox.atribucion_genero'

    if not if_tbl_exists(client, table_id):
        schema = [
        bigquery.SchemaField('td_id', 'STRING'),
        bigquery.SchemaField('abi_gender_pred', 'STRING')
        ]
        
        table = bigquery.Table(table_id, schema=schema)
        client.create_table(table)

    query = """
    SELECT
      td_id,
      abi_firstname,
      LOWER(TRIM(REGEXP_REPLACE(NORMALIZE(abi_gender, nfd), r"\pM", ''))) AS abi_gender,
    FROM
      `abi-martech-global.maz_col_cdp_inbound.L2_attributes` a
    WHERE
      classification_category IN ('gold',
        'diamond')
      AND abi_gender IS NULL
      AND abi_firstname IS NOT NULL
      AND NOT EXISTS (
      SELECT
        td_id
      FROM
        `abi-martech-maz-col.maz_col_sandbox.atribucion_genero` b
      WHERE
        a.td_id = b.td_id )
    """

    connombre_singenero = client.query(query).to_dataframe()
    # Aplicar reglas para revision 
    connombre_singenero['abi_firstname']=connombre_singenero['abi_firstname'].str.replace('\d+', '')    
    connombre_singenero['abi_firstname']=connombre_singenero['abi_firstname'].str.replace(r'\b(\w{1,3})\b', '')
    connombre_singenero['abi_firstname']=connombre_singenero['abi_firstname'].str.lower()
    connombre_singenero['abi_firstname']=connombre_singenero['abi_firstname'].str.normalize('NFKD').str.encode('ascii',errors='ignore').str.decode('utf-8')
    connombre_singenero1=connombre_singenero[connombre_singenero.abi_firstname.notnull()]
    connombre_singenero1=connombre_singenero1[connombre_singenero1.abi_firstname!='']
    connombre_singenero1=connombre_singenero1[connombre_singenero1.abi_firstname!=' ']
    
    sample_name = list(connombre_singenero1['abi_firstname'].values)
    if len(sample_name) > 0:
        # Obtener las predicciones de género
        calificaciones = modelo_cargado.predict(vectorizador.transform(sample_name))
        # Asignar género
        calificaciones_genero = ['Male' if i == 1 else 'Female' for i in calificaciones]
        # Asignar las predicciones al DataFrame
        connombre_singenero1['abi_gender_pred'] = calificaciones_genero
        # Exportar resultados BigQuery - Sandbox
        connombre_singenero1[['td_id', 'abi_gender_pred']].to_gbq('maz_col_sandbox.atribucion_genero','abi-martech-maz-col', if_exists='append')
    
    else:
        print("No new records for score")    

In [6]:
score_model()

Forbidden: 403 GET https://storage.googleapis.com/storage/v1/b/abi-martech-maz-col-local?projection=noAcl&prettyPrint=false: Andres.GonzalezA@ab-inbev.com does not have storage.buckets.get access to the Google Cloud Storage bucket. Permission 'storage.buckets.get' denied on resource (or it may not exist).