In [1]:
# # Librerías básicas
# import pandas as pd
# import numpy as np

# # Librerías para importar tablas de BigQuery
# from google.cloud import bigquery

# client = bigquery.Client()

# # Librerías para importar funciones que estructuran texto y predicen outputs
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import confusion_matrix

# # Don't show warnings
# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# Basic Libraries
import pandas as pd
import numpy as np

# Libraries for importing BigQuery tables
from google.cloud import bigquery

client = bigquery.Client()

# Libraries for importing text processing and prediction functions
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# SQL query to get information from CDP.
# It returns td_id, abi_firstname and a cleaned abi_gender (NFD-normalized,
# characters matching \pM removed, trimmed, lower-cased) for rows whose
# classification_category is 'gold' or 'diamond' and whose gender and first
# name are both non-null.
# The Python literal is a raw string so that the backslash in the BigQuery
# regex `\pM` is passed through verbatim instead of being parsed as an
# (invalid) Python escape sequence.
demographic_sql = r"""
SELECT
  td_id,
  abi_firstname,
  LOWER(TRIM(REGEXP_REPLACE(NORMALIZE(abi_gender, nfd), r"\pM", ''))) AS abi_gender,
FROM
  abi-martech-global.maz_col_cdp_inbound.L2_attributes
WHERE
  classification_category IN ('gold',
    'diamond')
  AND abi_gender IS NOT NULL
  AND abi_firstname IS NOT null
"""

# Run the query with the BigQuery client and load the result into a DataFrame
df = client.query(demographic_sql).to_dataframe()

# Take only valid genders and encode the variable.
# Step 1: map the raw gender strings to canonical labels; any value that is
# not listed keeps its raw text (and is therefore discarded in step 2).
gender_labels = {
    'male': 'Male',
    'female': 'Female',
    'other': 'Other',
    'otro': 'Other',
    'o': 'Other',
}
df['abi_gender1'] = df['abi_gender'].map(gender_labels).fillna(df['abi_gender'])

# Step 2: keep only Female/Male rows. `.copy()` makes the filtered frame
# independent, so the assignment below writes to `df` itself rather than to a
# temporary (an in-place `replace` on the selected column is not guaranteed to
# propagate back to the frame).
df = df[df['abi_gender1'].isin(['Female', 'Male'])].copy()

# Step 3: encode the target as integers: Female -> 0, Male -> 1.
df['abi_gender1'] = df['abi_gender1'].map({'Female': 0, 'Male': 1}).astype('int64')

# Text processing of the first name, applied in this order:
#   1. remove runs of digits
#   2. remove stand-alone tokens of 1-3 word characters
#   3. lower-case
#   4. NFKD-normalize and drop every non-ASCII character (strips accents)
# `regex=True` is required: without it recent pandas versions treat the
# pattern as a literal substring and nothing is removed. The patterns are raw
# strings so the backslashes reach the regex engine unchanged.
df['abi_firstname'] = (
    df['abi_firstname']
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'\b(\w{1,3})\b', '', regex=True)
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

# Dataset with consumers having female or male gender and a name.
# `.copy()` gives df1 its own data so the column assignment below does not
# write into a slice of `df`.
df1 = df[df['abi_gender1'].isin([0, 1])].copy()
df1['abi_gender1'] = pd.to_numeric(df1['abi_gender1'])

# Drop rows whose name is missing or blank. Stripping first also catches names
# that the text cleaning reduced to several spaces, not only '' and ' '.
has_name = df1['abi_firstname'].notna() & (df1['abi_firstname'].str.strip() != '')
df1 = df1[has_name]

# Turn the cleaned first names into a token-count matrix (X) and pull the
# encoded gender out as the target array (y).
cv = CountVectorizer()
Xfeatures = df1['abi_firstname']
y = df1['abi_gender1'].values
X = cv.fit_transform(Xfeatures)

# Hold out 20% of the rows for testing (fixed seed for reproducibility); the
# raw names are split alongside X and y so they stay aligned with the rows.
(
    X_train,
    X_test,
    y_train,
    y_test,
    Xfeatures_train,
    Xfeatures_test,
) = train_test_split(X, y, Xfeatures, test_size=0.20, random_state=42)

# Fit a multinomial Naive Bayes classifier and predict on the held-out rows.
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Persistence dependencies: joblib writes the artifacts to disk and the Cloud
# Storage client uploads them.
from google.cloud import storage
import joblib

# NOTE(review): this rebinds `client`, which held the BigQuery client earlier
# in this cell; from here on `client` is the Cloud Storage client.
client = storage.Client()

# --- Trained classifier -----------------------------------------------------
# Dump to a local file first, then upload that file to the bucket.
bucket_name = 'abi-martech-maz-col-local'
model_file_path = 'gender_model/modelo_entrenado.joblib'
_model_local_file = 'modelo_entrenado.joblib'
joblib.dump(clf, _model_local_file)
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(model_file_path)
blob.upload_from_filename(_model_local_file)

# --- Fitted vectorizer ------------------------------------------------------
# Same bucket, different object path; `model_file_path` and `blob` are reused.
model_file_path = 'gender_model/vectorizador.joblib'
_vectorizer_local_file = 'vectorizador.joblib'
joblib.dump(cv, _vectorizer_local_file)
blob = bucket.blob(model_file_path)
blob.upload_from_filename(_vectorizer_local_file)
