In [None]:
import logging

# Configure the root logger once for the notebook: records at DEBUG and above
# are emitted, each prefixed with a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
# Debugging example
def divide(a, b):
    """Return ``a / b``, logging the operands and the quotient at DEBUG level.

    Args:
        a: Dividend.
        b: Divisor.

    Returns:
        The value of ``a / b``.

    Raises:
        ZeroDivisionError: Propagated from ``a / b`` when ``b`` is zero
            (for built-in numeric types).
    """
    # Hand the values to logging as arguments instead of pre-formatting them:
    # the message string is then only built if a DEBUG record is emitted.
    logging.debug("Dividing %s by %s", a, b)
    result = a / b
    logging.debug("Result: %s", result)
    return result

In [None]:
# Example call; as the cell's last expression, the quotient is displayed as output.
divide(a=5, b=2)

In [None]:
def sql_to_dataframe(sql, token: str, project=None):
    """Run a SQL query on BigQuery and return the result as a DataFrame.

    Args:
        sql: Query text passed to ``bigquery.Client.query``.
        token: OAuth 2.0 access token used to build the client credentials.
        project: Project the client is created for. When omitted, the
            notebook-level ``project_id`` variable is used, so existing
            two-argument calls keep working.

    Returns:
        The result of ``client.query(sql).to_dataframe()``.

    Raises:
        NameError: If ``project`` is omitted and no global ``project_id``
            has been defined yet.
    """
    from google.cloud import bigquery
    from google.oauth2 import credentials

    if project is None:
        # Backward-compatible fallback to the notebook-level global.
        project = project_id

    # Use the locally imported module. The fully qualified name
    # ``google.oauth2.credentials`` is only resolvable if some other cell has
    # already run ``import google.oauth2.credentials``.
    creds = credentials.Credentials(token)
    client = bigquery.Client(project=project, credentials=creds)

    return client.query(sql).to_dataframe()

In [None]:
import google.oauth2.credentials
token = !gcloud auth print-access-token
token_str = token[0]

project_id = 'divg-josh-pr-d1cc3a'
dataset_id = 'breast_cancer'
table_id = 'breast_cancer_X_val'

sql = '''SELECT * FROM `{project_id}.{dataset_id}.{table_id}` '''.format(project_id=project_id,
                                                                        dataset_id=dataset_id,
                                                                        table_id=table_id,
                                                                        )

df = sql_to_dataframe(sql=sql, token=token_str)

df.head() 

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Sample data.
# SMOTE creates synthetic rows by interpolating between a minority-class row
# and its nearest minority-class neighbours, so the minority class must have
# at least two rows. The first three rows are the original example; the rest
# were added so class 1 has two rows and the resampling step can run.
data = {
    'date': ['2023-01-01', '2023-01-02', '2023-01-03',
             '2023-01-04', '2023-01-05', '2023-01-06'],
    'float_col': [1.1, 2.2, 3.3, 4.4, 5.5, 6.6],
    'string_col': ['A', 'B', 'A', 'B', 'A', 'B'],
    'category_col': ['cat1', 'cat2', 'cat1', 'cat2', 'cat1', 'cat2'],
    'integer_col': [1, 2, 3, 4, 5, 6],
    'target': [0, 1, 0, 0, 1, 0]
}

df = pd.DataFrame(data)

# Convert date to numerical features
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
# Reassign instead of dropping in place, so the step reads as a plain transformation.
df = df.drop(columns='date')

# Define the preprocessing steps
categorical_features = ['string_col', 'category_col']
numeric_features = ['float_col', 'integer_col', 'year', 'month', 'day']

# One-hot encode categorical features and standardize numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', StandardScaler(), numeric_features)
    ]
)

# Split the data into features and target
X = df.drop('target', axis=1)
y = df['target']

# Apply the preprocessor to the data
X_processed = preprocessor.fit_transform(X)

# Apply SMOTE.
# The default k_neighbors=5 needs more than five minority-class rows; with
# fewer, fit_resample raises ValueError. Cap k_neighbors at the number of
# *other* minority rows available and fail early with a clear message when
# there are none.
minority_count = int(y.value_counts().min())
if minority_count < 2:
    raise ValueError(
        "SMOTE needs at least 2 samples in the minority class, "
        f"got {minority_count}"
    )
smote = SMOTE(random_state=42, k_neighbors=min(5, minority_count - 1))
X_res, y_res = smote.fit_resample(X_processed, y)

print("Original dataset shape:", X.shape)
print("Resampled dataset shape:", X_res.shape)