In [5]:
# Install required packages
!pip install sqlalchemy psycopg2-binary numpy pandas pillow scikit-learn

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text
from PIL import Image
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def load_image_data(db_url='postgresql://postgres:admin@localhost/ml_course'):
    """Load all rows of the ``synthetic_images`` table as NumPy arrays.

    Args:
        db_url: SQLAlchemy connection URL. Defaults to the local
            ``ml_course`` PostgreSQL database used by this notebook.

    Returns:
        A ``(X, y)`` tuple: ``X`` stacks the decoded images in row order,
        ``y`` holds the matching values of the ``label`` column.
    """
    engine = create_engine(db_url)
    try:
        with engine.connect() as conn:
            # text() wraps the raw SQL so SQLAlchemy accepts it as an executable
            query = text("SELECT image, label FROM synthetic_images")
            rows = conn.execute(query).fetchall()
    finally:
        # The engine is private to this call; release its connection pool
        # so repeated calls do not accumulate open connections.
        engine.dispose()

    images = []
    labels = []
    for img_bytes, label in rows:
        # Image.open is lazy; np.array() forces the decode while the
        # handle is still open, then the context manager closes it.
        with Image.open(BytesIO(img_bytes)) as img:
            images.append(np.array(img))
        labels.append(label)

    return np.array(images), np.array(labels)

def load_customer_data(db_url='postgresql://postgres:admin@localhost/ml_course'):
    """Load the ``synthetic_customers`` table into a DataFrame.

    Args:
        db_url: SQLAlchemy connection URL. Defaults to the local
            ``ml_course`` PostgreSQL database used by this notebook.

    Returns:
        A DataFrame with the columns ``age``, ``gender``, ``income`` and
        ``purchase_frequency``.
    """
    engine = create_engine(db_url)

    # Use the text() function to wrap the SQL query
    query = text("SELECT age, gender, income, purchase_frequency FROM synthetic_customers")
    try:
        # Read through an explicit, scoped connection (same pattern as
        # load_image_data) so it is returned as soon as the read finishes.
        with engine.connect() as conn:
            return pd.read_sql_query(query, conn)
    finally:
        # The engine is private to this call; release its connection pool.
        engine.dispose()

# Load the data
X_images, y_images = load_image_data()
customer_data = load_customer_data()

print("Image data shape:", X_images.shape)
print("Image labels shape:", y_images.shape)
print("\nCustomer data:")
print(customer_data.head())

# Preprocess image data
X_images = X_images / 255.0  # Normalize pixel values (assumes 8-bit pixels in 0-255)
X_images = X_images.reshape(-1, 28, 28, 1)  # Add a trailing channel axis for CNN input

# Preprocess customer data: numeric features and one-hot encoded target
X_customer = customer_data[['age', 'income']].values
y_customer = pd.get_dummies(customer_data['purchase_frequency']).values

# Split the data
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(
    X_images, y_images, test_size=0.2, random_state=42
)
# Split the raw customer features BEFORE scaling so that the scaler never
# sees the test rows.
X_train_cust_raw, X_test_cust_raw, y_train_cust, y_test_cust = train_test_split(
    X_customer, y_customer, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same
# transformation to the test rows; fitting on the full dataset would leak
# test-set mean/variance into the training features.
scaler = StandardScaler()
X_train_cust = scaler.fit_transform(X_train_cust_raw)
X_test_cust = scaler.transform(X_test_cust_raw)

# Kept for later cells that reference the full scaled matrix; it is produced
# with the train-fitted scaler.
X_customer_scaled = scaler.transform(X_customer)

print("\nPreprocessing complete. Data split into training and test sets.")

Defaulting to user installation because normal site-packages is not writeable
Image data shape: (1000, 28, 28)
Image labels shape: (1000,)

Customer data:
   age gender        income purchase_frequency
0   56      M  55362.893658               High
1   69      M  41190.958734             Medium
2   46      F  47108.303191               High
3   32      M  47689.449260             Medium
4   60      M  70352.851613                Low

Preprocessing complete. Data split into training and test sets.
