# ***Imports***

In [None]:
import getpass
import os
import warnings

# Configure warning filters before the heavy third-party imports so that
# import-time warnings are suppressed as well.
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# The star import comes first so the explicit imports below always win on
# any name clash.
from teradataml import *

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn2pmml import sklearn2pmml, PMMLPipeline

# ***Connect to Teradata Vantage***

In [None]:
# Vantage host: defaults to the original demo address, but can be overridden
# through the VANTAGE_HOST environment variable so the notebook can be pointed
# at another system without editing the code.
host = os.environ.get('VANTAGE_HOST', '172.16.0.12')
# getpass keeps what is typed out of the notebook output.
username = getpass.getpass("Username: ")
password = getpass.getpass("Password: ")

# Create connection context
eng = create_context(host=host, username=username, password=password)
print("Connected to Vantage:", eng)

# ***Data exploration***

### Load data from Vantage table

In [None]:
# Reference the events table in Vantage through a teradataml DataFrame.
events_table = in_schema('Source_data_db', 'jcr_fake_events')
tdf = DataFrame(events_table)

In [None]:
# Preview a few rows of the events table (rendered as the cell output).
tdf.head()

In [None]:
# Inspect the SQL that teradataml generated for the preview above.
tdf.head().show_query()

### Check data types and basic statistics

In [None]:
# Summarize the table: dimensions, column metadata and descriptive statistics.
dataset_shape = tdf.shape
print("Dataset shape:", dataset_shape)

print("\nColumn information:")
tdf.info()

print("\nBasic statistics:")
# Last expression of the cell, so the notebook renders the statistics table.
tdf.describe()

### Load and preview data

In [None]:
# Build a second teradataml DataFrame on the events table; it is converted to
# pandas in the next cell.
source_table = in_schema("source_data_db", "jcr_fake_events")
df1 = DataFrame(source_table)
df1

In [None]:
# Convert to pandas.
# all_rows=True is needed because to_pandas() otherwise stops at its default
# num_rows limit, which would silently truncate larger tables.
df1 = df1.to_pandas(all_rows=True)

# Check column names
print("Columns:", df1.columns)

# Normalize just in case (trim whitespace, lower-case)
df1.columns = df1.columns.str.strip().str.lower()
print("Normalized columns:", df1.columns)

# Parse the event timestamp so later cells can resample and subtract dates
df1['ts'] = pd.to_datetime(df1['ts'])

# NOTE: to_pandas() may have used the table's index column(s) as the pandas
# index, in which case they appear as the index here rather than as columns.
print(df1.head())


In [None]:
# to_pandas() may have promoted table column(s) to the pandas index. Move
# *named* index levels back into regular columns, but drop an anonymous
# positional index: keeping it would add a meaningless "index" column and
# make this cell fail when it is run a second time.
has_named_index = any(level_name is not None for level_name in df1.index.names)
df1 = df1.reset_index(drop=not has_named_index)

In [None]:
# Re-normalize the column labels (the reset above may have added columns
# whose names were not cleaned yet).
normalized_labels = df1.columns.str.strip().str.lower()
df1.columns = normalized_labels

# Make sure the timestamp column holds datetime values
parsed_ts = pd.to_datetime(df1['ts'])
df1['ts'] = parsed_ts


In [None]:
# 1. Product purchase frequency
# Count the rows per product and draw the counts as a bar chart.
product_counts = df1['product_id'].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(product_counts.index, product_counts.values, color='skyblue')
ax.set_title('Product Purchase Frequency')
ax.set_xlabel('Product ID')
ax.set_ylabel('Number of Purchases')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# 2. Customer purchase distribution
# Histogram of how many purchases each customer made.
customer_purchases = df1['customer_id'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(customer_purchases.values, bins=20, color='lightgreen', alpha=0.7)
ax.set_title('Purchases per Customer')
ax.set_xlabel('Number of Purchases')
ax.set_ylabel('Number of Customers')
fig.tight_layout()
plt.show()

In [None]:
# 3. Price distribution by product
# Create the axes first and hand them to pandas: with `by=...`,
# DataFrame.boxplot opens a figure of its own unless `ax` is given, which
# would leave a separately created plt.figure() empty.
fig, ax = plt.subplots(figsize=(10, 6))
df1.boxplot(column='price', by='product_id', ax=ax)
ax.set_title('Price Distribution by Product')
fig.suptitle('')  # remove the automatic "Boxplot grouped by ..." title
ax.set_xlabel('Product ID')
ax.set_ylabel('Price')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# 4. Purchases over time
# Bucket the events by calendar day and count the rows in each bucket.
events_by_ts = df1.set_index('ts')
df_time = events_by_ts.resample('D').size()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df_time.index, df_time.values, color='coral')
ax.set_title('Purchases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Purchases')
fig.tight_layout()
plt.show()

# ***Analyze each customer's sequence of product purchases***

# Copy to Vantage for nPath

In [None]:
# Upload only the columns the path analysis needs, replacing any table left
# over from a previous run.
npath_input_columns = ['customer_id', 'product_id', 'price', 'ts']
copy_to_sql(
    df1[npath_input_columns],
    table_name='npath_purchase_data',
    if_exists='replace',
)

# Run NPath to detect sequential purchases

In [None]:
# Pair every purchase (A) with the same customer's immediately following
# purchase (B), ordered by timestamp.
#
# mode="OVERLAPPING" lets a row that ended one match start the next one, so
# purchases 1->2, 2->3, 3->4, ... are all returned. With NONOVERLAPPING the
# search resumes after each match (1->2, 3->4, ...), which drops every second
# transition and halves the data available for next-product prediction.
npath_result = NPath(
    data1=DataFrame("npath_purchase_data"),
    data1_partition_column=["customer_id"],
    data1_order_column="ts",
    mode="OVERLAPPING",
    symbols=["TRUE AS A", "TRUE AS B"],
    pattern="A.B",
    result=[
        "FIRST(product_id OF A) AS current_product",
        "FIRST(product_id OF B) AS next_product",
        "FIRST(customer_id OF A) AS customer_id",
        "FIRST(ts OF A) AS current_ts",
        "FIRST(ts OF B) AS next_ts",
        "FIRST(price OF A) AS current_price",
        "FIRST(price OF B) AS next_price"
    ]
)

# all_rows=True: to_pandas() otherwise stops at its default num_rows limit
# and would silently truncate a large set of purchase pairs.
purchase_pairs_df = npath_result.result.to_pandas(all_rows=True)
purchase_pairs_df

# ***Generate target variable***

In [None]:
# Count how often each (current product -> next product) transition occurs
# and list the most frequent transitions first.
pair_sizes = purchase_pairs_df.groupby(["current_product", "next_product"]).size()
transition_counts = pair_sizes.reset_index(name="count")
transition_analysis = transition_counts.sort_values(by="count", ascending=False)

transition_analysis.head(10)

# Time-based features

In [None]:
# Work on a copy so the raw purchase pairs stay untouched.
sequences_df = purchase_pairs_df.copy()
for ts_column in ("current_ts", "next_ts"):
    sequences_df[ts_column] = pd.to_datetime(sequences_df[ts_column])

current_purchase_ts = sequences_df["current_ts"]
purchase_gap = sequences_df["next_ts"] - current_purchase_ts

# Gap between the two purchases, expressed in hours
sequences_df["time_between_purchases"] = purchase_gap.dt.total_seconds() / 3600
# Calendar features of the current purchase
sequences_df["hour"] = current_purchase_ts.dt.hour
sequences_df["day_of_week"] = current_purchase_ts.dt.dayofweek
sequences_df["day_of_month"] = current_purchase_ts.dt.day
# dayofweek is 0=Monday, so 5 and 6 are Saturday and Sunday
weekend_mask = sequences_df["day_of_week"].isin([5, 6])
sequences_df["is_weekend"] = weekend_mask.astype(int)

sequences_df.head()

# Customer-level aggregates

In [None]:
# Build one row of aggregate features per customer from the raw events.
df1["ts"] = pd.to_datetime(df1["ts"])

customer_features = (
    df1.groupby("customer_id")
    .agg(
        total_purchases=("product_id", "count"),
        avg_price=("price", "mean"),
        # std_price is expected by the later cleaning and feature-selection
        # cells; it is NaN for customers with a single purchase.
        std_price=("price", "std"),
        max_price=("price", "max"),
        min_price=("price", "min"),
        first_purchase=("ts", "min"),
        last_purchase=("ts", "max")
    )
    .reset_index()
)

# Days between first and last purchase
customer_features["customer_lifetime_days"] = (
    customer_features["last_purchase"] - customer_features["first_purchase"]
).dt.days

# NOTE: despite its name this is "active days per purchase" (lifetime divided
# by purchase count); the formula is kept as-is. replace(0, 1) guards the
# division against a zero purchase count.
customer_features["purchase_frequency"] = (
    customer_features["customer_lifetime_days"]
    / customer_features["total_purchases"].replace(0, 1)
)

# Recency is measured against the most recent event in the whole dataset
reference_date = df1["ts"].max()
customer_features["recency_days"] = (reference_date - customer_features["last_purchase"]).dt.days

# Final Dataset Creation

In [None]:
# Join each purchase pair with the aggregates of the customer who made it.
print("Creating final modeling dataset...")
model_data = sequences_df.merge(customer_features, on='customer_id', how='left')

print(f"Initial merged dataset shape: {model_data.shape}")
print(f"Available columns: {list(model_data.columns)}")

# Temporal features of the current purchase (created only when missing)
if 'current_ts' not in model_data.columns:
    print("WARNING: No current_ts column found, using default temporal features")
    model_data['current_hour'] = 12  # Default noon
    model_data['current_day_of_week'] = 2  # Default Wednesday
else:
    model_data['current_ts'] = pd.to_datetime(model_data['current_ts'])
    temporal_extractors = {
        'current_hour': lambda stamps: stamps.dt.hour,
        'current_day_of_week': lambda stamps: stamps.dt.dayofweek,
    }
    for feature_name, extract in temporal_extractors.items():
        if feature_name not in model_data.columns:
            model_data[feature_name] = extract(model_data['current_ts'])
            print(f"✓ Created {feature_name} feature")

# Price of the current product relative to the customer's average spend;
# the small constant keeps the denominator away from zero.
customer_avg_price = model_data['avg_price']
model_data['price_ratio'] = model_data['current_price'] / (customer_avg_price + 0.01)
model_data['is_expensive_product'] = (model_data['current_price'] > customer_avg_price).astype(int)

# Weekend flag derived from the day-of-week feature guaranteed above
model_data['is_weekend'] = model_data['current_day_of_week'].isin([5, 6]).astype(int)

# Fill gaps in the numeric customer features that are actually present
numeric_columns = ['std_price', 'customer_lifetime_days', 'purchase_frequency']
present_numeric = [name for name in numeric_columns if name in model_data.columns]
for name in present_numeric:
    model_data[name] = model_data[name].fillna(0)

# Rows without a target cannot be used for training
initial_rows = len(model_data)
model_data = model_data.dropna(subset=['next_product'])
final_rows = len(model_data)

print(f"Data cleaned: {initial_rows} -> {final_rows} rows")
print("Target variable distribution:")
if final_rows <= 0:
    print("ERROR: No valid data for training!")
else:
    print(model_data['next_product'].value_counts())
    print(f"Available columns after processing: {list(model_data.columns)}")

# ***Generate a predictive model using LightGBM and scikit-learn***

# Model Training with LightGBM

In [None]:
# Train a LightGBM classifier that predicts the next product from the
# engineered purchase-sequence features. The cell leaves X_train/X_test,
# y_train/y_test, lgbm_model, y_pred and y_pred_proba behind for later cells.

# Check if we have sufficient data for modeling
if 'model_data' not in locals() or len(model_data) < 10:
    print("ERROR: Insufficient data for modeling. Need at least 10 samples.")
    print("This likely means the NPath analysis didn't find enough sequential patterns.")
    print("Consider:")
    print("1. Checking if customers have multiple purchases")
    print("2. Verifying the date range has sufficient data")
    print("3. Reducing the time window for sequence detection")
elif model_data['next_product'].nunique() < 2:
    # A classifier needs at least two distinct target values.
    print("ERROR: Only one distinct next_product value found - nothing to classify.")
else:
    print(f"Proceeding with modeling using {len(model_data)} samples")

    # Encode categorical variables
    le_current = LabelEncoder()
    le_target = LabelEncoder()

    model_data['current_product_encoded'] = le_current.fit_transform(model_data['current_product'])
    model_data['next_product_encoded'] = le_target.fit_transform(model_data['next_product'])

    # Define available features (only use columns that exist)
    potential_features = [
        'current_price', 'time_between_purchases', 'current_hour', 'current_day_of_week',
        'avg_price', 'std_price', 'min_price', 'max_price', 'total_purchases',
        'customer_lifetime_days', 'purchase_frequency', 'price_ratio',
        'is_expensive_product', 'is_weekend', 'current_product_encoded'
    ]

    # Filter to only existing columns
    available_features = [f for f in potential_features if f in model_data.columns]
    print(f"Using features: {available_features}")

    # Prepare final datasets
    X = model_data[available_features].copy()
    y = model_data['next_product_encoded'].copy()

    # Check for any remaining data issues
    n_classes = len(le_target.classes_)
    print(f"Feature matrix shape: {X.shape}")
    print(f"Target classes: {n_classes} products: {list(le_target.classes_)}")

    # Verify no infinite or missing values
    infinite_cols = X.columns[X.isin([np.inf, -np.inf]).any()].tolist()
    if infinite_cols:
        print(f"WARNING: Infinite values found in: {infinite_cols}")
        X = X.replace([np.inf, -np.inf], 0)

    missing_cols = X.columns[X.isna().any()].tolist()
    if missing_cols:
        print(f"WARNING: Missing values found in: {missing_cols}")
        X = X.fillna(0)

    # Split data for training and testing.
    # A stratified split raises ValueError when a class has fewer than two
    # members or when either side of the split is smaller than the number of
    # classes, which is common for small datasets with many products. In that
    # case fall back to a plain random split instead of crashing.
    test_fraction = 0.2
    n_test = int(np.ceil(test_fraction * len(y)))
    n_train = len(y) - n_test
    can_stratify = (
        y.value_counts().min() >= 2
        and n_test >= n_classes
        and n_train >= n_classes
    )
    if not can_stratify:
        print("WARNING: Some classes are too rare for a stratified split; "
              "using a random split instead")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_fraction, random_state=42,
        stratify=y if can_stratify else None
    )

    print(f"\nData split completed:")
    print(f"• Training set: {X_train.shape}")
    print(f"• Test set: {X_test.shape}")

    # Configure LightGBM with more conservative settings for small datasets.
    # objective / num_class are intentionally not set: LGBMClassifier chooses
    # a binary or multiclass objective from the labels it sees in fit(), so
    # the same configuration works for two-class and many-class targets.
    n_samples = len(X_train)
    lgbm_model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        num_leaves=min(31, max(2, n_samples // 10)),  # Adaptive num_leaves
        learning_rate=0.1,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        min_data_in_leaf=max(1, n_samples // 50),  # Adaptive min_data_in_leaf
        verbose=-1,  # Suppress LightGBM log output
        random_state=42,
        n_estimators=min(100, max(10, n_samples // 2)),  # Adaptive n_estimators
        force_col_wise=True
    )

    print(f"\nTraining LightGBM model with {lgbm_model.n_estimators} estimators...")
    print(f"Model configured for dataset size: {n_samples} samples")

    # Train the model
    lgbm_model.fit(X_train, y_train)

    # Generate predictions (encoded class indices and per-class probabilities)
    y_pred = lgbm_model.predict(X_test)
    y_pred_proba = lgbm_model.predict_proba(X_test)

    print("✓ Model training completed successfully!")


# Evaluate model performance

In [None]:
# Evaluate model performance (only if model was trained successfully)
if 'model_data' in locals() and len(model_data) >= 10 and 'lgbm_model' in locals():

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print("Model Performance Results:")
    print("=" * 40)
    print(f"• Overall Accuracy: {accuracy:.4f} ({accuracy:.1%})")

    # Show detailed classification report
    if len(le_target.classes_) <= 10:  # Only show detailed report for manageable number of classes
        print(f"\nDetailed Classification Report:")
        # Pass the full label set explicitly: without `labels`, the report
        # raises when the test fold lacks one of the classes because the
        # number of target names no longer matches. Names are converted to
        # str so numeric product ids can be formatted, and zero_division=0
        # reports 0 instead of warning for classes that were never predicted.
        class_indices = list(range(len(le_target.classes_)))
        class_names = [str(product) for product in le_target.classes_]
        print(classification_report(
            y_test, y_pred,
            labels=class_indices,
            target_names=class_names,
            zero_division=0
        ))

    # Feature importance analysis
    feature_importance = pd.DataFrame({
        'feature': available_features,
        'importance': lgbm_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 5 Most Important Features:")
    print(feature_importance.head(5).to_string(index=False))

    # Create a simple visualization (most important feature at the top)
    plt.figure(figsize=(10, 6))
    top_features = feature_importance.head(min(10, len(feature_importance)))
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top Feature Importance for Next Purchase Prediction')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

else:
    print("⚠️ Model evaluation skipped - no trained model available")
    print("This indicates insufficient sequential purchase data in your dataset")
    if 'model_data' not in locals():
        print("ERROR: model_data variable not found - please run previous cells first")
    elif len(model_data) < 10:
        print(f"ERROR: Only {len(model_data)} samples available, need at least 10")
    else:
        print("ERROR: lgbm_model not found - model training may have failed")


# ***Import the model into Vantage***

In [None]:
# Export the trained LightGBM model to PMML and import into Vantage (BYOM)
if 'lgbm_model' in locals() and 'X_train' in locals() and 'y_train' in locals():
    try:
        # Wrap the model in a PMMLPipeline
        pipeline = PMMLPipeline([("classifier", lgbm_model)])
        # Fitting through the pipeline lets it record the feature and target
        # names from X_train / y_train for the PMML document.
        pipeline.fit(X_train, y_train)
        pmml_filename = "next_purchase_lgbm.pmml"
        sklearn2pmml(pipeline, pmml_filename)
        print(f"Model exported to {pmml_filename}")

        byom_table = "byom_models"  # Table to store BYOM models
        model_id = "next_purchase_lgbm"
        try:
            save_byom(model_id=model_id, model_file=pmml_filename, table_name=byom_table)
        except Exception as save_error:
            # Re-running the notebook hits an already stored model id. Replace
            # the stored model in that case, so the scoring step never uses a
            # stale model; any other failure is passed on unchanged.
            # NOTE(review): TDML_2200 is assumed to be teradataml's
            # "model already exists" code - confirm against the installed version.
            error_text = str(save_error)
            already_stored = 'TDML_2200' in error_text or 'already exist' in error_text.lower()
            if not already_stored:
                raise
            delete_byom(model_id=model_id, table_name=byom_table)
            save_byom(model_id=model_id, model_file=pmml_filename, table_name=byom_table)
        print(f"Model {model_id} imported into Vantage in table {byom_table}")
    except Exception as e:
        print(f"Error during BYOM export/import: {e}")
else:
    print("Model not trained or missing data. Please run previous cells.")

# ***Score the model in-database***

In [None]:
# Score the model in-database using PMMLPredict
try:
    # Retrieve the model from Vantage
    model_tdf = retrieve_byom("next_purchase_lgbm", table_name="byom_models")

    # The PMML model expects the engineered feature columns it was trained
    # on. The raw npath_purchase_data table has none of them, so upload the
    # held-out feature rows, together with identifying columns, and score
    # that table instead.
    scoring_pdf = X_test.copy()
    scoring_context = model_data.loc[X_test.index, ['customer_id', 'current_product', 'next_product']]
    scoring_pdf.insert(0, 'row_id', range(len(scoring_pdf)))
    scoring_pdf['customer_id'] = scoring_context['customer_id']
    scoring_pdf['current_product'] = scoring_context['current_product']
    scoring_pdf['actual_next_product'] = scoring_context['next_product']

    scoring_table = 'next_purchase_scoring_input'
    copy_to_sql(scoring_pdf, table_name=scoring_table, if_exists='replace')
    test_data = DataFrame(scoring_table)

    # Score the model in-database
    scored_results = PMMLPredict(
        modeldata=model_tdf,
        newdata=test_data,
        accumulate=['row_id', 'customer_id', 'current_product', 'actual_next_product'],
        overwrite_cached_models='*'
    )
    print("In-database scoring completed. Sample results:")
    # The model was trained on label-encoded targets, so show how to map the
    # predicted class index back to a product.
    print("Class index -> product:", dict(enumerate(le_target.classes_)))
    display(scored_results.result.head())
except Exception as e:
    print(f"Error during in-database scoring: {e}")