In [4]:
# Install necessary libraries
# !pip install boto3 sagemaker pandas scikit-learn

# Import necessary libraries
import pandas as pd
import boto3
import pickle
import os
import tarfile
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
import shutil

# --- Storage configuration ---------------------------------------------
# Local artifacts: the pickled estimator and the gzipped tarball built from it.
model_local_path = '../models/model.pkl'
tar_gz_local_path = '../models/model.tar.gz'

# Remote destination: replace the bucket name with your own S3 bucket.
s3_bucket = 'justin-inferences'
# Full S3 URI of the uploaded archive (bucket + fixed 'model/model.tar.gz' key).
model_artifact_path = 's3://{}/model/model.tar.gz'.format(s3_bucket)

# Wipe the "scikit_learn_data" directory in the user's home folder, if present,
# so the dataset load below starts from a clean cache.
home_dir = os.path.expanduser("~")
cache_dir = os.path.join(home_dir, "scikit_learn_data")
if os.path.exists(cache_dir):
    # Recursively delete the whole cache tree and report what was removed.
    shutil.rmtree(cache_dir)
    print(f"Cleared cache at {cache_dir}")

# Fetch the California housing dataset and wrap it in pandas containers:
# features become a DataFrame labelled with the dataset's feature names,
# the target becomes a Series.
california = fetch_california_housing()
X = pd.DataFrame(california.data, columns=california.feature_names)
y = pd.Series(california.target)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion only.
model = LinearRegression()
model.fit(X_train, y_train)

# Save the model to a file locally.
# The output directories are derived from the configured paths instead of a
# hard-coded '../models', so changing model_local_path or tar_gz_local_path
# cannot leave the script writing into a directory that was never created.
# `or '.'` covers a bare file name: its dirname is '', which os.makedirs rejects.
for artifact_path in (model_local_path, tar_gz_local_path):
    os.makedirs(os.path.dirname(artifact_path) or '.', exist_ok=True)

with open(model_local_path, 'wb') as f:
    pickle.dump(model, f)

# Create a gzip-compressed tar archive containing the pickled model.
# arcname stores only the file name, so the archive holds the pickle at its
# root rather than under the '../models/' relative path.
with tarfile.open(tar_gz_local_path, "w:gz") as tar:
    tar.add(model_local_path, arcname=os.path.basename(model_local_path))

# Push the archive to S3 using a client bound to us-east-1.
s3 = boto3.client('s3', region_name='us-east-1')
s3.upload_file(
    Filename=tar_gz_local_path,
    Bucket=s3_bucket,
    # Must stay identical to the key embedded in model_artifact_path,
    # otherwise the message below reports the wrong location.
    Key='model/model.tar.gz',
)

# Report both the remote URI and the local archive path.
print(f"Model saved to {model_artifact_path} and locally at {tar_gz_local_path}")


Cleared cache at C:\Users\Admin\scikit_learn_data
Model saved to s3://justin-inferences/model/model.tar.gz and locally at ../models/model.tar.gz
