In [143]:
%pip install -U sagemaker
%pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [225]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor

region = boto3.session.Session().region_name

role = get_execution_role()


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [226]:
import pandas as pd
data = "s3://finkers-bucket/alldata/cleaned_startups.csv".format(region)
df = pd.read_csv(data)

list(df)

['Unnamed: 0',
 'id',
 'created_at',
 'funding_rounds',
 'funding_total_usd',
 'number_of_members',
 'number_of_founders',
 'pre_money_valuation',
 'post_money_valuation',
 'mean_funding',
 'max_funding',
 'crowdfunding',
 'post-ipo',
 'private-equity',
 'venture',
 'a',
 'angel',
 'b',
 'c',
 'convertible',
 'crowd',
 'crowd_equity',
 'd',
 'debt_round',
 'e',
 'f',
 'g',
 'grant',
 'partial',
 'post_ipo_debt',
 'post_ipo_equity',
 'private_equity',
 'secondary_market',
 'seed',
 'unattributed',
 'number of invested VCs',
 'total investment from VCs']

In [227]:
# Data cleaning

# Check that there is no missing data
CATEGORICAL = list(filter(lambda x: df[x].dtype=='object',df.columns))
def show_stats(columns):
    stat = {}
    for col in columns:
        stat[col] =  [df[col].nunique(), df[col].isna().mean()*100, df[col].dtype]
    return pd.DataFrame.from_dict(stat, orient='index', columns=['Unique',  'Missed values in %', 'DType'])
show_stats(CATEGORICAL)

Unnamed: 0,Unique,Missed values in %,DType
id,26709,0.0,object
created_at,26580,0.0,object


In [228]:
import numpy as np

valid = df.loc[df['post_money_valuation'] > 0]
valid = valid.drop("pre_money_valuation", axis=1).drop("id",axis=1)
valid["created_at"] = pd.to_datetime(valid["created_at"])
valid['year'] = pd.DatetimeIndex(valid['created_at']).year
valid['month'] = pd.DatetimeIndex(valid['created_at']).month
valid['day'] = pd.DatetimeIndex(valid['created_at']).day
valid = valid.drop("created_at",axis=1)


# Remove data that have pre but not post valuation (consider removing)
remove = df.loc[(df['post_money_valuation'] == 0) & (df['pre_money_valuation'] > 0)]

valid.dtypes

Unnamed: 0                     int64
funding_rounds                 int64
funding_total_usd            float64
number_of_members            float64
number_of_founders           float64
post_money_valuation         float64
mean_funding                 float64
max_funding                  float64
crowdfunding                 float64
post-ipo                     float64
private-equity               float64
venture                      float64
a                            float64
angel                        float64
b                            float64
c                            float64
convertible                  float64
crowd                        float64
crowd_equity                 float64
d                            float64
debt_round                   float64
e                            float64
f                            float64
g                            float64
grant                        float64
partial                      float64
post_ipo_debt                float64
p

In [230]:
# Allocate training and test sets, and save files to bucket
import sagemaker,boto3, os

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    valid.drop("post_money_valuation", axis=1),
    valid["post_money_valuation"],
    test_size = 0.2,
    random_state = 0)

#Log10 transform target col
y_train = np.log10(y_train)
y_test = np.log10(y_test)

final_train = X_train.copy()
final_train.insert(0,"post_money_valuation",y_train)

final_test = X_test.copy()
final_test.insert(0,"post_money_valuation",y_test)

final_train.to_csv("final_training.csv", index=False, header=False)
final_test.to_csv("final_testing.csv", index=False, header=False)

bucket = 'finkers-bucket'

boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join('training/final_training.csv')).upload_file('final_training.csv')

boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join('validation/final_testing.csv')).upload_file('final_testing.csv')

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [231]:
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"3",
        "min_child_weight":"6",
        "subsample":"0.7",
        "verbosity":"1",
        "objective":"reg:squarederror",
        "num_round":"1000",
        "early_stopping_rounds": "5"}

output_path = 's3://finkers-bucket/output/'

xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.5-1", "latest")
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=role,
                                          instance_count=1, 
                                          instance_type='ml.m4.xlarge', 
                                          volume_size=1, # 1 GB 
                                          output_path=output_path)

# define the data type and paths to the training and validation datasets
content_type = "csv"
train_input = TrainingInput("s3://{}/{}/{}".format(bucket, 'training', "final_training.csv"), content_type=content_type)
validation_input = TrainingInput("s3://{}/{}/{}".format(bucket, 'validation', 'final_testing.csv'), content_type=content_type)

# execute the XGBoost training job
estimator.fit({'train': train_input, "validation": validation_input})

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-01-16-08-55-05-141


2023-01-16 08:55:05 Starting - Starting the training job...
2023-01-16 08:55:21 Starting - Preparing the instances for training......
2023-01-16 08:56:31 Downloading - Downloading input data...
2023-01-16 08:57:01 Training - Downloading the training image......
2023-01-16 08:57:37 Training - Training image download completed. Training in progress.[34m[2023-01-16 08:57:53.804 ip-10-0-169-108.ap-southeast-1.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-01-16:08:57:53:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-01-16:08:57:53:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2023-01-16:08:57:53:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-01-16:08:57:53:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2023-01-16:08:57:53:INFO] Determined delimiter of CSV input is ','[0m
[34m[2023-01-16:08:57:53:INFO] Dete

In [232]:
from sagemaker.serializers import CSVSerializer

xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge', serializer = CSVSerializer())


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-01-16-08-58-57-469
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-01-16-08-58-57-469
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-01-16-08-58-57-469


------!

In [233]:
predictions = xgb_predictor.predict(X_test.values).decode('utf-8')
y_preds = np.fromstring(predictions, sep='\n')
y_preds

array([6.33967447, 5.27834129, 5.18182802, 6.52271795, 5.01564455,
       5.1010828 , 5.23800802, 8.37554455, 5.70354843, 6.19517756,
       6.06058931, 7.43921471, 6.4201684 , 5.70805073, 6.66614866,
       7.8162818 , 5.37521219, 6.92956352, 7.28550482, 5.4831295 ,
       4.0953784 , 5.73403549, 5.85450459, 8.0687685 , 5.8023324 ,
       8.26784992, 6.015625  , 6.52586555, 9.34286213, 6.66070986,
       5.30417109, 7.28687286, 6.87459898, 5.99541044, 5.80041361,
       7.06702471, 6.39491415, 6.57245445, 6.37933683, 6.51789236,
       7.94397163, 6.39472961, 5.91348982, 6.61259937, 6.5731988 ,
       6.28219032, 6.4868145 , 5.14106464, 6.37257051, 6.05520964,
       6.91374683, 5.93503237, 5.78766584, 6.79505634, 5.90298653,
       5.34210873, 5.93279743, 3.59774017, 5.36846447, 6.668643  ,
       6.33507395, 3.87094259, 6.41080904, 5.73552847, 4.62413931,
       1.98669922, 5.10040283, 5.49983215, 3.29827929, 6.13534451,
       6.08099556, 6.35636377, 6.52167892, 6.79176712, 6.25344

In [234]:
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(y_test, y_preds, squared=False)
frac = rmse/y_test.mean()
frac

0.12963276462442194

In [239]:
rmse

0.81655675956416

In [None]:
# Utility Code

In [172]:
# Assert that training file is in the bucket

!aws s3 ls finkers-bucket/train --recursive

2023-01-16 07:43:51          0 training/
2023-01-16 07:44:45     137338 training/final_training.csv


In [None]:
# Run to delete endpoint only
sgemaker_boto3.delete_endpoint(EndpointName=xgb_predictor.endpoint)