In [1]:
!pip install sagemaker
!pip install scikit-learn
!pip install tree-sitter tree-sitter-python tree-sitter-typescript tree-sitter-javascript tree-sitter-go
!pip install pandas matplotlib ipywidgets ipympl

Collecting tree-sitter
  Downloading tree_sitter-0.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting tree-sitter-python
  Downloading tree_sitter_python-0.21.0-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting tree-sitter-typescript
  Downloading tree_sitter_typescript-0.21.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting tree-sitter-javascript
  Downloading tree_sitter_javascript-0.21.4-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting tree-sitter-go
  Downloading tree_sitter_go-0.21.0-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading tree_sitter-0.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (542 kB)
[2K   [90m━━━━━━━━━━━━━

In [2]:
import tree_sitter_python
import tree_sitter_go
import tree_sitter_javascript
from tree_sitter import Language, Parser

PY_LANG = Language(tree_sitter_python.language())
TS_LANG = Language(tree_sitter_javascript.language())
GO_LANG = Language(tree_sitter_go.language())

In [4]:
import os

bucket_name = "github-repos-forked-jk4784nr"
data_path = "./data"
valid_extensions = [".go", ".py", ".ts"]
extension_map_tokenizer = {"go": GO_LANG, "py": PY_LANG, "ts": TS_LANG}
nodes_to_ignore = []

def read_file(file_path):
    try:
        with open(file_path, 'r') as file:
            content = file.read()
        return file_path, content
    except Exception as e:
        return file_path, f"Error reading file: {str(e)}"

def collect_tokens(node, source_code, tokens, depth=0):
    if node.child_count == 0:
        if node.type not in nodes_to_ignore:
            start_byte = node.start_byte
            end_byte = node.end_byte
            token = source_code[start_byte:end_byte].decode("utf-8")
            tokens.append((node.type, token, depth))
    else:
        for child in node.children:
            collect_tokens(child, source_code, tokens, depth+1)

def load_source_code_files(directory):
    file_paths = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if any(file.endswith(ext) for ext in valid_extensions):
                file_path = os.path.join(root, file)
                file_paths.append(file_path)
    return file_paths

def count_file_types_class(files):
    file_types = {}
    for f in files:
        extension = f.split(".")[-1]
        if extension not in file_types:
            file_types[extension] = 0
        file_types[extension] += 1
    return file_types

### Copy data

In [7]:
import sagemaker
from sagemaker import Session

session = Session()
role = sagemaker.get_execution_role()

In [8]:
role

'arn:aws:iam::941652505371:role/service-role/AmazonSageMaker-ExecutionRole-20240711T221808'

In [9]:
session.download_data(path=data_path, bucket=bucket_name)

['./data/cpython/Android/android.py',
 './data/cpython/Android/testbed/app/src/main/python/main.py',
 './data/cpython/Doc/conf.py',
 './data/cpython/Doc/includes/dbpickle.py',
 './data/cpython/Doc/includes/diff.py',
 './data/cpython/Doc/includes/email-alternative.py',
 './data/cpython/Doc/includes/email-dir.py',
 './data/cpython/Doc/includes/email-headers.py',
 './data/cpython/Doc/includes/email-mime.py',
 './data/cpython/Doc/includes/email-read-alternative.py',
 './data/cpython/Doc/includes/email-simple.py',
 './data/cpython/Doc/includes/email-unpack.py',
 './data/cpython/Doc/includes/minidom-example.py',
 './data/cpython/Doc/includes/mp_newtype.py',
 './data/cpython/Doc/includes/mp_pool.py',
 './data/cpython/Doc/includes/mp_workers.py',
 './data/cpython/Doc/includes/ndiff.py',
 './data/cpython/Doc/includes/newtypes/setup.py',
 './data/cpython/Doc/includes/newtypes/test.py',
 './data/cpython/Doc/includes/tzinfo_examples.py',
 './data/cpython/Doc/tools/extensions/asdl_highlight.py',
 '

In [10]:
!ls ./data

cpython  go  strapi  tools


### Process data

In [11]:
data_files = load_source_code_files(data_path)
data_files[:10]

['./data/go/misc/go_android_exec/main.go',
 './data/go/misc/go_android_exec/exitcode_test.go',
 './data/go/misc/ios/detect.go',
 './data/go/misc/ios/go_ios_exec.go',
 './data/go/misc/cgo/gmp/gmp.go',
 './data/go/misc/cgo/gmp/fib.go',
 './data/go/misc/cgo/gmp/pi.go',
 './data/go/misc/linkcheck/linkcheck.go',
 './data/go/test/closure.go',
 './data/go/test/typeswitch.go']

In [12]:
len(data_files), count_file_types_class(data_files)

(2000, {'go': 1000, 'py': 500, 'ts': 500})

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from functools import lru_cache

@lru_cache(maxsize=None)
def tokenizer(file):
    file_extension = file.split(".")[-1]
    file_content = read_file(file)[1]

    lang = extension_map_tokenizer[file_extension]
    parser = Parser(lang)
    tree = parser.parse(bytes(file_content, "utf8"))
    
    tokens = []
    collect_tokens(tree.root_node, file_content.encode("utf8"), tokens)

    node_types = [x[0] for x in tokens]
    return node_types

vectorizer = TfidfVectorizer(tokenizer=tokenizer)

In [42]:
features = vectorizer.fit_transform(data_files)
print(f"n_samples: {features.shape[0]}, n_features: {features.shape[1]}")

n_samples: 2000, n_features: 147


#### Store vectorizer in S3

In [18]:
import boto3
import pickle

vectorizer_pickle = pickle.dumps(vectorizer)
s3_client = boto3.client("s3")
s3_client.put_object(
    Body=vectorizer_pickle,
    Bucket=bucket_name,
    Key="vectorizer_pickle.pkl"
)

response = s3_client.get_object(Bucket=bucket_name, Key="vectorizer_pickle.pkl")
body = response["Body"].read()
vectorize_pkl = pickle.loads(body)

In [22]:
vectorize_pkl.get_feature_names_out()

array(['\n', '!', '!=', '!==', '"', '${', '%', '%=', '&', '&&', '&=',
       '&^', '&^=', "'", '(', ')', '*', '**', '*=', '+', '++', '+=', ',',
       '-', '--', '-=', '.', '...', '/', '/=', '/>', ':', ':=', ';', '<',
       '<-', '<<', '<<=', '<=', '=', '==', '===', '=>', '>', '>=', '>>',
       '>>=', '?', '??', 'ERROR', '[', ']', '^', '^=', '`', 'as', 'async',
       'await', 'blank_identifier', 'break', 'case', 'catch', 'chan',
       'class', 'comment', 'const', 'continue', 'default', 'defer',
       'delete', 'else', 'escape_sequence', 'export', 'extends',
       'fallthrough', 'false', 'field_identifier', 'finally',
       'float_literal', 'for', 'from', 'func', 'function', 'get', 'go',
       'goto', 'identifier', 'if', 'imaginary_literal', 'import', 'in',
       'instanceof', 'int_literal', 'integer', 'interface', 'iota',
       'jsx_text', 'label_name', 'let', 'map', 'new', 'nil', 'null',
       'number', 'of', 'optional_chain', 'or', 'package',
       'package_identifier', '

In [24]:
vectorizer.get_feature_names_out()

array(['\n', '!', '!=', '!==', '"', '${', '%', '%=', '&', '&&', '&=',
       '&^', '&^=', "'", '(', ')', '*', '**', '*=', '+', '++', '+=', ',',
       '-', '--', '-=', '.', '...', '/', '/=', '/>', ':', ':=', ';', '<',
       '<-', '<<', '<<=', '<=', '=', '==', '===', '=>', '>', '>=', '>>',
       '>>=', '?', '??', 'ERROR', '[', ']', '^', '^=', '`', 'as', 'async',
       'await', 'blank_identifier', 'break', 'case', 'catch', 'chan',
       'class', 'comment', 'const', 'continue', 'default', 'defer',
       'delete', 'else', 'escape_sequence', 'export', 'extends',
       'fallthrough', 'false', 'field_identifier', 'finally',
       'float_literal', 'for', 'from', 'func', 'function', 'get', 'go',
       'goto', 'identifier', 'if', 'imaginary_literal', 'import', 'in',
       'instanceof', 'int_literal', 'integer', 'interface', 'iota',
       'jsx_text', 'label_name', 'let', 'map', 'new', 'nil', 'null',
       'number', 'of', 'optional_chain', 'or', 'package',
       'package_identifier', '

### Train K-Means model

In [27]:
features.toarray().dtype

dtype('float64')

In [29]:
import numpy as np

features.toarray().astype(np.float32).dtype

dtype('float32')

In [31]:
from sagemaker import KMeans

k = 3
kmeans = KMeans(
    role=role,
    k=k,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket_name}/sagemaker-test"
)
kmeans.fit(kmeans.record_set(features.toarray().astype(np.float32)))

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: kmeans-2024-07-13-18-11-26-131


2024-07-13 18:11:26 Starting - Starting the training job...
2024-07-13 18:11:51 Starting - Preparing the instances for training......
2024-07-13 18:12:35 Downloading - Downloading input data...
2024-07-13 18:13:01 Downloading - Downloading the training image......
2024-07-13 18:14:22 Training - Training image download completed. Training in progress....
2024-07-13 18:14:47 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[07/13/2024 18:14:38 INFO 139747023550272] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense

In [33]:
kmeans_deployed = kmeans.deploy(initial_instance_count=1,
                                instance_type="ml.m4.xlarge")

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: kmeans-2024-07-13-18-25-46-835
INFO:sagemaker:Creating endpoint-config with name kmeans-2024-07-13-18-25-46-835
INFO:sagemaker:Creating endpoint with name kmeans-2024-07-13-18-25-46-835


---------!

### Predict

In [48]:
test_features = vectorizer.transform(data_files[:1])
result = kmeans_deployed.predict(test_features.toarray()[:2].astype(np.float32))
clusters = [r.label["closest_cluster"].float32_tensor.values[0] for r in result]

clusters

[0.0]

### Example use model inside a Lambda

In [None]:
import json
import boto3

def lambda_handler(event, context):
    # Initialize the SageMaker runtime client
    client = boto3.client('sagemaker-runtime')
    
    # Extract input data from the event
    input_data = json.loads(event['body'])
    
    # Define the SageMaker endpoint name
    endpoint_name = 'your-sagemaker-endpoint'
    
    # Invoke the SageMaker endpoint
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=json.dumps(input_data)
    )
    
    # Parse the response
    result = json.loads(response['Body'].read().decode())
    
    # Return the result
    return {
        'statusCode': 200,
        'body': json.dumps(result)
    }

### Delete endpoint

In [49]:
kmeans_deployed.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: kmeans-2024-07-13-18-25-46-835
INFO:sagemaker:Deleting endpoint with name: kmeans-2024-07-13-18-25-46-835
