In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras import Model, Sequential, layers, regularizers, optimizers
from colorama import Fore, Style
import pickle
from typing import Tuple
import os
import time
import glob

2022-11-29 15:07:41.020005: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [39]:
def tokenize(X:np.ndarray):
    """
    Accepts raw source_code as input and tokenizes using TF-IDF
    :return: returns the preprocessed X and the vocab_size
    """
    print(Fore.BLUE + "\nTokenizes source code..." + Style.RESET_ALL)
    # Initialize Tokenizer and fit to X
    tk = Tokenizer()
    tk.fit_on_texts(X)

    # Define vocab size
    vocab_size = len(tk.word_index)
    print(f'There are {vocab_size} different words in your corpus')

    # Transform to sequences
    X_token = tk.texts_to_sequences(X)

    # Pad inputs
    X_pad = pad_sequences(X_token, dtype='float32', padding='post', value=0)
    print("\n✅ Source code tokenized")
    return X_pad, vocab_size



def label_encode(y:np.ndarray)->np.ndarray:
    target_encoder = LabelEncoder().fit(y)
    target_encoded = target_encoder.transform(y)
    
    target_cat = to_categorical(target_encoded, num_classes = len(np.unique(target_encoded)))
    
    return target_cat



def initialize_model(X_pad: np.ndarray,
                     y = np.ndarray,
                     vocab_size = None) -> Model:
    """
    Initialize the CNN with random weights
    """

    print(Fore.BLUE + "\nInitialize model..." + Style.RESET_ALL)

    if vocab_size is not None:
        input_dim = vocab_size
    else:
        print(f"\n❌ vocab size needed to define input dimension. Please insert the vocab size returned by the tokenize function.")
        return None

    input_length = X_pad.shape[1]

    print(f'input_dim = {input_dim}')
    print(f'input_length = {input_length}')

    model = Sequential([
        layers.Embedding(input_dim=input_dim, input_length=input_length, output_dim=256, mask_zero=True),
        layers.Conv1D(128, kernel_size=3),
        layers.MaxPool1D(pool_size = (4)),
        layers.Conv1D(128, kernel_size=5),
        layers.MaxPool1D(pool_size = (4)),
        layers.Conv1D(128, kernel_size=7),
        layers.MaxPool1D(pool_size = (4)),
        layers.Conv1D(128, kernel_size=9),
        layers.MaxPool1D(pool_size = (4)),
        layers.Flatten(),
        layers.Dense(y.shape[1], activation="softmax"),  # check if we need to input the number of categories in softmax
        ])

    print("\n✅ Model initialized. Summary:")
    print(model.summary())

    return model



def compile_model(model: Model) -> Model:
    """
    Compile the CNN
    """
    print(Fore.BLUE + "\nCompile model..." + Style.RESET_ALL)
    es = EarlyStopping(patience=10, restore_best_weights=True)

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    print("\n✅ Model compiled")
    return model



def train_model(model: Model,
                X_pad: np.ndarray,
                y: np.ndarray,
                batch_size=64,
                epochs=200,
                patience=2,
                verbose=0,
                validation_split=0.2
                ) -> Tuple[Model, dict]:
    """
    Fit model and return a tuple (fitted_model, history)
    """

    print(Fore.BLUE + "\nTrain model..." + Style.RESET_ALL)
    breakpoint()
    # TODO Discuss: should we use monitor?
    es = EarlyStopping(patience=10,
                       restore_best_weights=True,
                       # monitor="val_loss"
                       )

    history = model.fit(X_pad,
                      y,
                      epochs=epochs,
                      batch_size=batch_size,
                      verbose=verbose,
                      validation_split=validation_split,
                      callbacks=[es])

    print(f"\n✅ Model trained ({len(X_pad)} rows)")

    return model, history


def predict(code:str)-> np.ndarray:
    """
    Accepts a piece of code as an input, to predict its author as a return.
    :param code: a given peace of code.
    :return: returns an array containing one or more predictions of authors for the given peaces of code
    """
    print(Fore.BLUE + "\nPredict author..." + Style.RESET_ALL)
    # Load model
    model = pickle.load(open("model.pkl","rb"))

    # predict with model
    prediction = model.predict_proba(code)

    # TODO inverse_transform the result
    print(f"\n✅ Prediction done!")
    return prediction
    # return prediction_inversed



def evaluate_model(model: Model,
                   X: np.ndarray,
                   y: np.ndarray,
                   batch_size=64) -> Tuple[Model, dict]:
    """
    Evaluate trained model performance on dataset
    # TODO are the metrics rigt? Which one should we used?
    """

    print(Fore.BLUE + f"\nEvaluate model on {len(X)} rows..." + Style.RESET_ALL)

    if model is None:
        print(f"\n❌ no model to evaluate")
        return None

    metrics = model.evaluate(
        x=X,
        y=y,
        batch_size=batch_size,
        verbose=1,
        # callbacks=None,
        return_dict=True)

    loss = metrics["loss"]
    mae = metrics["mae"]

    print(f"\n✅ Model evaluated: loss {round(loss, 2)} mae {round(mae, 2)}")

    return metrics


def save_model(model: Model = None,
               params: dict = None,
               metrics: dict = None) -> None:
    """
    persist trained model, params and metrics
    """

    timestamp = time.strftime("%Y%m%d-%H%M%S")

    print(Fore.BLUE + "\nSave model to local disk..." + Style.RESET_ALL)

    # save params
    if params is not None:
        params_path = os.path.join('params.pkl', "params", timestamp + ".pickle")
        print(f"- params path: {params_path}")
        with open(params_path, "wb") as file:
            pickle.dump(params, file)

    # save metrics
    if metrics is not None:
        metrics_path = os.path.join('metrics.pkl', "metrics", timestamp + ".pickle")
        print(f"- metrics path: {metrics_path}")
        with open(metrics_path, "wb") as file:
            pickle.dump(metrics, file)

    # save model
    if model is not None:
        model_path = os.path.join('model.pkl', "models", timestamp)
        print(f"- model path: {model_path}")
        model.save(model_path)

    print("\n✅ Data saved locally")

    return None


# TODO not working yet - check paths?
def load_model(save_copy_locally=False) -> Model:
    """
    load the latest saved model, return None if no model found
    """
    print(Fore.BLUE + "\nLoad model from local disk..." + Style.RESET_ALL)

    # get latest model version
    model_directory = os.path.join("models")

    results = glob.glob(f"{model_directory}/*")
    if not results:
        return None

    model_path = sorted(results)[-1]
    print(f"- path: {model_path}")

    model = models.load_model(model_path)
    print("\n✅ model loaded from disk")

    return model

In [51]:
# read data
data= pd.read_csv('../raw_data/preprocessed_dataset.csv')[:1000]


In [52]:
X = data["code_source"]
y = label_encode(y = data["username"])

In [53]:
y.shape

(1000, 24)

In [54]:
X.shape

(1000,)

In [55]:
data_tokenized, vocab_size = tokenize(X = X)
print(data_tokenized)
print(vocab_size)



[34m
Tokenizes source code...[0m
There are 4365 different words in your corpus

✅ Source code tokenized
[[ 11. 135.  11. ...   0.   0.   0.]
 [ 11. 119.  11. ...   0.   0.   0.]
 [ 11. 119.  11. ...   0.   0.   0.]
 ...
 [ 11. 119.  11. ...   0.   0.   0.]
 [ 11. 119.  11. ...   0.   0.   0.]
 [ 11. 119.  11. ...   0.   0.   0.]]
4365


In [56]:
y.shape

(1000, 24)

In [57]:
model = initialize_model(X_pad=data_tokenized,
                         y = y,
                         vocab_size = vocab_size)




[34m
Initialize model...[0m
input_dim = 4365
input_length = 2652

✅ Model initialized. Summary:
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 2652, 256)         1117440   
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 2650, 128)         98432     
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 662, 128)          0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 658, 128)          82048     
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 164, 128)          0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 158, 128)          114816    
______________________

In [58]:
compile_model(model = model)




[34m
Compile model...[0m

✅ Model compiled


<tensorflow.python.keras.engine.sequential.Sequential at 0x139b151f0>

In [59]:
model, history = train_model(model = model,
                X_pad = data_tokenized,
                y = y)

[34m
Train model...[0m


InvalidArgumentError: Graph execution error:

Detected at node 'sequential_3/embedding_3/embedding_lookup' defined at (most recent call last):
    File "/Users/timcerta/.pyenv/versions/3.8.12/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Users/timcerta/.pyenv/versions/3.8.12/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/traitlets/config/application.py", line 982, in launch_instance
      app.start()
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/Users/timcerta/.pyenv/versions/3.8.12/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/Users/timcerta/.pyenv/versions/3.8.12/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/Users/timcerta/.pyenv/versions/3.8.12/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2885, in run_cell
      result = self._run_cell(
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3139, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3318, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/xx/tt65cl5915g65mpfmwy5nk8h0000gn/T/ipykernel_10534/3232910807.py", line 1, in <module>
      model, history = train_model(model = model,
    File "/var/folders/xx/tt65cl5915g65mpfmwy5nk8h0000gn/T/ipykernel_10534/4016102854.py", line 113, in train_model
      history = model.fit(X_pad,
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1218, in fit
      val_logs = self.evaluate(
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1497, in evaluate
      tmp_logs = self.test_function(iterator)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1327, in test_function
      return step_function(self, iterator)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1318, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1311, in run_step
      outputs = model.test_step(data)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1270, in test_step
      y_pred = self(x, training=False)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1044, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/sequential.py", line 379, in call
      return super(Sequential, self).call(inputs, training=training, mask=mask)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/functional.py", line 419, in call
      return self._run_internal_graph(
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/functional.py", line 555, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1044, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/tensorflow/python/keras/layers/embeddings.py", line 191, in call
      out = embedding_ops.embedding_lookup_v2(self.embeddings, inputs)
Node: 'sequential_3/embedding_3/embedding_lookup'
indices[3,122] = 4365 is not in [0, 4365)
	 [[{{node sequential_3/embedding_3/embedding_lookup}}]] [Op:__inference_test_function_3758]