In [2]:
from transformers import pipeline
import os
import numpy as np
from dask.distributed import Client
from pathlib import Path

# Build a feature-extraction pipeline backed by the microsoft/codebert-base
# model. `pipe` is called by later cells to turn text into embedding vectors.
pipe = pipeline("feature-extraction", model="microsoft/codebert-base")

### Why CodeBERT?

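CodeBERT is a model trained on both source code and the natural-language comments that accompany it, which makes it a reasonable choice for embedding code files. See the [Model](https://huggingface.co/microsoft/codebert-base) card and the training [Data](https://github.com/github/CodeSearchNet#data) (CodeSearchNet).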


In [3]:
# Smoke-test the pipeline on two tiny snippets and inspect how the returned
# lists are nested: one entry per input, then a singleton level, then what is
# presumably one entry per token of the first input — TODO confirm.
vector = pipe(["import os", "a"])
len(vector), len(vector[0]), len(vector[0][0])

(2, 1, 4)

# Job


In [4]:
# Directory (relative to the current working directory) that holds the source
# files to embed. Keep the trailing slash so file names can be appended to it.
path_dir = "./input_data/"

In [5]:
# Collect the regular files located directly inside `path_dir`.
# Sub-directories (for example a `.ipynb_checkpoints` folder) are skipped
# because they cannot be opened as text by the reading step, and the result is
# sorted so that every run processes the files in the same order.
all_files = sorted(p for p in Path(path_dir).iterdir() if p.is_file())
print(all_files[:3])

[WindowsPath('input_data/ClimbingStairs.py'), WindowsPath('input_data/Containers.py'), WindowsPath('input_data/HouseRobberDynProg.py')]


In [6]:
def get_texts(filepath, encoding="utf-8"):
    """Return the full text content of a file.

    Parameters
    ----------
    filepath : str or os.PathLike
        Location of the file to read.
    encoding : str, default "utf-8"
        Encoding used to decode the file. It is passed explicitly so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    str
        The entire content of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid in ``encoding``.
    """
    with open(filepath, encoding=encoding) as file:
        return file.read()

In [6]:
# Load every collected file into memory, one string per file.
corpus = list(map(get_texts, all_files))

# Embed the whole corpus with a single pipeline call and keep the result as a list.
embeddings = list(pipe(corpus, padding=True, truncation=True))

# Report the size of each nesting level, drilling into the first element each time.
nesting_sizes = (
    len(embeddings),
    len(embeddings[0]),
    len(embeddings[0][0]),
    len(embeddings[0][0][0]),
)
print(*nesting_sizes)

# Keep, for each document, only the vector stored at position 0 of its sequence
# (presumably the leading CLS-style token — TODO confirm).
cls_components = [doc_embedding[0][0] for doc_embedding in embeddings]
print(len(cls_components), len(cls_components[0]))

12 1 192 768
12 768


In [7]:
import pandas as pd

# Pools of synthetic metadata used to decorate each embedded file.
names = [
    "María Rodríguez López",
    "Alejandro Pérez García",
    "Ana Martínez Fernández",
    "Juan García Sánchez",
    "Carmen González Ruiz",
]
# Day-first timestamps, assigned to the rows by position.
dates = [
    "3-10-2023 09:15",
    "9-10-2023 14:30",
    "15-10-2023 18:45",
    "21-10-2023 11:20",
    "5-11-2023 20:00",
    "11-11-2023 16:10",
    "17-11-2023 08:55",
    "23-11-2023 22:25",
    "29-11-2023 13:40",
    "6-12-2023 07:30",
    "12-12-2023 19:55",
    "18-12-2023 10:50",
]

subjects = [
    "Programación Orientada a Objetos (POO)",
    "Estructuras de Datos y Algoritmos",
    "Desarrollo Web Avanzado",
    "Bases de Datos y Administración",
    "Programación en Sistemas Embebidos",
]

# A seeded generator makes the random author/subject assignment reproducible,
# so re-running the notebook writes the same table.
SEED = 42
rng = np.random.default_rng(SEED)

# One row per file: raw text plus its embedding vector.
df = pd.DataFrame({"text": corpus, "vector": cls_components})
df["author"] = rng.choice(names, size=len(df))
df["subject"] = rng.choice(subjects, size=len(df))
df["code_language"] = "Python"
# Number of "\n"-separated pieces in each text (a trailing newline adds one).
df["lines_of_code"] = df["text"].str.split("\n").apply(len)
# Fit the date list to the number of rows: it is truncated when there are
# fewer rows and recycled from the start when there are more, so no row is
# left without a date.
df["date"] = pd.to_datetime(
    np.resize(dates, len(df)),
    format="%d-%m-%Y %H:%M",
)

# Create the output directory if needed; to_parquet does not create it.
output_path = Path("output_data/dataframe_text_embeddings.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)

In [27]:
import pandas as pd
import numpy as np

# Reload the saved table from disk and preview its first rows.
df = pd.read_parquet("output_data/dataframe_text_embeddings.parquet")
df.head()

Unnamed: 0,text,vector,author,subject,code_language,lines_of_code
0,class Solution(object):\n def climbStairs(s...,"[-0.21761193871498108, -0.006907954812049866, ...",Ana Martínez Fernández,Desarrollo Web Avanzado,Python,15
1,"def calculate_area(height, l, r):\n return ...","[-0.17671486735343933, -0.11769969761371613, -...",Ana Martínez Fernández,Estructuras de Datos y Algoritmos,Python,28
2,"class Solution:\n def rob(self, nums) -> in...","[-0.11747059971094131, -0.04676730930805206, -...",Ana Martínez Fernández,Estructuras de Datos y Algoritmos,Python,15
3,# Definition for singly-linked list.\n# class ...,"[-0.21408772468566895, -0.02248210459947586, -...",Alejandro Pérez García,Bases de Datos y Administración,Python,24
4,class Solution:\n def longestPalindrome(sel...,"[-0.35301244258880615, -0.0692468136548996, -0...",María Rodríguez López,Programación Orientada a Objetos (POO),Python,28


In [28]:
# Parse the day-first timestamp strings and attach them as the "date" column.
# The assignment aligns the parsed Series with the table on the row index.
# NOTE(review): `dates` comes from an earlier cell, which must already have
# run in this kernel.
date_strings = pd.Series(dates)
df["date"] = pd.to_datetime(date_strings, format="%d-%m-%Y %H:%M")

In [30]:
# Write the table back to the parquet file it was loaded from, replacing the
# previously saved version.
df.to_parquet("output_data/dataframe_text_embeddings.parquet")