In [1]:
import pandas as pd
import numpy as np
import pickle
from transformers import AutoModel, AutoTokenizer
import torch
from tqdm.notebook import tqdm
from warnings import simplefilter

In [2]:
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None

# Load tokenizer & model

In [None]:
checkpoint = "Salesforce/codet5p-110m-embedding"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

# Util functions

In [4]:
def save_model(file_name, model):
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)


def embed_sequence(model, sequence):
    # attention_masks = np.ones(len(sequence), dtype=int)
    # out = model(sequence)
    # pooled_embedding = torch.mean(out, dim=0)
    return model(sequence)[0].cpu().detach().numpy()


def embed_line_by_line(df, model, df_path):
    df['embeded_sequence_sum'] = None
    df['embeded_sequence_avg'] = None
    for i, row in tqdm(df.iterrows()):
        lines = row['method'].split('\n')
        embeded = []
        for line in lines:
            try:
                if len(line) > 0:
                    tokens = tokenizer.encode(line, return_tensors="pt").to(device)
                    embeded.append(embed_sequence(model, tokens))
            except Exception as e:
                print('Exception')
                print(e)
                print(line)
        embeded = np.asarray(embeded)
        df['embeded_sequence_sum'][i] = np.sum(embeded, axis=0)
        df['embeded_sequence_avg'][i] = np.mean(embeded, axis=0)
        # print(f"{i}. {row['sample_id']}")

    pd.to_pickle(df, df_path)


def embed_class(df, model, df_path):
    df['embeded_sequence'] = None
    for i, row in tqdm(df.iterrows()):
        # print(f"{i}. {row['sample_id']}")
        try:
            df['embeded_sequence'][i] = embed_sequence(model,
               tokenizer.encode(row['method'], return_tensors="pt").to(device))
        except Exception as e:
            print("EXCEPTION " + str(i))
            print(e)

    pd.to_pickle(df, df_path)
    
def cols_with_null(df):
    sum_nan = df.isnull().sum().to_frame()
    sum_nan.columns=['#NaN']
    sum_nan_thresh = sum_nan[sum_nan['#NaN'] > 0]
    pd.set_option('display.max_rows', None)
    print(sum_nan_thresh)

# Generate embeddings

In [5]:
df = pd.read_csv('/kaggle/input/mlcq-code-smells/data_class.csv')
df['label'] = np.where(df.severity == 'none', 0, 1)

In [None]:
embedding_path = './T5_plus_df_fe_embeded_by_line.pkl'
embed_line_by_line(df, model, embedding_path)

In [None]:
embedding_path = './T5_plus_df_fe_embeded.pkl'
embed_class(df, model, embedding_path)

# Aggregate embeddings 

In [8]:
file_path = './T5_plus_df_fe_embeded_by_line.pkl'
with open(file_path, "rb") as fh:
    df = pickle.load(fh)
df.head()

Unnamed: 0,sample_id,severity,method,label,embeded_sequence_sum,embeded_sequence_avg
0,4432196,major,@JsonSerialize(include = JsonSerialize.Inclusi...,1,"[-2.940714, -0.4194301, -3.420236, 2.5291574, ...","[-0.049011897, -0.0069905017, -0.057003934, 0...."
1,7391055,major,@Value\r\npublic class Release {\r\n\r\n\tpriv...,1,"[-0.42214704, -0.21482264, -0.81495583, 0.4101...","[-0.046905227, -0.023869181, -0.090550646, 0.0..."
2,4687786,major,public class GroomServerStatus implements Writ...,1,"[-7.2888536, -1.2876477, -16.215038, 5.7705374...","[-0.0398298, -0.0070363265, -0.08860677, 0.031..."
3,3797964,major,public static class Value {\r\n int...,1,"[-0.24960531, 0.04043095, -0.06891978, -0.0137...","[-0.06240133, 0.010107737, -0.017229944, -0.00..."
4,5339993,major,public class InstantiatedVersionedLabel extend...,1,"[-0.43222657, 0.13846795, -1.5239453, 0.357454...","[-0.022748766, 0.007287787, -0.080207646, 0.01..."


In [9]:
embeded_sum = df.loc[:, ['sample_id', 'severity', 'label', 'embeded_sequence_sum']]
embeded_avg = df.loc[:, ['sample_id', 'severity', 'label', 'embeded_sequence_avg']]
embeded_sum.head()

Unnamed: 0,sample_id,severity,label,embeded_sequence_sum
0,4432196,major,1,"[-2.940714, -0.4194301, -3.420236, 2.5291574, ..."
1,7391055,major,1,"[-0.42214704, -0.21482264, -0.81495583, 0.4101..."
2,4687786,major,1,"[-7.2888536, -1.2876477, -16.215038, 5.7705374..."
3,3797964,major,1,"[-0.24960531, 0.04043095, -0.06891978, -0.0137..."
4,5339993,major,1,"[-0.43222657, 0.13846795, -1.5239453, 0.357454..."


In [10]:
n = len(embeded_avg.embeded_sequence_avg[0])
n

256

## Line by line (Sum)

In [11]:
embeded_sum["embeded_sequence_sum"]

0       [-2.940714, -0.4194301, -3.420236, 2.5291574, ...
1       [-0.42214704, -0.21482264, -0.81495583, 0.4101...
2       [-7.2888536, -1.2876477, -16.215038, 5.7705374...
3       [-0.24960531, 0.04043095, -0.06891978, -0.0137...
4       [-0.43222657, 0.13846795, -1.5239453, 0.357454...
                              ...                        
2154    [-13.812866, -1.404182, -14.243062, 9.7452, -1...
2155    [-3.9812224, -0.05903911, -6.9324274, 3.325756...
2156    [-8.600702, -1.2161518, -10.065453, 4.8647795,...
2157    [-18.468273, 0.64463806, -26.12898, 17.591528,...
2158    [-1.4878904, -0.016329564, -1.9423453, 0.78523...
Name: embeded_sequence_sum, Length: 2159, dtype: object

In [12]:
columns = [f'em_{i+1}' for i in range(n)]
data = pd.DataFrame(embeded_sum["embeded_sequence_sum"].to_list(), columns=columns)
data['label'] = embeded_sum['label']
data['sample_id'] = embeded_sum['sample_id']
data['severity'] = embeded_sum['severity']
data.head()

Unnamed: 0,em_1,em_2,em_3,em_4,em_5,em_6,em_7,em_8,em_9,em_10,...,em_250,em_251,em_252,em_253,em_254,em_255,em_256,label,sample_id,severity
0,-2.940714,-0.41943,-3.420236,2.529157,-3.268705,-0.794287,1.861847,0.922473,-2.550556,-0.680615,...,1.65718,-5.35397,-0.630297,1.391051,-3.09304,-2.006454,-0.199604,1,4432196,major
1,-0.422147,-0.214823,-0.814956,0.410188,-0.664965,-0.150506,0.223903,-0.196193,-0.34549,0.056269,...,0.04598,-0.324954,-0.212359,0.99414,-0.29715,-0.103053,0.412256,1,7391055,major
2,-7.288854,-1.287648,-16.215038,5.770537,-10.063114,-3.369243,0.058749,3.068726,-4.718543,1.702126,...,7.902037,-8.1736,-3.949423,5.200824,-10.987302,-2.030042,-2.962582,1,4687786,major
3,-0.249605,0.040431,-0.06892,-0.013734,-0.221833,-0.130678,-0.064837,0.062101,-0.018497,-0.06932,...,-0.049142,-0.2756,-0.069007,0.047209,-0.072848,0.01707,-0.182038,1,3797964,major
4,-0.432227,0.138468,-1.523945,0.357454,-1.045204,-0.148029,0.760149,-0.13861,-0.663277,-0.12591,...,0.405414,-0.664842,0.029378,0.03601,-0.663721,-0.871525,-0.221499,1,5339993,major


In [13]:
dataset_path = './T5_plus_line_sum.pkl'
pd.to_pickle(data, dataset_path)

## Line by line (Average)

In [14]:
columns = [f'em_{i+1}' for i in range(n)]
data = pd.DataFrame(embeded_avg["embeded_sequence_avg"].to_list(), columns=columns)
data['label'] = embeded_avg['label']
data['sample_id'] = embeded_avg['sample_id']
data['severity'] = embeded_avg['severity']
data.head()

Unnamed: 0,em_1,em_2,em_3,em_4,em_5,em_6,em_7,em_8,em_9,em_10,...,em_250,em_251,em_252,em_253,em_254,em_255,em_256,label,sample_id,severity
0,-0.049012,-0.006991,-0.057004,0.042153,-0.054478,-0.013238,0.031031,0.015375,-0.042509,-0.011344,...,0.02762,-0.089233,-0.010505,0.023184,-0.051551,-0.033441,-0.003327,1,4432196,major
1,-0.046905,-0.023869,-0.090551,0.045576,-0.073885,-0.016723,0.024878,-0.021799,-0.038388,0.006252,...,0.005109,-0.036106,-0.023595,0.11046,-0.033017,-0.01145,0.045806,1,7391055,major
2,-0.03983,-0.007036,-0.088607,0.031533,-0.05499,-0.018411,0.000321,0.016769,-0.025784,0.009301,...,0.043181,-0.044664,-0.021582,0.02842,-0.06004,-0.011093,-0.016189,1,4687786,major
3,-0.062401,0.010108,-0.01723,-0.003434,-0.055458,-0.032669,-0.016209,0.015525,-0.004624,-0.01733,...,-0.012286,-0.0689,-0.017252,0.011802,-0.018212,0.004267,-0.045509,1,3797964,major
4,-0.022749,0.007288,-0.080208,0.018813,-0.055011,-0.007791,0.040008,-0.007295,-0.034909,-0.006627,...,0.021338,-0.034992,0.001546,0.001895,-0.034933,-0.04587,-0.011658,1,5339993,major


In [15]:
dataset_path = './T5_plus_line_avg.pkl'
pd.to_pickle(data, dataset_path)

## Whole

In [16]:

file_path = './T5_plus_df_fe_embeded.pkl'
with open(file_path, "rb") as fh:
  df = pickle.load(fh)

print(len(df))
df.head()

2159


Unnamed: 0,sample_id,severity,method,label,embeded_sequence_sum,embeded_sequence_avg,embeded_sequence
0,4432196,major,@JsonSerialize(include = JsonSerialize.Inclusi...,1,"[-2.940714, -0.4194301, -3.420236, 2.5291574, ...","[-0.049011897, -0.0069905017, -0.057003934, 0....","[-0.03266676, -0.014134759, -0.026271144, 0.01..."
1,7391055,major,@Value\r\npublic class Release {\r\n\r\n\tpriv...,1,"[-0.42214704, -0.21482264, -0.81495583, 0.4101...","[-0.046905227, -0.023869181, -0.090550646, 0.0...","[-0.01641362, 0.004841463, -0.074231334, -0.01..."
2,4687786,major,public class GroomServerStatus implements Writ...,1,"[-7.2888536, -1.2876477, -16.215038, 5.7705374...","[-0.0398298, -0.0070363265, -0.08860677, 0.031...","[-0.01804091, -0.0048276465, -0.16424337, -0.0..."
3,3797964,major,public static class Value {\r\n int...,1,"[-0.24960531, 0.04043095, -0.06891978, -0.0137...","[-0.06240133, 0.010107737, -0.017229944, -0.00...","[-0.09794383, 0.048422616, -0.008628332, -0.02..."
4,5339993,major,public class InstantiatedVersionedLabel extend...,1,"[-0.43222657, 0.13846795, -1.5239453, 0.357454...","[-0.022748766, 0.007287787, -0.080207646, 0.01...","[-0.0065223454, 0.013562555, -0.120787665, -0...."


In [17]:
cols_with_null(df)

                  #NaN
embeded_sequence   129


In [18]:
df = df.dropna()
len(df)

2030

In [19]:
n = len(df.embeded_sequence[0])
columns = [f'em_{i+1}' for i in range(n)]
df[columns] = df["embeded_sequence"].to_list()

In [20]:
df.drop(columns=['method', 'embeded_sequence',
                 # 'embeded_sequence_sum', 'embeded_sequence_avg'
                ], inplace=True)
df.head()

Unnamed: 0,sample_id,severity,label,embeded_sequence_sum,embeded_sequence_avg,em_1,em_2,em_3,em_4,em_5,...,em_247,em_248,em_249,em_250,em_251,em_252,em_253,em_254,em_255,em_256
0,4432196,major,1,"[-2.940714, -0.4194301, -3.420236, 2.5291574, ...","[-0.049011897, -0.0069905017, -0.057003934, 0....",-0.032667,-0.014135,-0.026271,0.013122,0.006806,...,0.044912,0.096656,-0.113224,0.040938,-0.052985,-0.08003,0.055118,0.007036,0.150335,0.124481
1,7391055,major,1,"[-0.42214704, -0.21482264, -0.81495583, 0.4101...","[-0.046905227, -0.023869181, -0.090550646, 0.0...",-0.016414,0.004841,-0.074231,-0.010195,-0.077366,...,-0.006608,0.040132,-0.107381,-0.024466,0.114401,-0.005538,0.104168,0.064047,0.154163,0.184892
2,4687786,major,1,"[-7.2888536, -1.2876477, -16.215038, 5.7705374...","[-0.0398298, -0.0070363265, -0.08860677, 0.031...",-0.018041,-0.004828,-0.164243,-0.003471,-0.006314,...,0.053331,0.09211,-0.069606,0.068281,-0.036756,-0.063533,0.096139,-0.091888,0.074853,0.069856
3,3797964,major,1,"[-0.24960531, 0.04043095, -0.06891978, -0.0137...","[-0.06240133, 0.010107737, -0.017229944, -0.00...",-0.097944,0.048423,-0.008628,-0.022152,-0.070442,...,-0.042569,0.057705,-0.030025,-0.039155,-0.02214,-0.040839,0.010294,-0.000435,0.066752,0.040659
4,5339993,major,1,"[-0.43222657, 0.13846795, -1.5239453, 0.357454...","[-0.022748766, 0.007287787, -0.080207646, 0.01...",-0.006522,0.013563,-0.120788,-0.006446,-0.05139,...,0.013946,0.074363,-0.096485,0.024297,0.080034,-0.005665,0.041369,0.029679,0.003668,0.032187


In [21]:
dataset_path = './T5_plus_whole.pkl'
pd.to_pickle(df, dataset_path)