In [33]:
import pandas as pd
import numpy as np
import sklearn.metrics
from nltk.tokenize import word_tokenize
from sklearn.neural_network import MLPClassifier
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.callbacks import CallbackAny2Vec

from tqdm import tqdm

### Load dataframe and separate it into train and test

In [7]:
all_df = pd.read_csv("participants_dataset.csv", index_col=0)
all_df.head()

Unnamed: 0_level_0,function_source,CWE-119,CWE-120,CWE-476,CWE-468,CWE-other
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"value_new(const struct berval *bval,CSNType t,...",0.0,0.0,0.0,0.0,0.0
1,gpk_application_remove_packages_cb (PkTask *ta...,0.0,0.0,0.0,0.0,0.0
2,"gen_solo_work(struct pool *pool, struct work *...",1.0,1.0,0.0,0.0,0.0
3,"main(int argc, char *argv[])\n{\n\tFILE * infi...",0.0,0.0,0.0,0.0,1.0
4,getMoveActions( const QModelIndexList &indices...,0.0,0.0,0.0,0.0,0.0


In [8]:
all_df.tail()

Unnamed: 0_level_0,function_source,CWE-119,CWE-120,CWE-476,CWE-468,CWE-other
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
105225,bcm_setup(struct hci_uart *hu)\n{\n\tstruct bc...,,,,,
105226,save_timeservers(char **servers)\n{\n\tGKeyFil...,,,,,
105227,"lang_optimistic_scan(input_ty *fp, string_list...",,,,,
105228,ParseFlags() const {\n int flags = Regexp::Cl...,,,,,
105229,ipmi_lan_check_session_id (fiid_obj_t obj_lan_...,,,,,


In [18]:
test_df = all_df[np.isnan(all_df["CWE-119"])]

In [21]:
train_df = all_df[np.isnan(all_df["CWE-119"])==False]

In [24]:
print(f"train df has shape {train_df.shape} and test df has shape {test_df.shape}")

train df has shape (85804, 6) and test df has shape (19426, 6)


### Define and train model on train_df

In [34]:
train_source_codes = []
for i in tqdm(range(0, train_df.shape[0])):
    train_source_codes.append(word_tokenize(train_df.iloc[i, 0].lower()))

100%|███████████████████████████████████| 85804/85804 [01:16<00:00, 1118.67it/s]


In [35]:
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

In [36]:
tagged_data = [TaggedDocument(words=train_source_codes[i], tags=[str(i)]) for i in range(0, len(train_source_codes))]

In [37]:
epoch_logger = EpochLogger()
embedding_model = Doc2Vec(vector_size=100, window=5, min_count=100, alpha=0.025, min_alpha=0.00025, dm =1, workers=4)
  
embedding_model.build_vocab(tagged_data)

embedding_model.train(tagged_data, total_examples=embedding_model.corpus_count, epochs=40, callbacks=[epoch_logger])

Epoch #0 end
Epoch #1 end
Epoch #2 end
Epoch #3 end
Epoch #4 end
Epoch #5 end
Epoch #6 end
Epoch #7 end
Epoch #8 end
Epoch #9 end
Epoch #10 end
Epoch #11 end
Epoch #12 end
Epoch #13 end
Epoch #14 end
Epoch #15 end
Epoch #16 end
Epoch #17 end
Epoch #18 end
Epoch #19 end
Epoch #20 end
Epoch #21 end
Epoch #22 end
Epoch #23 end
Epoch #24 end
Epoch #25 end
Epoch #26 end
Epoch #27 end
Epoch #28 end
Epoch #29 end
Epoch #30 end
Epoch #31 end
Epoch #32 end
Epoch #33 end
Epoch #34 end
Epoch #35 end
Epoch #36 end
Epoch #37 end
Epoch #38 end
Epoch #39 end


In [38]:
x_train = []
for i in tqdm(range(0, len(train_source_codes))):
    x_train.append(embedding_model.infer_vector(train_source_codes[i]))
y_train1 = np.asarray(train_df)[:,1].astype('float32')
y_train2 = np.asarray(train_df)[:,2].astype('float32')
y_train3 = np.asarray(train_df)[:,3].astype('float32')
y_train4 = np.asarray(train_df)[:,4].astype('float32')
y_train5 = np.asarray(train_df)[:,5].astype('float32')

100%|████████████████████████████████████| 85804/85804 [07:08<00:00, 200.09it/s]


In [39]:
predictor_model1 = MLPClassifier(activation="relu", solver="adam", alpha=1e-5, hidden_layer_sizes=(100, 100, 10), learning_rate="constant", learning_rate_init=0.001, max_iter=500)
predictor_model1.fit(x_train,y_train1)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(100, 100, 10), max_iter=500)

In [40]:
predictor_model2 = MLPClassifier(activation="relu", solver="adam", alpha=1e-5, hidden_layer_sizes=(100, 100, 10), learning_rate="constant", learning_rate_init=0.001, max_iter=500)
predictor_model2.fit(x_train,y_train2)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(100, 100, 10), max_iter=500)

In [41]:
predictor_model3 = MLPClassifier(activation="relu", solver="adam", alpha=1e-5, hidden_layer_sizes=(100, 100, 10), learning_rate="constant", learning_rate_init=0.001, max_iter=500)
predictor_model3.fit(x_train,y_train3)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(100, 100, 10), max_iter=500)

In [42]:
predictor_model4 = MLPClassifier(activation="relu", solver="adam", alpha=1e-5, hidden_layer_sizes=(100, 100, 10), learning_rate="constant", learning_rate_init=0.001, max_iter=500)
predictor_model4.fit(x_train,y_train4)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(100, 100, 10), max_iter=500)

In [43]:
predictor_model5 = MLPClassifier(activation="relu", solver="adam", alpha=1e-5, hidden_layer_sizes=(100, 100, 10), learning_rate="constant", learning_rate_init=0.001, max_iter=500)
predictor_model5.fit(x_train,y_train5)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(100, 100, 10), max_iter=500)

### Score model based on training data

In [44]:
y_pred1 = predictor_model1.predict(x_train)
y_pred2 = predictor_model2.predict(x_train)
y_pred3 = predictor_model3.predict(x_train)
y_pred4 = predictor_model4.predict(x_train)
y_pred5 = predictor_model5.predict(x_train)

In [45]:
f1_scores = [sklearn.metrics.f1_score(y_train1, y_pred1), 
             sklearn.metrics.f1_score(y_train2, y_pred2), 
             sklearn.metrics.f1_score(y_train3, y_pred3), 
             sklearn.metrics.f1_score(y_train4, y_pred4), 
             sklearn.metrics.f1_score(y_train5, y_pred5)]
print(f"Mean of f1 scores on training data is {np.mean(f1_scores)}")

Mean of f1 scores on training data is 0.9705801236713387


### Apply model on test data

In [46]:
test_source_codes = []
for i in tqdm(range(0, test_df.shape[0])):
    test_source_codes.append(word_tokenize(test_df.iloc[i, 0].lower()))

100%|███████████████████████████████████| 19426/19426 [00:16<00:00, 1172.28it/s]


In [47]:
x_test = []
for i in tqdm(range(0, len(test_source_codes))):
    x_test.append(embedding_model.infer_vector(test_source_codes[i]))

100%|████████████████████████████████████| 19426/19426 [01:32<00:00, 211.13it/s]


In [48]:
y_pred1 = predictor_model1.predict(x_test)
y_pred2 = predictor_model2.predict(x_test)
y_pred3 = predictor_model3.predict(x_test)
y_pred4 = predictor_model4.predict(x_test)
y_pred5 = predictor_model5.predict(x_test)

In [49]:
new_test_df = test_df.copy(deep=True)
new_test_df["CWE-119"] = y_pred1
new_test_df["CWE-120"] = y_pred2
new_test_df["CWE-476"] = y_pred3
new_test_df["CWE-468"] = y_pred4
new_test_df["CWE-other"] = y_pred5

In [55]:
new_test_df.tail()

Unnamed: 0_level_0,function_source,CWE-119,CWE-120,CWE-476,CWE-468,CWE-other
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
105225,bcm_setup(struct hci_uart *hu)\n{\n\tstruct bc...,0.0,0.0,0.0,0.0,0.0
105226,save_timeservers(char **servers)\n{\n\tGKeyFil...,0.0,0.0,0.0,0.0,0.0
105227,"lang_optimistic_scan(input_ty *fp, string_list...",1.0,1.0,0.0,0.0,1.0
105228,ParseFlags() const {\n int flags = Regexp::Cl...,0.0,0.0,0.0,0.0,0.0
105229,ipmi_lan_check_session_id (fiid_obj_t obj_lan_...,0.0,0.0,0.0,0.0,0.0


In [56]:
new_test_df.to_csv("SCVD_submission.csv")