In [2]:
import seaborn as sn
import pandas as pd
import json, os
import numpy as np
import csv
import matplotlib.pyplot as plt
import random
from collections import OrderedDict
import time
import random

from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, \
roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC  
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Candidate seeds for repeated runs; this notebook uses only the first entry.
# NOTE(review): 123 and 456 each appear twice in this list - confirm whether
# ten distinct seeds were intended.
seeders = [
    123456, 789012, 345678, 901234, 567890,
    123, 456, 789, 123, 456,
]

seed = next(iter(seeders))

# Seed the stdlib generator and NumPy's global generator with the same value.
random.seed(seed)
np.random.seed(seed)

In [None]:
# Base directory: two levels above the current working directory.
root_path = os.path.join(os.pardir, os.pardir)

In [None]:
# Read the training split from <root_path>/data/train.csv.
train_csv_path = os.path.join(root_path, 'data', 'train.csv')
dataset = pd.read_csv(train_csv_path)

In [None]:
# Shuffle every row reproducibly (frac=1 keeps all rows), then renumber the
# index from zero so it no longer reflects the original file order.
data = dataset.sample(frac=1, random_state=seed)
data = data.reset_index(drop=True)
print(data.head())
print(len(data))

In [None]:
# Exclude every row whose project is "Chrome" and report how many rows remain.
not_chrome = data["project"].ne("Chrome")
data = data[not_chrome]
print(len(data))

In [None]:
# Keep only the two columns the rest of the notebook reads.
data = data.loc[:, ["processed_func", "target"]]
data.head()

In [None]:
# Discard rows whose processed_func value is missing.
data = data[data["processed_func"].notna()]

In [None]:
# Whitespace-delimited token count of each processed_func entry, and the
# largest such count across the frame.
word_counts = data["processed_func"].map(lambda func_text: len(func_text.split()))
max_length = word_counts.max()
print("Maximum number of words:", max_length)

In [None]:
# Frequency of each distinct target value.
vc = data["target"].value_counts()
print(vc)

# Count of target == 1 relative to target == 0, scaled by 100. This is a ratio
# between the two classes, not the share of class 1 in the whole dataset.
ratio_1_to_0 = vc[1] / vc[0]
print("Percentage: ", ratio_1_to_0 * 100, '%')

# Number of distinct classes; the sampling cell only acts when this is 2.
n_categories = vc.shape[0]
print(n_categories)

In [None]:
# Training frame under the column names used downstream:
# processed_func -> text, target -> label.
train_data = (
    data[['processed_func', 'target']]
    .rename(columns={'processed_func': 'text', 'target': 'label'})
)
train_data.head()

In [None]:
# Load the validation split and prepare it the same way as the training data.
val_data = pd.read_csv(os.path.join(root_path, 'data', 'val.csv'))

val_data = val_data[val_data["project"] != "Chrome"]

# Drop rows with no function text, mirroring the training pipeline: a NaN in the
# text column is not a valid document and would make the vectorizer's
# transform step fail.
val_data = val_data.dropna(subset=["processed_func"])

val_data = pd.DataFrame(({'text': val_data['processed_func'], 'label': val_data['target']}))
val_data.head()

In [None]:
# Load the test split and prepare it the same way as the training data.
test_data = pd.read_csv(os.path.join(root_path, 'data', 'test.csv'))

test_data = test_data[test_data["project"] != "Chrome"]

# Drop rows with no function text, mirroring the training pipeline: a NaN in the
# text column is not a valid document and would make the vectorizer's
# transform step fail.
test_data = test_data.dropna(subset=["processed_func"])

test_data = pd.DataFrame(({'text': test_data['processed_func'], 'label': test_data['target']}))

In [None]:
# Optional random under-sampling of the majority class (binary problems only).
# Set `sampling = True` to enable it; otherwise the training frame is used as is.
#
# The under-sampling is done with pandas alone: the notebook never imports
# RandomUnderSampler or shuffle, so relying on them would raise NameError as
# soon as sampling is switched on.
sampling = False
if n_categories == 2 and sampling:
    class_counts = train_data["label"].value_counts()
    print("Class distribution ", class_counts)

    majority_class = class_counts.idxmax()
    print("Majority class ", majority_class)

    minority_class = class_counts.idxmin()
    print("Minority class ", minority_class)

    # Aim for a 2:1 majority:minority ratio, but never request more majority
    # rows than exist (sampling without replacement would fail otherwise).
    target_count = int(min(2 * class_counts[minority_class], class_counts[majority_class]))
    print("Targeted number of majority class", target_count)

    # Keep a seeded random subset of the majority class and every other row.
    is_majority = train_data["label"] == majority_class
    majority_kept = train_data[is_majority].sample(n=target_count, random_state=seed)

    # Shuffle whole rows so each text stays paired with its label.
    resampled = (
        pd.concat([majority_kept, train_data[~is_majority]])
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )
    print("Class distribution after augmentation", resampled["label"].value_counts())

    X_train = resampled["text"]
    Y_train = resampled["label"]

else:
    X_train = train_data["text"]
    Y_train = train_data["label"]

In [3]:
# Raw text column of the validation and test frames.
X_val, X_test = val_data["text"], test_data["text"]

In [4]:
# Label vectors for the three splits; the training labels are whatever the
# sampling cell left in Y_train.
y_train, y_val, y_test = Y_train, val_data["label"], test_data["label"]

In [5]:
# TF-IDF feature extraction, fitted on the training text only: the vocabulary is
# capped at 1000 terms and each row is L2-normalized.
vectorizer = TfidfVectorizer(max_features=1000, norm='l2').fit(X_train)

In [6]:
# Turn each split into a dense ndarray of TF-IDF features using the fitted
# vocabulary. The X_* names are overwritten, so this cell expects them to still
# hold the raw text when it runs.
X_train = vectorizer.transform(X_train).toarray()
X_val = vectorizer.transform(X_val).toarray()
X_test = vectorizer.transform(X_test).toarray()

In [7]:
# define model
# 1000 trees, trained on all available cores, with progress logging.
# random_state pins the forest's bootstrap and feature sampling to the notebook
# seed; without it the result depends on the state of NumPy's global generator,
# and therefore on which cells were run (and how often) before this one.
rf = RandomForestClassifier(n_estimators=1000,
                            n_jobs=-1,
                            random_state=seed,
                            verbose=1)


In [8]:
# Fit the forest on the vectorized training features and their labels.
rf.fit(X=X_train, y=y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed: 24.0min
[Parallel(n_jobs=-1)]: Done 788 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 52.8min finished


In [9]:
# Predict labels for the validation split first, then the test split.
val_preds, preds = map(rf.predict, (X_val, X_test))

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    1.5s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    2.9s
[Parallel(n_jobs=6)]: Done 1000 out of 1000 | elapsed:    3.6s finished


In [None]:
# evaluate on validation data
from sklearn.metrics import fbeta_score

f1 = f1_score(y_true=y_val, y_pred=val_preds)
precision = precision_score(y_true=y_val, y_pred=val_preds)
recall = recall_score(y_true=y_val, y_pred=val_preds)
# F2 weights recall higher than precision: 5*P*R / (4*P + R). Using fbeta_score
# avoids the 0/0 division the hand-written formula hits when the model predicts
# no positive samples (precision and recall both 0).
f2 = fbeta_score(y_true=y_val, y_pred=val_preds, beta=2)
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F2 Score: {f2}")

In [10]:
# evaluate on test data
from sklearn.metrics import fbeta_score

f1 = f1_score(y_true=y_test, y_pred=preds)
precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
# F2 weights recall higher than precision: 5*P*R / (4*P + R). Using fbeta_score
# avoids the 0/0 division the hand-written formula hits when the model predicts
# no positive samples (precision and recall both 0).
f2 = fbeta_score(y_true=y_test, y_pred=preds, beta=2)
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F2 Score: {f2}")

# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, preds)
# fmt="d" prints the integer counts in full; the default annotation format
# abbreviates large counts into scientific notation.
ax = sn.heatmap(cm, annot=True, fmt="d")
ax.set(xlabel="Predicted label", ylabel="True label")
print(classification_report(y_test, preds))

F1 Score: 0.25
Precision: 0.4903047091412742
Recall: 0.16777251184834124
