In [1]:
import warnings
# Install a global filter that ignores every DeprecationWarning raised from
# here on, to keep cell output readable.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [38]:
import lightgbm as lgb
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels

import seaborn as sns

import itertools
from scipy import stats

In [3]:
import sys
# Put the parent directory on the module search path. Presumably that is
# where the `langclass` package imported in the next cell lives -- confirm
# against the repository layout. The path is relative, so it depends on the
# kernel's working directory.
sys.path.append('../')

In [4]:
from langclass.features.build_features import Vectorizer
from langclass.data.dataframes import Data

In [5]:
# Instantiate the project's Data helper; the following cells read its
# `train_df` and `test_df` attributes.
data = Data()

In [6]:
# Pull the training and test tables off the Data object.
train_df = data.train_df
test_df = data.test_df

In [8]:
# Model inputs: the 'code' column of each split. These are handed to
# Vectorizer.vectorize_df further down (presumably raw source text -- confirm).
X_train = train_df['code']
X_test = test_df['code']

In [9]:
# Classification targets: the 'language' column of each split.
y_train = train_df['language']
y_test = test_df['language']

In [28]:
# Sorted list of the distinct target values seen in the training split. It
# fixes the row/column order of the confusion matrix and of the labelled
# DataFrame built from it.
# NOTE(review): a label that occurs only in y_test would be missing from this
# list and therefore left out of the confusion matrix -- confirm both splits
# share the same label set.
labels = sorted(y_train.unique())

In [12]:
# Search grid for the experiment below: every tokenizer name is combined with
# every n-gram setting and passed to Vectorizer as `tokenizer` / `n_gram`.
tokenizer_ls = ['char', 'alphanum']
# Each entry is a 2-tuple given to Vectorizer's `n_gram` argument
# (presumably a (min_n, max_n) range, so unigrams and bigrams -- confirm).
ngrams_ls = [(1,1), (2,2)]

In [13]:
# Fit one default LGBMClassifier per (tokenizer, n-gram) pairing and collect
# each model's test-set predictions in `preds`, in itertools.product order
# (tokenizer varies slowest). Later cells index `preds` by that position.
preds = []
for tokenizer, n_gram in itertools.product(tokenizer_ls, ngrams_ls):
    vec = Vectorizer(tokenizer=tokenizer, n_gram=n_gram, n_features=2**12)
    # The same Vectorizer instance transforms the training split first and
    # the test split second.
    X_train_vec = vec.vectorize_df(X_train)
    X_test_vec = vec.vectorize_df(X_test)
    # fit() returns the fitted estimator, so construction and training chain.
    model = lgb.LGBMClassifier().fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    # Report test accuracy for this configuration.
    acc_score = accuracy_score(y_test, y_pred)
    print(f'{tokenizer}, {n_gram}, acc: {acc_score}')
    preds += [y_pred]

  if diff:


char, (1, 1), acc: 0.8752562225475842


  if diff:


char, (2, 2), acc: 0.9376281112737921


  if diff:


alphanum, (1, 1), acc: 0.9332357247437775
alphanum, (2, 2), acc: 0.9153733528550513


  if diff:


In [33]:
# Confusion matrix for preds[1], i.e. the second grid entry produced by
# itertools.product: tokenizer 'char' with n_gram (2, 2), which printed the
# highest accuracy in the run above. y_test is passed as the true labels, so
# rows are true classes and columns are predicted classes, both in `labels`
# order.
cm = confusion_matrix(y_test, preds[1], labels=labels)

In [34]:
# Show the raw counts (rows: true label, columns: predicted label).
cm

array([[246,   5,   0,   1,   0,   4,   1,   0,   1,   1,   4,   3],
       [  7, 192,   0,   1,   1,   5,   1,   0,   0,   0,   0,   1],
       [  2,   0, 255,   7,   0,   7,   2,   0,   1,   0,   3,   0],
       [  0,   0,   0, 313,   0,   3,   3,   3,   1,   1,   0,   0],
       [  8,   5,   1,   2, 248,   8,   2,   0,   0,   2,   1,   0],
       [  2,   1,   0,   0,   1, 281,   3,   0,   0,   2,   3,   0],
       [  3,   0,   0,   4,   0,   3, 204,   5,   0,   2,  13,   0],
       [  0,   0,   2,   0,   0,   2,   0, 307,   0,   2,   4,   0],
       [  1,   0,   0,   8,   0,   2,   1,   1, 200,   4,   0,   0],
       [  3,   0,   0,   3,   2,   4,   0,   0,   1, 296,   2,   0],
       [  1,   0,   0,   5,   0,   4,   3,   2,   0,   1, 423,   0],
       [  2,   0,   0,   4,   0,   3,   1,   5,   0,   0,   0, 237]])

In [55]:
# Scale every confusion-matrix row to unit Euclidean (L2) length. Note this is
# not the usual divide-by-row-sum normalisation, so the diagonal entries are
# not per-class recall. keepdims=True keeps the norms as a column so the
# division broadcasts row by row.
cm_norm = cm / np.linalg.norm(cm, axis=1, keepdims=True)

In [58]:
# Element-wise cube root of the row-normalised matrix (presumably to lift the
# small off-diagonal values so they stand out next to the near-1 diagonal --
# confirm intent).
cm_normcbrt = np.cbrt(cm_norm)

In [59]:
# Wrap the scaled matrix in a DataFrame whose index and columns both carry the
# label names, in the same `labels` order used to build the confusion matrix.
df = pd.DataFrame(cm_normcbrt, index=labels, columns=labels)

In [61]:
# Display the labelled, cube-root-scaled confusion matrix.
df

Unnamed: 0,C,C++,Go,Haskell,Java,JavaScript,Julia,Lua,OCaml,Perl,Python,Racket
C,0.999807,0.272853,0.0,0.159565,0.0,0.253294,0.159565,0.0,0.159565,0.159565,0.253294,0.230133
C++,0.331471,0.999648,0.0,0.173279,0.173279,0.296303,0.173279,0.0,0.0,0.0,0.0,0.173279
Go,0.198625,0.0,0.999703,0.301571,0.0,0.301571,0.198625,0.0,0.157649,0.0,0.227369,0.0
Haskell,0.0,0.0,0.0,0.999951,0.0,0.212408,0.212408,0.212408,0.147275,0.147275,0.0,0.0
Java,0.318188,0.272047,0.159094,0.200446,0.999548,0.318188,0.200446,0.0,0.0,0.200446,0.159094,0.0
JavaScript,0.192346,0.152665,0.0,0.0,0.152665,0.999941,0.220181,0.0,0.0,0.192346,0.220181,0.0
Julia,0.244772,0.0,0.0,0.269406,0.0,0.244772,0.999074,0.290209,0.0,0.213828,0.399057,0.0
Lua,0.0,0.0,0.186756,0.0,0.0,0.186756,0.0,0.99995,0.0,0.186756,0.235298,0.0
OCaml,0.170936,0.0,0.0,0.341871,0.0,0.215365,0.170936,0.170936,0.999638,0.271343,0.0,0.0
Perl,0.216392,0.0,0.0,0.216392,0.189036,0.23817,0.0,0.0,0.150038,0.999918,0.189036,0.0


In [68]:
# Agreement rate between two models rather than accuracy against ground truth:
# the fraction of test samples on which preds[1] ('char', (2, 2)) and preds[2]
# ('alphanum', (1, 1)) predict the same label.
accuracy_score(preds[1], preds[2])

0.9317715959004392