In [1]:
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz

In [2]:
def extract_words(filename, N):
    """Load a snippet CSV and return its documents, top tokens and labels.

    The documents are fed to tf-idf; the top tokens are used to build a
    stratified vocabulary.

    Parameters
    ----------
    filename : str or path-like
        CSV file containing at least a ``snippet`` and a ``language`` column.
    N : int
        Number of most frequent tokens to return.

    Returns
    -------
    words : list of str
        The ``snippet`` column, one entry per row (missing values become "").
    top_words : list of str
        The ``N`` most common lowercased, whitespace-separated tokens across
        all snippets, most frequent first.
    label : list of str
        The ``language`` column, one entry per row (missing values become "").

    Raises
    ------
    KeyError
        If the file has no ``snippet`` or ``language`` column.
    """
    print(f"Processing {filename}...")
    # Read every column as text. With dtype inference a snippet column that
    # happens to look numeric comes back as ints/floats, which breaks both the
    # token counting below and any downstream text vectorizer.
    df = pd.read_csv(filename, dtype=str)
    # fill in missing values with an empty string
    df = df.fillna("")
    words = df["snippet"].tolist()
    label = df["language"].tolist()

    # Count tokens snippet by snippet rather than joining the whole corpus
    # into a single huge string first; the resulting counts (and the order in
    # which ties are reported) are the same.
    token_counts = Counter()
    for snippet in words:
        token_counts.update(snippet.lower().split())
    top_words = [word for word, _ in token_counts.most_common(N)]
    return words, top_words, label

# Per-language input files, kept for reference: swap this list in to collect
# the top-N tokens of each language separately (a vocabulary stratified by language).
#files = ["snippets-bash.csv", "snippets-c.csv", "snippets-cpp.csv", "snippets-csv.csv", 
#         "snippets-dotfile.csv", "snippets-go.csv", "snippets-html.csv", "snippets-java.csv", "snippets-javascript.csv",
#         "snippets-json.csv"]
files = ["snippets-all.csv"]

# Number of top tokens taken from each file.
# Loosely based on the number of reserved keywords per language; intended as an
# upper bound so the most common keywords and other common symbols in each
# language are captured.
# https://stackoverflow.com/questions/4980766/reserved-keywords-count-by-programming-language
N = 20

vocabulary = []
snippets = []
labels = []
for file in files:
    snippet_list, vocab, label = extract_words(file, N)
    vocabulary.extend(vocab)
    snippets.extend(snippet_list)
    labels.extend(label)

# De-duplicate tokens shared between files. Sorting (instead of relying on set
# iteration order, which varies between interpreter runs for strings) keeps the
# vocabulary order, and therefore any feature indices derived from it, stable.
vocabulary = sorted(set(vocabulary))

Processing snippets-all.csv...


In [3]:
# Quick sanity check: show the first five documents, then the first five vocabulary tokens.
for preview_source in (snippets, vocabulary):
    print(preview_source[:5])

['test/files/normalise.jpg.png\ntest/files/normalise-resized.jpg\npackage-lock.json\n/package.json\n*.mongodb\n', 'a computer network, with no transfer of a copy, is not conveying.\n\n  An interactive user interface displays "Appropriate Legal Notices"\nto the extent that it includes a convenient and prominently visible\nfeature that (1) displays an appropriate copyright notice, and (2)\n', 'the predecessor has it or can get it with reasonable efforts.\n\n  You may not impose any further restrictions on the exercise of the\nrights granted or affirmed under this License.  For example, you may\nnot impose a license fee, royalty, or other charge for exercise of\n', '  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY\nAPPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT\nHOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY\nOF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,\nTHE IMPLIED WARRANTIES OF MERCHANTABILI

In [4]:
# Alternative: restrict the features to the hand-built vocabulary.
#vectorizer = TfidfVectorizer(vocabulary=vocabulary)

# Keep at most 2000 unigram/bigram features. The token pattern matches runs of
# ASCII letters/underscores, a handful of single punctuation characters
# (: = { } < > , # [ ]) and the "//" comment marker.
vectorizer = TfidfVectorizer(
    max_features=2000,
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b[A-Za-z_]+\b|[:={}<>,#\[\]]|\/\/",
)
X = vectorizer.fit_transform(snippets)
# Lots of garbage in our features, e.g. '2', 'sherlock', '"",', 'carolina,south', '"仄仄平平仄，平平仄仄平"'
# Could be improved by creating smarter tokens, or engineering the vocabulary a bit more. 

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain strings so the full list is printed, not an abbreviated array repr.
print([str(name) for name in feature_names])



In [5]:
# Map each language name to an integer id. The names are sorted so the mapping
# is identical on every run; iterating a bare set of strings gives an order
# that changes between interpreter sessions, which would make saved targets
# impossible to decode consistently.
languages = sorted(set(labels))
encoding = {language: number for number, language in enumerate(languages)}
# One-hot encode the language (one column per integer id).
# A concrete list is passed rather than a one-shot iterator.
y = pd.get_dummies([encoding[lang] for lang in labels])
print(X)
print(y)

# Sanity check: the feature matrix and the targets should have the same number of rows.
print(X.shape[0])
print(len(y))

  (0, 1066)	0.47470589174780514
  (0, 1141)	0.279891465032747
  (0, 1338)	0.4686902798875368
  (0, 1374)	0.27899257312562403
  (0, 832)	0.5011637762686464
  (0, 1687)	0.384242786760636
  (1, 32)	0.18750004474974924
  (1, 1747)	0.17067121298392426
  (1, 1039)	0.19500518284010737
  (1, 379)	0.20524657702045762
  (1, 1286)	0.21333608196990453
  (1, 1257)	0.22379986709406727
  (1, 632)	0.17481493705391907
  (1, 818)	0.22505795355685101
  (1, 412)	0.25253409657512443
  (1, 1045)	0.14853050659823494
  (1, 1698)	0.29746433736206307
  (1, 1700)	0.09722356526056666
  (1, 1743)	0.10965996927271528
  (1, 1024)	0.18987849526072975
  (1, 1824)	0.18554714274279882
  (1, 409)	0.3066271885289884
  (1, 1252)	0.13824736166210957
  (1, 1035)	0.11924662887414295
  (1, 630)	0.18430431201418968
  :	:
  (4849997, 1999)	0.16576618189484785
  (4849997, 1968)	0.17260997387559635
  (4849997, 151)	0.0790050162584248
  (4849997, 1928)	0.08336638719277914
  (4849997, 20)	0.07020606939093695
  (4849998, 990)	0.40947

In [7]:
# Persist the dataset: the sparse feature matrix via scipy's save_npz,
# the one-hot targets as a pandas pickle.
save_npz(file="X", matrix=X)
y.to_pickle(path="y.pcl")

## Section to test results of feature engineering on accuracy for LR

In [6]:
# Count the rows that have at least one non-zero feature.
# (The recorded run printed 4727681, out of roughly 4.85M rows.)
rows, _ = X.nonzero()
# pd.unique works directly on the index array, avoiding a Python list/set
# round trip over every non-zero entry.
print(len(pd.unique(rows)))
# Remove empty rows from dataset
# y = y[X.getnnz(1)>0]
# X = X[X.getnnz(1)>0]

4727681


In [7]:
import pandas as pd
from scipy.sparse import load_npz, hstack
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [8]:
# Hold out 20% of the rows for testing. The seed is fixed so the split, and the
# accuracies reported from it, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# One-vs-rest logistic regression with an elastic-net penalty (equal L1/L2 mix).
# saga is a stochastic solver, so random_state is fixed to make the fit repeatable.
# NOTE(review): the recorded run logged "max_iter reached" for two of the
# per-class fits, i.e. they stopped at the default iteration cap; consider
# raising max_iter if the extra training time is acceptable.
lr = LogisticRegression(
    verbose=3,
    multi_class='ovr',
    penalty="elasticnet",
    solver="saga",
    l1_ratio=0.5,
    n_jobs=-1,
    random_state=42,
)
# lr.fit(X_train.toarray(), y_train.idxmax(axis=1))
# idxmax(axis=1) turns each one-hot row back into the label of its hot column.
lr.fit(X_train, y_train.idxmax(axis=1))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 18 epochs took 232 seconds
convergence after 18 epochs took 536 seconds
convergence after 19 epochs took 571 seconds
convergence after 49 epochs took 732 seconds
convergence after 20 epochs took 614 seconds
convergence after 19 epochs took 570 seconds
convergence after 20 epochs took 574 seconds
convergence after 24 epochs took 387 seconds
convergence after 19 epochs took 569 seconds
convergence after 18 epochs took 530 seconds
convergence after 17 epochs took 454 seconds
convergence after 19 epochs took 560 seconds
max_iter reached after 753 seconds




convergence after 18 epochs took 457 seconds
convergence after 18 epochs took 491 seconds
convergence after 21 epochs took 605 seconds
convergence after 19 epochs took 443 seconds
convergence after 16 epochs took 477 seconds
convergence after 19 epochs took 628 seconds
max_iter reached after 925 seconds




convergence after 21 epochs took 595 seconds


[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed: 51.5min finished


LogisticRegression(l1_ratio=0.5, multi_class='ovr', n_jobs=-1,
                   penalty='elasticnet', solver='saga', verbose=3)

In [11]:
# Accuracy history for the different feature sets tried:
#   - top 20 tokens per language            -> ~33%
#   - top 200 tokens overall                -> ~57%
#   - top 2000 tokens overall (with some better regex for code tokens)
#                                           -> 70% on the test! Likely an upper bound.
# Prints train accuracy first, then test accuracy.
for features, onehot_targets in ((X_train, y_train), (X_test, y_test)):
    true_classes = onehot_targets.idxmax(axis=1)
    predicted_classes = lr.predict(features)
    print(accuracy_score(true_classes, predicted_classes))

0.7019974226804123
0.7020711340206186
