In [1]:
import re
from typing import List, Tuple

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gensim
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

from src.dataset import read_lang_dataset, tokenize_dataset, get_vocab_mapping, gateways

# Apply the default seaborn theme and enlarge every figure in one call
# (sns.set is an alias of sns.set_theme, so a single call is enough).
sns.set_theme(rc={'figure.figsize': (12, 8)})

In [2]:
# Read the source-code samples; the displayed frame has one row per sample
# with `language` and `code` columns.
dataset_path = "data/dataset_github_codes.db"
dataset = read_lang_dataset(dataset_path)
dataset

Unnamed: 0,language,code
0,Python,"""""""The tests for the automation component.""""""\..."
1,Python,#!python2\n# -*- coding: utf-8 -*-\nimport os\...
2,Python,#!/usr/bin/python\n#\n# Copyright (c) 2017 Yuw...
3,Python,"""""""The tests for the Template automation.""""""\n..."
4,Python,"""""""The tests for numeric state automation.""""""\..."
...,...,...
9468,C#,using Microsoft.SharePoint.Client;\nusing Micr...
9469,C#,using System;\nusing System.Collections;\nusin...
9470,C#,using UnityEngine;\nusing System.Collections.G...
9471,C#,// Copyright (c) Microsoft Corporation. All ri...


In [3]:
# Number of samples per programming language.
dataset["language"].value_counts()

C++       2688
C#        2494
Go        2203
Python    2088
Name: language, dtype: int64

In [4]:
use_cache = True
tokenized_cache_path = "models/tokenized_sample.pkl"

# Tokenizing is slow, so reuse the pickled result when it is available.
# Only load a pickle that this notebook produced itself: unpickling untrusted
# files can execute arbitrary code.
if use_cache and os.path.exists(tokenized_cache_path):
    tokenized_sample = pd.read_pickle(tokenized_cache_path)
else:
    # NOTE(review): the recorded output of this cell shows a 2494-item batch
    # (the C# sample count) being processed and then a MaybeEncodingError,
    # although C# is listed in ignore_langs -- verify that tokenize_dataset
    # honours ignore_langs and that its worker errors are picklable.
    tokenized_sample = tokenize_dataset(dataset, ignore_langs=['C#'])
    # Make sure the cache directory exists on a fresh checkout before writing.
    os.makedirs(os.path.dirname(tokenized_cache_path), exist_ok=True)
    tokenized_sample.to_pickle(tokenized_cache_path)

tokenized_sample

100%|██████████| 2088/2088 [01:12<00:00, 28.67it/s]
 57%|█████▋    | 1411/2494 [00:15<00:11, 93.08it/s]


MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x7fc090124400>'. Reason: 'TypeError("cannot pickle '_thread.RLock' object")'

In [None]:
def create_lang_model(dataset, complete_model, language,
                      model_path="models/complete_model.gensim", epochs=10):
    """Train a Word2Vec model on the samples of a single language.

    The model is loaded from ``model_path`` so that every per-language model
    starts from the same saved vocabulary; its weights are re-initialised
    before training.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``language`` column and a ``code`` column; ``code`` is
        passed to ``Word2Vec.train`` as the corpus (presumably one token list
        per row -- confirm against ``tokenize_dataset``).
    complete_model : Word2Vec
        Not read by this function. It is kept in the signature for existing
        callers; the vocabulary comes from the file at ``model_path``, which
        must have been saved beforehand.
    language : str
        Value of ``dataset.language`` selecting the training rows.
    model_path : str, optional
        Saved model to load. Defaults to the path used elsewhere in this
        notebook.
    epochs : int, optional
        Number of training epochs. Defaults to 10.

    Returns
    -------
    Word2Vec
        The model trained on the selected language only.
    """
    model = Word2Vec.load(model_path)
    # Discard whatever weights were stored in the file; keep the vocabulary.
    model.init_weights()
    train_dataset = dataset.code[dataset.language == language]
    model.train(train_dataset, total_examples=len(train_dataset), epochs=epochs)
    return model

In [None]:
# Build one vocabulary over all tokenized samples and save it, so that each
# per-language model below is loaded with the identical vocabulary.
complete_model = Word2Vec(vector_size=100, window=10, min_count=10, workers=8)
complete_model.build_vocab(tokenized_sample.code)
complete_model.save("models/complete_model.gensim")

# One (language, trained model) pair per language present in the sample.
models = [
    (lang, create_lang_model(tokenized_sample, complete_model, lang))
    for lang in tokenized_sample.language.unique()
]
models

# Visualize

In [None]:
from gensim.models import TranslationMatrix


# Placeholder tokens (presumably emitted by the tokenizer in place of concrete
# literals and identifiers -- confirm against src.dataset).
inserted_tokens = ["INT_LITERAL", "FLOAT_LITERAL", "STRING_LITERAL", "VARIABLE"]

# Arithmetic operators.
math_ops = ["+", "-", "*", "/", "%"]

# Control-flow keywords listed in every per-language list below.
common_keywords = ["if", "else", "for", "while"]

# Python keywords (each listed once).
python_keywords = [
    "def", "class", "return", "if", "else", "for", "while", "in", "import",
    "from", "as", "with", "try", "except", "finally", "raise", "assert",
    "yield", "lambda", "pass", "break", "continue", "del", "global",
    "nonlocal", "and", "or", "not", "is", "True", "False", "None", "async",
    "await",
]

# C++ keywords (a selection; fundamental type names other than bool are omitted).
cpp_keywords = [
    "class", "bool", "catch", "try", "break", "continue", "delete", "do",
    "else", "enum", "explicit", "export", "extern", "false", "for", "friend",
    "goto", "if", "inline", "mutable", "namespace", "new", "operator",
    "private", "protected", "public", "register", "return", "sizeof",
    "static", "struct", "switch", "template", "this", "throw", "true",
    "typedef", "typeid", "typename", "union", "using", "virtual", "volatile",
    "while",
]

# C# reserved keywords. This used to be a verbatim copy of the C++ list and
# therefore contained words such as "delete", "friend" and "template" that are
# not C# keywords.
csharp_keywords = [
    "abstract", "as", "base", "bool", "break", "byte", "case", "catch",
    "char", "checked", "class", "const", "continue", "decimal", "default",
    "delegate", "do", "double", "else", "enum", "event", "explicit", "extern",
    "false", "finally", "fixed", "float", "for", "foreach", "goto", "if",
    "implicit", "in", "int", "interface", "internal", "is", "lock", "long",
    "namespace", "new", "null", "object", "operator", "out", "override",
    "params", "private", "protected", "public", "readonly", "ref", "return",
    "sbyte", "sealed", "short", "sizeof", "stackalloc", "static", "string",
    "struct", "switch", "this", "throw", "true", "try", "typeof", "uint",
    "ulong", "unchecked", "unsafe", "ushort", "using", "virtual", "void",
    "volatile", "while",
]

# Go keywords plus the predeclared identifiers nil/true/false.
go_keywords = [
    "break", "func", "default", "type", "defer", "go", "struct", "map",
    "chan", "else", "goto", "package", "range", "const", "fallthrough", "for",
    "import", "interface", "return", "select", "case", "continue", "if",
    "switch", "var", "nil", "true", "false",
]

In [None]:
# TODO: independently trained models live in different vector spaces, so they
#  have to be aligned before plotting. Currently each model is mapped onto the
#  first one with a least-squares linear map; removing the mean vector first
#  (or a similar normalisation) might improve the alignment.
def word_cloud(models: List[Tuple[str, Word2Vec]], words, translation_fix_words):
    """Scatter-plot selected word vectors of several models in a shared 2-D space.

    Every model is aligned to the first model in ``models`` by solving
    ``source @ M ~= target`` in the least-squares sense over the vectors of
    ``translation_fix_words``. The aligned vectors of ``words`` from all models
    are then stacked and reduced to two dimensions with a single whitened PCA,
    so the plotted coordinates are comparable across models.

    Parameters
    ----------
    models : list of (name, Word2Vec)
        Models to plot; the first entry defines the target space.
    words : iterable of str
        Words to plot. Duplicates are dropped; a word missing from a model's
        vocabulary is skipped for that model.
    translation_fix_words : list of str
        Anchor words used to fit the alignment. They are looked up in every
        model, so they must be present in all vocabularies.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the scatter plot and the word labels.

    Raises
    ------
    ValueError
        If none of ``words`` is present in any model.
    """
    # Deduplicate while keeping the caller's order so the plot is reproducible.
    unique_words = list(dict.fromkeys(words))
    translation_target = models[0][1].wv[translation_fix_words]

    model_names, labels, aligned_blocks = [], [], []
    for name, model in models:
        translation_source = model.wv[translation_fix_words]
        translation_matrix = np.linalg.lstsq(translation_source, translation_target, rcond=None)[0]
        existing_words = [word for word in unique_words if word in model.wv]
        if not existing_words:
            # Nothing to plot for this model; skip it rather than crash later.
            continue
        aligned_blocks.append(np.asarray(model.wv[existing_words]) @ translation_matrix)
        model_names.extend([name] * len(existing_words))
        labels.extend(existing_words)

    if not aligned_blocks:
        raise ValueError("none of the requested words is present in any model")

    # Visualize in 2D with dimensionality reduction: fit PCA on the full aligned
    # vectors (not just their first two coordinates) of all models together.
    pca = PCA(2, whiten=True)
    coords = pca.fit_transform(np.vstack(aligned_blocks))

    # Build the frame once instead of appending per model.
    df = pd.DataFrame({"model": model_names, "word": labels, "x": coords[:, 0], "y": coords[:, 1]})

    ax = sns.scatterplot(data=df, x="x", y="y", hue="model")
    # Offset each label slightly to the right of its marker.
    range_x = np.ptp(df["x"].to_numpy())
    for row in df.itertuples(index=False):
        ax.text(row.x + 0.005 * range_x, row.y, row.word,
                verticalalignment='center', horizontalalignment='left', fontsize=8)
    return ax

In [None]:
# Plot the Python and common keywords, using the whole shared vocabulary as
# anchor words for the alignment.
word_cloud(models, python_keywords + common_keywords, list(complete_model.wv.key_to_index))