# Embeddings

In [10]:
import itertools
import sqlite3
from typing import List
import itertools
import pandas as pd
from typing import *

In [4]:
# Create your connection.
db = sqlite3.connect('../stackoverflow.db')

In [5]:
tag_df = pd.read_sql_query("SELECT * FROM Tag", db)
tag_df.set_index('TagId', inplace=True)
tag_df

Unnamed: 0_level_0,TagName,Count
TagId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,.net,323469
2,html,1146245
3,javascript,2426650
4,css,771867
5,php,1445747
...,...,...
158156,lost-update,1
158157,chai-subset,1
158159,at-spi,1
158160,oro,0


In [None]:
POST_LIMIT = 500000
post_df = pd.read_sql_query(f"SELECT * FROM Post LIMIT {POST_LIMIT}", db)
post_df.set_index('PostId', inplace=True)
post_df

In [None]:
BADGE_LIMIT = 500000
badge_df = pd.read_sql_query(f"SELECT * FROM badge LIMIT {BADGE_LIMIT}", db)
badge_df.set_index('BadgeId', inplace=True)
badge_df

In [None]:
user_LIMIT = 500000
user_df = pd.read_sql_query(f"SELECT * FROM user LIMIT {user_LIMIT}", db)
user_df.set_index('UserId', inplace=True)
user_df

In [None]:
comment_LIMIT = 500000
comment_df = pd.read_sql_query(f"SELECT * FROM comment LIMIT {comment_LIMIT}", db)
comment_df.set_index('CommentId', inplace=True)
comment_df

In [None]:
vote_LIMIT = 500000
vote_df = pd.read_sql_query(f"SELECT * FROM vote LIMIT {vote_LIMIT}", db)
vote_df.set_index('VoteId', inplace=True)
vote_df

So, we want to build an embedding for the body of posts.
The body of posts are stored as HTML, we need to split up the text and the code snippets so we can process them separately.

In [None]:
sample_post = post_df.head(1)
sample_post

Let's process the Body first

In [25]:
from bs4 import BeautifulSoup
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.vocab import GloVe

en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words

In [None]:
class PostEmbedding(nn.Module):

    def __init__(self):
        super().__init__()
        self._global_vectors = GloVe(name='840B', dim=300)

    def forward(self, html: str, title: str, flatten=True) -> torch.tensor:
        soup = BeautifulSoup(html)
        ps = self.get_paragraphs(soup, title)
        if flatten:
            # Treat all paragraphs the same
            ps = [token for para in ps for token in para]

        para_emb = self.to_paragraph_embedding(ps)

        code = self.get_code(soup)
        return para_emb

    def preprocess(self, text: str) -> List[str]:
        doc = en(text.lower())
        tokens = [word.text for word in doc if not (word.is_stop or word.is_punct or word.like_num)]
        return tokens

    def get_paragraphs(self, soup: BeautifulSoup, title: str=None) -> List[List[str]]:
        paras = [self.preprocess(x.get_text()) for x in soup.find_all('p')]
        # If title is available add it to the paragraphs
        if title is not None:
            paras += self.preprocess(title)
        return paras

    def to_paragraph_embedding(self, tokens: List[str]):
        word_embeddings = self._global_vectors.get_vecs_by_tokens(tokens)
        return torch.sum(word_embeddings, dim=0) / len(tokens)


    def get_code(self, soup: BeautifulSoup) -> str:
        return "\n".join([x.get_text() for x in soup.find_all('code')])


html = sample_post['Body'].item()

embedding = PostEmbedding()
embedding(html, sample_post['Title'].item())

In [None]:
pe = PostEmbedding()
post_df['code_snippets'] = post_df['Body'].apply(lambda html: pe.get_code(BeautifulSoup(html)))
post_df

In [None]:
import re

python_class_name_pattern = r"class ([a-zA-Z_$][a-zA-Z_$0-9]*)[:(]"
py_func_name_pattern = r"def ([a-zA-Z_$][a-zA-Z_$0-9]*)\("
py_import_pattern = r"(?m)^(?:from[ ]+(\S+)[ ]+)?import[ ]+(\S+)(?:[ ]+as[ ]+\S+)?[ ]*$"
py_variable_names = r"([a-zA-Z_$0-9]+)[ ]="

In [None]:
def find_code_features(code_snippet: str):
    class_names = re.findall(python_class_name_pattern, code_snippet)
    func_names = re.findall(py_func_name_pattern, code_snippet)
    import_names = list(sum(re.findall(py_import_pattern, code_snippet), ()))
    var_names = re.findall(py_variable_names, code_snippet)
    # return {
    #     'class_names' : class_names,
    #     'func_names' : func_names,
    #     'import_names' : import_names,
    #     'var_names' : var_names
    # }
    return import_names

post_df['import_names'] = post_df['code_snippets'].apply(find_code_features)

In [None]:
post_df[['code_snippets', 'import_names']]

In [None]:
a = post_df[['code_snippets', 'import_names']]
a.dtypes

## Building the graph
We want to build a user-expertise graph

In [None]:
USER_ID = 653
user_df.loc[USER_ID]

In [None]:
questions_by_user = pd.read_sql_query(f"SELECT * FROM Post WHERE OwnerUserId={USER_ID} AND PostTypeId=1", db)
questions_by_user.set_index('PostId', inplace=True)
questions_by_user

In [None]:
answers_by_user = pd.read_sql_query(f"SELECT * FROM Post WHERE OwnerUserId={USER_ID} AND PostTypeId=2", db)
answers_by_user.set_index('PostId', inplace=True)
answers_by_user

In [None]:
comments_by_user = pd.read_sql_query(f"SELECT * FROM Comment WHERE UserId={USER_ID}", db)
comments_by_user.set_index('CommentId', inplace=True)
comments_by_user

In [15]:
def parse_tag_list(tag_list: str) -> List[str]:
    return tag_list[1:-1].split("><")

def get_parent_tags(post_id: int) -> str:
    tags = pd.read_sql_query(f"SELECT Tags FROM Post WHERE PostId={post_id}", db)
    return tags['Tags'].item()

In [None]:
tag_frequency = {}

for i, row in questions_by_user.iterrows():
    tags = parse_tag_list(row.Tags)
    for t in tags:
        if t in tag_frequency:
            tag_frequency[t] += 1
        else:
            tag_frequency[t] = 1

for i, row in answers_by_user.iterrows():
    tag_list = get_parent_tags(row.ParentId)
    if tag_list is None:
        continue
    for t in parse_tag_list(tag_list):
        if t in tag_frequency:
            tag_frequency[t] += 1
        else:
            tag_frequency[t] = 1

{k: v for k, v in sorted(tag_frequency.items(), key=lambda item: item[1], reverse=True)}

So we can see there are many categories and for a single user; but there is a lot of variance.
Can we create an embedding for tags which reflect similar categories?

Step 1. Build list of Tag vocabulary

In [6]:
tag_df.head()

Unnamed: 0_level_0,TagName,Count
TagId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,.net,323469
2,html,1146245
3,javascript,2426650
4,css,771867
5,php,1445747


In [126]:
tag_df["TagName"].to_csv("tag_vocab.csv", index=False)
tag_vocab = list(set(tag_df["TagName"]))

In [None]:
post_tags = pd.read_sql_query(f"SELECT Tags FROM Post WHERE PostTypeId=1", db)
post_tags.to_csv("all_tags.csv", index=False)

In [13]:
post_tags = pd.read_csv("all_tags.csv")

In [16]:
tag_list_df = post_tags['Tags'].apply(lambda row: parse_tag_list(row))

In [17]:
combinations = tag_list_df.apply(lambda row: list(itertools.combinations(row, 2)))

In [18]:
combinations = combinations[combinations.astype(str) != '[]']

In [118]:
tag_pairs = []
for i in combinations:
    tag_pairs += i

In [119]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [120]:
import random

TRAIN_SIZE = 10000
tag_pairs = random.sample(tag_pairs, TRAIN_SIZE)
tag_pairs

[('memory', 'buffer'),
 ('google-cloud-functions', 'google-cloud-pubsub'),
 ('php', 'polling'),
 ('jtable', 'abstracttablemodel'),
 ('qimage', 'libtiff'),
 ('arrays', 'filter'),
 ('breakpoints', 'xdebug'),
 ('node.js', 'dojo'),
 ('centos', 'selinux'),
 ('android', 'kotlin'),
 ('c++', 'g++'),
 ('wso2-api-manager', 'wso2-identity-server'),
 ('express', 'express-session'),
 ('jquery', 'html'),
 ('linq-to-sql', 'checkbox'),
 ('php', 'jquery'),
 ('ruby-on-rails-4', 'rspec'),
 ('xml', 'dom'),
 ('php', 'email'),
 ('html', 'tabulator'),
 ('objective-c', 'memory-management'),
 ('mysql', 'database'),
 ('r', 'ggplot2'),
 ('c', 'coffeescript'),
 ('performance', 'cycle'),
 ('android', 'titanium'),
 ('ggplot2', 'colorbar'),
 ('php', 'imap'),
 ('rubygems', 'easyhtmlreport'),
 ('sql-function', 'table-variable'),
 ('javascript', 'node.js'),
 ('arrays', 'string'),
 ('function', 'dplyr'),
 ('filter', 'shinydashboard'),
 ('java', 'spring'),
 ('javascript', 'signalr'),
 ('c#', 'video-encoding'),
 ('javascr

In [121]:
len(tag_pairs), len(tag_vocab)

(10000, 63653)

In [122]:
tag_to_ix = {tag: i for i, tag in enumerate(tag_vocab)}

In [123]:
class TagEmbedding(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(TagEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embedding(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [124]:
losses = []
loss_function = nn.NLLLoss()
model = TagEmbedding(vocab_size=len(tag_vocab), embedding_dim=20).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [125]:
from tqdm import tqdm

for epoch in range(1):
    total_loss = 0
    for tag_a, tag_b in tqdm(tag_pairs):
        tag_a_id = torch.tensor(tag_to_ix[tag_a], dtype=torch.long).to(device)
        model.zero_grad()
        log_probs = model(tag_a_id)
        loss = loss_function(log_probs.flatten(), torch.tensor(tag_to_ix[tag_b], dtype=torch.long).to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)


100%|██████████| 10000/10000 [03:48<00:00, 43.82it/s]


In [106]:
embd_a = model.embedding.weight[tag_to_ix["python"]]
embd_b = model.embedding.weight[tag_to_ix["java"]]

In [107]:
sim = torch.nn.CosineSimilarity(dim=0)
sim(embd_a, embd_b)

tensor(-0.0741, grad_fn=<SumBackward1>)

In [112]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/visualisation')

In [113]:
writer.add_embedding(model.embedding.weight,
                         metadata  = tag_vocab,
                        tag = f'Tag embedding')



In [111]:
writer.close()