In [1]:
import sqlite3
import pandas as pd
import logging
import pickle
# Configure root logging with the library defaults, then let INFO and above through.
logging.basicConfig()
logging.root.setLevel(logging.INFO)
# Named logger used for this notebook's own progress messages.
log = logging.getLogger(name="training-set-builder")

# Open the SQLite database file that the queries in this notebook read from.
db = sqlite3.connect(database='../stackoverflow.db')

In [2]:
# Maximum number of questions to pull from the database; interpolated into the
# LIMIT clause of the question-selection query in the next cell.
QUESTIONS_RETREIVED = 100

In [3]:
# Select questions tagged <python> whose answers have a combined score above 15,
# capped at QUESTIONS_RETREIVED rows.
valid_questions_sql = f"""
        SELECT Q.PostId, Q.Body, Q.Title, Q.OwnerUserId FROM Post Q
        INNER JOIN Post A ON Q.PostId = A.ParentId
        WHERE (Q.Tags LIKE '%<python>%')
        GROUP BY A.ParentId
        HAVING SUM(A.Score) > 15
        LIMIT {QUESTIONS_RETREIVED}
"""
# Relabel the four selected columns positionally with snake_case names.
valid_questions = pd.read_sql_query(valid_questions_sql, db).set_axis(
    ['post_id', 'question_body', 'question_title', 'question_user_id'],
    axis="columns",
)
valid_questions

Unnamed: 0,post_id,question_body,question_title,question_user_id
0,337,<p>I am about to build a piece of a project th...,XML Processing in Python,111.0
1,469,<p>I am using the Photoshop's javascript API t...,How can I find the full path to a font from it...,147.0
2,502,<p>I have a cross-platform (Python) applicatio...,Get a preview JPEG of a PDF on Windows?,147.0
3,535,<p>I am starting to work on a hobby project wi...,Continuous Integration System for a Python Cod...,154.0
4,594,<p>There are several ways to iterate over a re...,cx_Oracle: How do I iterate over a result set?,116.0
...,...,...,...,...
95,38601,<p>How can I use the nifty JavaScript date and...,Using Django time/date widgets in custom form,2592.0
96,38987,<p>I want to merge two dictionaries into a new...,How do I merge two dictionaries in a single ex...,3207.0
97,39086,<p>I want to loop over the contents of a text ...,Search and replace a line in a file in Python,4166.0
98,39104,<p>I've written a Python package that includes...,Finding a file in a Python module distribution,4198.0


In [4]:
# Toggle: set to False to skip writing the selected question ids to disk.
save = True
if save:
    with open("../data/raw/valid_questions.pkl", "wb") as ids_file:
        # Persist only the post ids (as a plain list), not the whole frame.
        ids_file.write(pickle.dumps(valid_questions['post_id'].to_list()))

In [4]:
def fetch_questions_by_user(user_id: int, db):
    """Fetch the python-tagged questions that a user owns or last edited.

    Args:
        user_id: Id of the user. It is coerced with ``int()`` and bound as a
            query parameter, so integral floats such as ``111.0`` are accepted
            and arbitrary strings can no longer be spliced into the SQL.
        db: Open ``sqlite3`` connection to the database holding the ``Post`` table.

    Returns:
        DataFrame with every ``Post`` column for rows where ``PostTypeId = 1``,
        ``Tags`` contains ``python`` and the user is the owner or last editor,
        indexed by ``PostId``.

    Raises:
        ValueError, TypeError: If ``user_id`` cannot be converted to ``int``.
    """
    user_id = int(user_id)
    # NOTE(review): '%python%' matches any tag containing "python" (e.g. <python-3.x>),
    # whereas the question-selection query in this notebook uses '%<python>%' - confirm
    # which filter is intended.
    questions_df = pd.read_sql_query(
        """
            SELECT *
            FROM Post
            WHERE Tags LIKE '%python%' AND (PostTypeId = 1) AND ((LastEditorUserId = ?) OR (OwnerUserId = ?))
        """,
        db,
        params=(user_id, user_id),
    )
    return questions_df.set_index('PostId')

In [5]:
def fetch_answers_by_user(user_id: int, db):
    answers_df = pd.read_sql_query(f"""
            SELECT A.Tags, B.*
            FROM Post A
                INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
            WHERE A.Tags LIKE '%python%' AND (B.PostTypeId = 2) AND ((B.LastEditorUserId = {user_id}) OR (B.OwnerUserId = {user_id}))
    """, db)
    answers_df = answers_df.loc[:, ~answers_df.columns.duplicated()].copy()
    answers_df.set_index('PostId', inplace=True)
    return answers_df

In [6]:
def fetch_answers_for_question(question_post_id: int, db):
    """Fetch body, author and score of every post whose parent is the given question.

    Args:
        question_post_id: ``PostId`` of the question. It is coerced with
            ``int()`` and bound as a query parameter rather than interpolated
            into the SQL text.
        db: Open ``sqlite3`` connection to the database holding the ``Post`` table.

    Returns:
        DataFrame with columns ``Body``, ``OwnerUserId`` and ``Score``. Rows
        with a missing value in any of the three columns are dropped.

    Raises:
        ValueError, TypeError: If ``question_post_id`` cannot be converted to ``int``.
    """
    question_post_id = int(question_post_id)
    answers_df = pd.read_sql_query(
        """
            SELECT Body, OwnerUserId, Score
            FROM Post
            WHERE ParentId = ?
        """,
        db,
        params=(question_post_id,),
    )
    # Answers without a body, author or score cannot be used downstream.
    return answers_df.dropna()

In [7]:
def fetch_comments_by_user(user_id: int, db):
    """Fetch a user's comments on python-tagged questions and on their answers.

    Two queries are run and their results concatenated: comments attached
    directly to questions (``PostTypeId = 1``), followed by comments attached
    to answers (``PostTypeId = 2``) whose parent question's ``Tags`` contains
    ``python``.

    Args:
        user_id: Id of the commenting user. It is coerced with ``int()`` and
            bound as a query parameter rather than interpolated into the SQL text.
        db: Open ``sqlite3`` connection to the database holding the ``Post``
            and ``Comment`` tables.

    Returns:
        DataFrame indexed by ``CommentId`` holding the question's ``Tags`` plus
        every ``Comment`` column. Question comments come first, then answer
        comments.

    Raises:
        ValueError, TypeError: If ``user_id`` cannot be converted to ``int``.
    """
    user_id = int(user_id)

    comments_on_questions_df = pd.read_sql_query(
        """
            SELECT A.Tags, B.*
            FROM Post A
                INNER JOIN Comment B ON (B.PostId = A.PostId)
            WHERE A.Tags LIKE '%python%' AND (B.UserId = ?) AND (A.PostTypeId = 1)
        """,
        db,
        params=(user_id,),
    ).set_index('CommentId')

    # A is the question, B the answer to it, C the comment left on that answer.
    comments_on_answers_df = pd.read_sql_query(
        """
        SELECT A.Tags, C.*
        FROM Post A
            INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
            INNER JOIN Comment C ON (B.PostId = C.PostId)
        WHERE A.Tags LIKE '%python%' AND (C.UserId = ?) AND (B.PostTypeId = 2)
        """,
        db,
        params=(user_id,),
    ).set_index('CommentId')

    return pd.concat([comments_on_questions_df, comments_on_answers_df])

In [8]:
def construct_graph(user_id, db):
    """Build the graph for one user from their questions, answers and comments.

    A new ``StaticGraphConstruction`` is created, the user's three kinds of
    activity are fetched from ``db``, and the result of ``construct`` is returned.
    """
    builder = StaticGraphConstruction()
    # Dict entries are evaluated top to bottom: questions, then answers, then comments.
    user_activity = {
        "questions": fetch_questions_by_user(user_id, db),
        "answers": fetch_answers_by_user(user_id, db),
        "comments": fetch_comments_by_user(user_id, db),
    }
    return builder.construct(**user_activity)

In [9]:
import torch
from datetime import date
from post_embedding_builder import PostEmbedding
from static_graph_construction import StaticGraphConstruction
import pickle

post_embedding_builder = PostEmbedding()

# Number of samples accumulated in memory before they are flushed to one pickle file.
FILE_BATCH_SIZE = 10


def _empty_batch():
    """Return a fresh, empty container for one batch of training samples."""
    return {"graphs": [], "questions": [], "answers": [], "labels": []}


def _write_batch(batch, batch_index):
    """Pickle one batch into its own numbered file under ../data/raw."""
    # The batch index keeps file names unique. Writing every batch to a single
    # date-named path in "wb" mode would overwrite the previous batch each time.
    # NOTE(review): any reader expecting a single "batch<date>.pkl" file must
    # be updated to load the numbered files - confirm against the consumer.
    path = f"../data/raw/batch{date.today():%m-%d-%Y}_{batch_index}.pkl"
    with open(path, "wb") as f:
        pickle.dump(batch, f)
    log.info(f"wrote batch {batch_index} ({len(batch['labels'])} samples) to {path}")


data = _empty_batch()
batch_index = 0

for question_c, row in enumerate(valid_questions.itertuples(), start=1):
    log.info(f"processing question {question_c}")
    # Build Question embedding
    question_word_emb, question_code_emb, _ = post_embedding_builder(
        row.question_body,
        use_bert=True,
        title=row.question_title
    )
    question_emb = torch.concat((question_word_emb, question_code_emb))

    # Build Answer embeddings; each itertuples() row is (index, Body, OwnerUserId, Score)
    answers_df = fetch_answers_for_question(row.post_id, db)
    for answer_c, (_, answer_body, answer_user_id, score) in enumerate(answers_df.itertuples(), start=1):
        log.info(f"processing answer {answer_c}")
        answer_word_emb, answer_code_emb, _ = post_embedding_builder(
            answer_body, use_bert=True
        )
        answer_emb = torch.concat((answer_word_emb, answer_code_emb))
        # Construct User Graph
        ug = construct_graph(answer_user_id, db)

        data["graphs"].append(ug)
        data["questions"].append(question_emb)
        data["answers"].append(answer_emb)
        # Binary label: 1 for a positively scored answer, 0 otherwise
        data["labels"].append(1 if score > 0 else 0)

        # Dataset will grow larger than memory, so batch in pickle files
        if len(data["labels"]) >= FILE_BATCH_SIZE:
            _write_batch(data, batch_index)
            batch_index += 1
            data = _empty_batch()

# Flush the final, partially filled batch so no samples are lost.
if data["labels"]:
    _write_batch(data, batch_index)
    batch_index += 1
    data = _empty_batch()


INFO:post_embedding_builder:PostEmbedding instantiated!
INFO:torchtext.vocab.vectors:Loading vectors from .vector_cache\glove.840B.300d.txt.pt
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:post_em

In [10]:
# NOTE(review): `X_raw` is never assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel (see the recorded output). Define X_raw
# before this point or remove this leftover inspection cell.
X_raw[0]

NameError: name 'X_raw' is not defined

In [None]:
# NOTE(review): `X_raw` is not defined in the visible notebook; this unexecuted cell
# would raise NameError if run. Define X_raw first or remove the cell.
X_raw[10]