In [1]:
from __future__ import print_function
from os import listdir
from os.path import isfile, join

import logging
import json
import pprint
import re

import gensim  #for topic modelling
import nltk  # for text preprocessing
import pandas as pd  #for io
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, \
    remove_stopwords, stem_text
from ipywidgets import fixed, interact_manual

logging.basicConfig(format='\r%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [2]:
def get_files(path):
    return [f for f in listdir(path) if isfile(join(path, f))]

In [3]:
ENCODING = 'iso-8859-1'
PATH_ANSWERS = '../data/stacksample/Answers.csv'
pp = pprint.PrettyPrinter(indent=4)


In [4]:
PATH_QUESTIONS = '../data/stacksample/Questions.csv'
COLUMNS_QUESTION = ['Title', 'Body']
N_ROWS = 10

question_df = pd.read_csv(PATH_QUESTIONS, encoding=ENCODING, nrows=N_ROWS)


In [11]:
question_df.head

<bound method NDFrame.head of     Id  OwnerUserId          CreationDate            ClosedDate  Score  \
0   80           26  2008-08-01T13:57:07Z                   NaN     26   
1   90           58  2008-08-01T14:41:24Z  2012-12-26T03:45:49Z    144   
2  120           83  2008-08-01T15:50:08Z                   NaN     21   
3  180      2089740  2008-08-01T18:42:19Z                   NaN     53   
4  260           91  2008-08-01T23:22:08Z                   NaN     49   
5  330           63  2008-08-02T02:51:36Z                   NaN     29   
6  470           71  2008-08-02T15:11:47Z  2016-03-26T05:23:29Z     13   
7  580           91  2008-08-02T23:30:59Z                   NaN     21   
8  650          143  2008-08-03T11:12:52Z                   NaN     79   
9  810          233  2008-08-03T20:35:01Z                   NaN      9   

                                               Title  \
0  SQLStatement.execute() - multiple queries in o...   
1  Good branching and merging tutorials for

In [5]:
question_df['All'] = question_df['Title'].map(str) + question_df['Body']

In [6]:
PATH_TAGS = '../data/stacksample/Tags.csv'
tags_df = pd.read_csv(PATH_TAGS, encoding=ENCODING)

In [7]:
def print_state(current, target, loading=True):
    if loading:
        state = round((current / target) * 100.0, 2)
        print('\rLoading: ' + str(state) + '%', end='')
    else:
        print(f'\rRow {current}/{target}', end='')


In [12]:
stemmer = nltk.stem.PorterStemmer()
ALLOWED_SHORT_WORDS = ['c', 'c#', 'r', '3d', '2d', '1d', '7z', 'qt']


def remove_html_tags(text):
    return re.sub("(\<.*?\>)", "", text)


def filter_text(tokenized_text):
    return [stemmer.stem(word) for word in tokenized_text if stemmer.stem(word) not in stop_words]


def tokenize_text(text):
    return filter_text(gensim.utils.simple_preprocess(str(remove_html_tags(text)), deacc=True))


def strip_short(word_list):    
    return [str(word).lower() for word in word_list if len(word) > 2 or word in ALLOWED_SHORT_WORDS]


def prepare_document(document, e_filters=[], show_state=True, show_idx=1):
    output = []
    doc_length = len(document)
    for idx, text in enumerate(document):
        if show_state and idx % show_idx == 0:
            print_state(idx + 1, doc_length)
        output.append(simple_tokenize_text(text, extra_filters=e_filters))
    return output


def simple_tokenize_text(text, extra_filters=[], remove_short=True):
    text_filters=[strip_tags, strip_punctuation, strip_multiple_whitespaces, remove_stopwords]
    if (len(extra_filters) > 0): 
        text_filters = text_filters + extra_filers
    p = preprocess_string(text, filters=text_filters)
    if (remove_short):
        return strip_short(p)
    else:
        return p

---

# Corpus parsing

In [13]:
N_ROWS = 1000
q_titles = question_df['Title'][:]
q_texts = question_df['All'][:]
#q_tags = tags_df['Tag'][:]


## Corpus tokenization

In [15]:
def corpus_to_tokens(corpus, file_name='', save_data=False):
    q_texts_tokenized = prepare_document(corpus, show_idx=1000)
    if save_data:
        with open(f'../data/stacksample/tokenized/{file_name}.json', 'w', encoding=ENCODING) as f:
            print('Saving data...')
            json.dump(q_texts_tokenized, f)
            print(f'Data saved as {file_name}.json')

In [16]:
q_texts_tokenized = interact_manual(corpus_to_tokens, corpus=fixed(q_texts), file_name='new_file_name', save_data=False)

## Dictionary creation

In [8]:
logging.StreamHandler.terminator = ''
def build_dictionary(file_name, save_dict_name='', save_data=False):
    with open(f'../data/stacksample/tokenized/{file_name}') as f:
        print('Waiting for data...')
        tokenized_corpus = json.load(f)
        dict_texts = gensim.corpora.Dictionary(tokenized_corpus)
        if save_data:
            print('Saving data...')
            dict_texts.save(f'../data/stacksample/dictionary/{save_dict_name}.dict')
    return dict_texts


In [9]:
interact_manual(build_dictionary, file_name=get_files('../data/stacksample/tokenized'), 
                save_dict_name='new_file_name', save_data=True)

<function __main__.build_dictionary>

## *Bag of words* creation

In [10]:
logging.StreamHandler.terminator = ''
def build_bow(tokenized_file, dictionary_file, save_file='', save_data=False):
    corpus_dictionary = gensim.corpora.Dictionary.load(f'../data/stacksample/dictionary/{dictionary_file}')
    with open(f'../data/stacksample/tokenized/{tokenized_file}') as f:
        tokenized_corpus = json.load(f)
    corpus_bow = [corpus_dictionary.doc2bow(text) for text in tokenized_corpus]
    if save_data:
        gensim.corpora.MmCorpus.serialize(f'../data/stacksample/bow/{save_file}.mm', corpus_bow)
    return corpus_bow


In [11]:
interact_manual(build_bow, tokenized_file=get_files('../data/stacksample/tokenized'), 
                dictionary_file=get_files('../data/stacksample/dictionary'), save_file='new_file_name', save_data=True)


<function __main__.build_bow>

---

In [35]:
from six import iteritems

DICT_NAME = 'question_title_body.dict'

dictionary = gensim.corpora.Dictionary.load(f'../data/stacksample/dictionary/{DICT_NAME}')

2018-06-03 17:26:54,048 : INFO : loading Dictionary object from ../data/stacksample/dictionary/question_title_body.dict
2018-06-03 17:26:55,261 : INFO : loaded ../data/stacksample/dictionary/question_title_body.dict


In [37]:
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]

In [38]:
dictionary.filter_tokens(once_ids)

In [39]:
dictionary.compactify()

In [40]:
dictionary.save('../data/stacksample/dictionary/question_title_body_compact.dict')

2018-06-03 17:28:35,864 : INFO : saving Dictionary object under ../data/stacksample/dictionary/question_title_body_compact.dict, separately None
2018-06-03 17:28:36,208 : INFO : saved ../data/stacksample/dictionary/question_title_body_compact.dict


In [30]:
def get_bow_question(index):
    tokenized_text = preprocess_string(question_df['Title'][index])
    return dict_texts.doc2bow(tokenized_text)


In [35]:
pp.pprint(ldamodel.print_topics(num_topics=10, num_words=2))

[   (0, '0.024*"build" + 0.021*"file"'),
    (1, '0.017*"code" + 0.012*"string"'),
    (2, '0.010*"us" + 0.009*"user"'),
    (3, '0.052*"script" + 0.032*"http"'),
    (4, '0.020*"tabl" + 0.011*"file"'),
    (5, '0.018*"server" + 0.011*"applic"'),
    (6, '0.019*"class" + 0.014*"public"'),
    (7, '0.014*"data" + 0.013*"object"'),
    (8, '0.020*"text" + 0.012*"string"'),
    (9, '0.015*"page" + 0.012*"applic"')]


In [33]:
pp.pprint(ldamodel[get_bow_question(5)])

[   (0, 0.020003743),
    (1, 0.02000364),
    (2, 0.8199628),
    (3, 0.020003308),
    (4, 0.02000456),
    (5, 0.02000247),
    (6, 0.02000974),
    (7, 0.02000231),
    (8, 0.02000356),
    (9, 0.020003838)]
