In [None]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display
import logging
import pprint
from os import listdir
from os.path import isfile, join

import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim.corpora import MmCorpus, Dictionary
from gensim.models import TfidfModel, LdaModel, HdpModel

# Emit INFO-level log records into the cell output. Each record starts with a
# carriage return ('\r') followed by timestamp, level and message.
logging.basicConfig(format='\r%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Pretty-printer configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)
# Text encoding passed to every pd.read_csv call in this notebook.
ENCODING = 'iso-8859-1'


In [None]:
def get_files(path):
    """Return the names of the regular files located directly in ``path``.

    Entries that are not files (e.g. sub-directories) are skipped. The names
    are returned without the directory prefix, in ``listdir`` order.
    """
    file_names = []
    for entry in listdir(path):
        if isfile(join(path, entry)):
            file_names.append(entry)
    return file_names

## Setup

### CSV files

In [3]:
PATH_TAGS = '../data/stacksample/Tags.csv'
# keep_default_na=False: by default pandas treats strings such as 'null',
# 'nan', 'NA' and 'N/A' as missing values, so a tag spelled like one of them
# would be loaded as float NaN instead of text and later break string
# operations on the 'Tag' column (e.g. ' '.join(real_tags)).
tags_df = pd.read_csv(PATH_TAGS, encoding=ENCODING, keep_default_na=False)

In [4]:
# Only these columns are read from the questions file.
COLUMNS_QUESTION = ['Title', 'Body', 'Id']
QUESTIONS_PATH = '../data/stacksample/Questions.csv'
# Number of rows read from the top of the file. The analysis below looks
# questions up by row index, so every index used must be smaller than this.
N_ROWS = 100

csv_data  = pd.read_csv(QUESTIONS_PATH, encoding=ENCODING, usecols=COLUMNS_QUESTION, nrows=N_ROWS)


### Dictionary file

In [5]:
# Previously saved gensim Dictionary; used below to turn tokens into
# bag-of-words vectors (doc2bow) and to map ids back to words.
DICT_FILE = 'question_title_body.dict'
dict_texts = Dictionary.load(f'../data/stacksample/dictionary/{DICT_FILE}')

2018-06-03 19:36:41,023 : INFO : loading Dictionary object from ../data/stacksample/dictionary/question_title_body.dict
2018-06-03 19:36:42,510 : INFO : loaded ../data/stacksample/dictionary/question_title_body.dict


### Model files

In [6]:
def _model_dropdown(file_names, marker, description):
    """Build a Dropdown listing the file names that contain ``marker``.

    Names containing 'npy' are left out so the array side files stored next
    to a model are not offered.
    NOTE(review): other side files (e.g. '*.lda.state') still match and are
    offered as options; confirm the intended file is selected before loading.
    """
    options = [name for name in file_names if marker in name and 'npy' not in name]
    return widgets.Dropdown(options=options, description=description)


# List the models directory once and derive all three dropdowns from it.
_model_file_names = get_files('../data/stacksample/models')
tfidf_model_files = _model_dropdown(_model_file_names, 'tfidf', 'TFIDF model file:')
lda_model_files = _model_dropdown(_model_file_names, 'lda', 'LDA model file:')
hpd_model_files = _model_dropdown(_model_file_names, 'hpd', 'HPD model file:')
display(tfidf_model_files, lda_model_files, hpd_model_files)

In [7]:
# Loads the file currently selected in the LDA dropdown; the selection is read
# when this cell runs, so the result depends on the widget's state.
lda_model = LdaModel.load(f'../data/stacksample/models/{lda_model_files.value}')


2018-06-03 19:37:05,388 : INFO : loading LdaModel object from ../data/stacksample/models/question_compact_text_100.lda
2018-06-03 19:37:05,447 : INFO : loading expElogbeta from ../data/stacksample/models/question_compact_text_100.lda.expElogbeta.npy with mmap=None
2018-06-03 19:37:06,987 : INFO : setting ignored attribute id2word to None
2018-06-03 19:37:06,988 : INFO : setting ignored attribute dispatcher to None
2018-06-03 19:37:06,990 : INFO : setting ignored attribute state to None
2018-06-03 19:37:06,992 : INFO : loaded ../data/stacksample/models/question_compact_text_100.lda
2018-06-03 19:37:06,994 : INFO : loading LdaState object from ../data/stacksample/models/question_compact_text_100.lda.state
2018-06-03 19:37:07,053 : INFO : loading sstats from ../data/stacksample/models/question_compact_text_100.lda.state.sstats.npy with mmap=None
2018-06-03 19:37:08,544 : INFO : loaded ../data/stacksample/models/question_compact_text_100.lda.state


In [8]:
# Loads the file currently selected in the TFIDF dropdown (read at run time).
tfidf_model = TfidfModel.load(f'../data/stacksample/models/{tfidf_model_files.value}')


2018-06-03 19:37:09,717 : INFO : loading TfidfModel object from ../data/stacksample/models/question_title_body.tfidf
2018-06-03 19:37:14,561 : INFO : loading id2word recursively from ../data/stacksample/models/question_title_body.tfidf.id2word.* with mmap=None
2018-06-03 19:37:14,563 : INFO : loaded ../data/stacksample/models/question_title_body.tfidf


In [9]:
# Loads the file currently selected in the third dropdown as a gensim HdpModel
# (the variable and file suffix are spelled 'hpd' throughout this notebook).
hpd_model = HdpModel.load(f'../data/stacksample/models/{hpd_model_files.value}')


2018-06-03 19:37:16,141 : INFO : loading HdpModel object from ../data/stacksample/models/question_compact_text.hpd
2018-06-03 19:37:16,539 : INFO : loading corpus recursively from ../data/stacksample/models/question_compact_text.hpd.corpus.* with mmap=None
2018-06-03 19:37:16,540 : INFO : loading id2word recursively from ../data/stacksample/models/question_compact_text.hpd.id2word.* with mmap=None
2018-06-03 19:37:16,542 : INFO : loading lda_beta from ../data/stacksample/models/question_compact_text.hpd.lda_beta.npy with mmap=None
2018-06-03 19:37:22,197 : INFO : loading m_lambda from ../data/stacksample/models/question_compact_text.hpd.m_lambda.npy with mmap=None
2018-06-03 19:37:28,050 : INFO : loading m_Elogbeta from ../data/stacksample/models/question_compact_text.hpd.m_Elogbeta.npy with mmap=None
2018-06-03 19:37:34,579 : INFO : loaded ../data/stacksample/models/question_compact_text.hpd


## Analysis functions

In [77]:
def get_real_tags(question_id):
    """Return, as a list, the 'Tag' values of the ``tags_df`` rows whose 'Id'
    equals ``question_id``.

    An id with no matching rows yields an empty list.
    """
    belongs_to_question = tags_df['Id'] == question_id
    return list(tags_df[belongs_to_question]['Tag'].values)


def get_predicted_tags(question_index, model):
    """Apply ``model`` to the bag-of-words of the question at ``question_index``.

    Returns a pair: whatever ``model[bow]`` yields, and the question's id.
    """
    bow, identifier, _body = get_bow_question(question_index)
    prediction = model[bow]
    return prediction, identifier


def compare_real_predicted(question_index, model, prob = 0.0):
    """Count how many of a question's predicted entries match its real tags.

    Parameters
    ----------
    question_index : row index of the question in ``csv_data``.
    model : object indexable with a bag-of-words, yielding ``(id, score)`` pairs.
    prob : only entries whose score is strictly greater than this are kept.

    Returns
    -------
    (correct, total) : ``correct`` is the number of kept entries whose
        dictionary word occurs in the real tags; ``total`` is the larger of
        the number of real tags and the number of kept entries.
    """
    predicted_tags, question_id = get_predicted_tags(question_index, model)
    predicted_tags = [p_tag for p_tag in predicted_tags if p_tag[1] > prob]
    real_tags = get_real_tags(question_id)

    # Built once: the searched text does not depend on the predicted entry.
    # Matching is by substring, so a word that is only part of a tag counts too.
    joined_real_tags = ' '.join(real_tags)
    correct_predictions = sum(
        1 for p_tag in predicted_tags if dict_texts[p_tag[0]] in joined_real_tags
    )
    return correct_predictions, max(len(real_tags), len(predicted_tags))


def get_bow_question(index):
    """Return ``(bag_of_words, question_id, body_text)`` for one question.

    The bag-of-words is built from the preprocessed 'Title' only; the 'Body'
    is passed through untouched.
    """
    title, question_id, body = (
        csv_data[column][index] for column in ('Title', 'Id', 'Body')
    )
    title_tokens = preprocess_string(title)
    bag_of_words = dict_texts.doc2bow(title_tokens)
    return bag_of_words, question_id, body


def display_results(index, model):
    """Print a question's id, body, real tags and the model's output for it.

    Each ``(id, score)`` pair produced by ``model`` is printed with the id
    translated to a word through ``dict_texts``.
    NOTE(review): for topic models (LDA/HDP) the ids are presumably topic
    numbers rather than dictionary word ids, so the printed "Tag" would be an
    unrelated word; confirm against the gensim documentation.
    """
    question_bow, question_id, text = get_bow_question(index)
    predicted_tags = model[question_bow]
    tags = get_real_tags(question_id)
    print(f'Question {question_id}:\n{text}\nReal tags:\n{tags}\n\nPredicted tags:')
    # Plain loop: printing is a side effect, so no throwaway list is built.
    for tag_id, p in predicted_tags:
        print(f'Tag: {dict_texts[tag_id]}, Probability: {p}')


In [78]:
def compare_n_tags(n, model, prob = 0.0):
    """Sum the results of ``compare_real_predicted`` over question indices 0..n-1.

    Returns ``(total_correct, total_considered)``, the element-wise sums of
    the per-question pairs. ``prob`` is forwarded unchanged.
    """
    total_correct, total_considered = 0, 0
    for question_index in range(n):
        hits, considered = compare_real_predicted(question_index, model, prob)
        total_correct += hits
        total_considered += considered
    return total_correct, total_considered


## Model analysis

### TFIDF Model

In [79]:
# Inspect one question with the TF-IDF model, then score it while keeping only
# predicted entries whose weight is greater than 0.4.
IDX = 0
display_results(IDX, tfidf_model)
print(compare_real_predicted(IDX, tfidf_model, 0.4))

Question 80:
<p>I've written a database generation script in <a href="http://en.wikipedia.org/wiki/SQL">SQL</a> and want to execute it in my <a href="http://en.wikipedia.org/wiki/Adobe_Integrated_Runtime">Adobe AIR</a> application:</p>

<pre><code>Create Table tRole (
      roleID integer Primary Key
      ,roleName varchar(40)
);
Create Table tFile (
    fileID integer Primary Key
    ,fileName varchar(50)
    ,fileDescription varchar(500)
    ,thumbnailID integer
    ,fileFormatID integer
    ,categoryID integer
    ,isFavorite boolean
    ,dateAdded date
    ,globalAccessCount integer
    ,lastAccessTime date
    ,downloadComplete boolean
    ,isNew boolean
    ,isSpotlight boolean
    ,duration varchar(30)
);
Create Table tCategory (
    categoryID integer Primary Key
    ,categoryName varchar(50)
    ,parent_categoryID integer
);
...
</code></pre>

<p>I execute this in Adobe AIR using the following methods:</p>

<pre><code>public static function RunSqlFromFile(fileName:String):voi

In [81]:
# Aggregate score of the TF-IDF model over question indices 0..39 (threshold 0.4).
comparison = compare_n_tags(40, tfidf_model, 0.4)
n_correct, n_total = comparison
ratio = round(n_correct/n_total, 2)
print(f'Result: {n_correct}/{n_total} - {ratio}')

Result: 46/128 - 0.36


### LDA Model

In [42]:
# Same inspection with the LDA model; the default threshold (0.0) keeps every
# entry with a positive score. The last expression is shown as the cell output.
IDX = 50
display_results(IDX, lda_model)
compare_real_predicted(IDX, lda_model)

Question 4430:
<p>Is there available any tool for PHP which can be used to generate code for consuming a <a href="http://en.wikipedia.org/wiki/Web_service">web service</a> based on its <a href="http://en.wikipedia.org/wiki/Web_Services_Description_Language">WSDL</a>? Something comparable to clicking "Add Web Reference" in Visual Studio or the Eclipse plugin which does the same thing for Java.</p>

Real tags:
['php', 'web-services', 'visual-studio', 'wsdl']

Predicted tags:
Tag: isfavorit, Probability: 0.29556527733802795
Tag: parent, Probability: 0.17092831432819366
Tag: try, Probability: 0.2040792852640152
Tag: ve, Probability: 0.16942711174488068


(0, 4)

In [43]:
# Aggregate (correct, total) for the LDA model over question indices 0..79.
compare_n_tags(80, lda_model)

(8, 266)

### HDP Model

In [44]:
# Same inspection with the model loaded via HdpModel, using the default
# threshold (0.0). The last expression is shown as the cell output.
IDX = 10
display_results(IDX, hpd_model)
compare_real_predicted(IDX, hpd_model)

Question 930:
<p>What's the simplest way to connect and query a database for a set of records in C#?</p>

Real tags:
['c#', 'database', 'loops', 'connection']

Predicted tags:
Tag: 50, Probability: 0.010119895685787621
Tag: bytesavail, Probability: 0.8055925513114077


(0, 2)