In [2]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display
import logging
import pprint
from os import listdir
from os.path import isfile, join

import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, \
    remove_stopwords, stem_text
from gensim.corpora import MmCorpus, Dictionary
from gensim.models import TfidfModel, LdaModel, HdpModel

# Configure root logging at INFO level; note that the record format begins with a carriage return.
logging.basicConfig(format='\r%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Shared pretty-printer configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)
# Character encoding handed to pd.read_csv when the CSV files are loaded.
ENCODING = 'iso-8859-1'

In [3]:
def get_files(path):
    """Return the names of the regular files found directly inside ``path``.

    Entries that are not files (e.g. sub-directories) are skipped. Names are
    returned without the directory prefix, in the order ``listdir`` yields them.
    """
    file_names = []
    for entry in listdir(path):
        if isfile(join(path, entry)):
            file_names.append(entry)
    return file_names

## Analysis functions
### Text formatting

In [None]:
# Tokens of one or two characters that strip_short keeps despite their length.
ALLOWED_SHORT_WORDS = ['c', 'c#', 'r', '3d', '2d', '1d', '7z', 'qt']


def strip_short(word_list):
    """Filter out tokens shorter than three characters.

    A short token survives only when it is listed in ``ALLOWED_SHORT_WORDS``.
    The relative order of the remaining tokens is preserved.

    :param word_list: iterable of string tokens
    :return: new list with the retained tokens
    """
    kept = []
    for token in word_list:
        if token in ALLOWED_SHORT_WORDS or len(token) > 2:
            kept.append(token)
    return kept


def tokenize_text(text):
    """Turn raw text into a list of tokens.

    The text is passed through gensim's ``preprocess_string`` with the filter
    chain below (applied in the listed order), and the resulting tokens are
    then pruned by ``strip_short``.

    :param text: raw text to tokenize
    :return: list of tokens
    """
    # NOTE(review): no explicit lower-casing filter is part of this chain - confirm that is intended.
    filter_chain = [
        strip_tags,
        strip_punctuation,
        strip_multiple_whitespaces,
        remove_stopwords,
        stem_text,
    ]
    tokens = preprocess_string(text, filters=filter_chain)
    return strip_short(tokens)

### Data analysis

In [72]:
def get_real_tags(question_id):
    """Return the tags recorded for a question.

    Looks up the rows of the module-level ``tags_df`` whose ``Id`` equals
    ``question_id`` and returns their ``Tag`` values as a plain list
    (empty when the question has no rows).
    """
    belongs_to_question = tags_df['Id'] == question_id
    matching_tags = tags_df.loc[belongs_to_question, 'Tag'].values
    return list(matching_tags)


def get_predicted_tags(question_index, model, dictionary):
    """Run ``model`` on the bag-of-words of one question.

    :param question_index: row label of the question, forwarded to ``get_bow_question``
    :param model: object indexable with a bag-of-words (``model[bow]``)
    :param dictionary: dictionary forwarded to ``get_bow_question``
    :return: tuple ``(model output, question id)``
    """
    # The question text is also returned by get_bow_question but is not needed here.
    bag_of_words, question_id, _unused_text = get_bow_question(question_index, dictionary)
    prediction = model[bag_of_words]
    return prediction, question_id


def compare_real_predicted(question_index, model, dictionary, prob=0.0):
    """Count how many predicted tags of one question match its real tags.

    A predicted entry is considered only when its weight is strictly greater
    than ``prob``. It counts as correct when its dictionary token occurs as a
    substring of the space-joined real tags, so a token such as ``java`` also
    matches the tag ``javascript``.

    :param question_index: row label of the question
    :param model: model forwarded to ``get_predicted_tags``
    :param dictionary: mapping from token id to token string
    :param prob: minimum weight (exclusive) for a prediction to be considered
    :return: tuple ``(correct predictions, max(number of real tags, number of considered predictions))``
    """
    predicted_tags, question_id = get_predicted_tags(question_index, model, dictionary)
    confident_tags = [p_tag for p_tag in predicted_tags if p_tag[1] > prob]
    real_tags = get_real_tags(question_id)

    # A tag that was parsed as a missing value is a float NaN, and str.join raises
    # TypeError on non-strings; only string tags take part in the matching.
    searchable_tags = ' '.join(tag for tag in real_tags if isinstance(tag, str))
    correct_predictions = sum(1 for p_tag in confident_tags if dictionary[p_tag[0]] in searchable_tags)

    # The denominator still counts every real tag, including ones excluded from matching.
    return correct_predictions, max(len(real_tags), len(confident_tags))


def get_bow_question(index, dictionary):
    """Build the bag-of-words for one question of the module-level ``csv_data``.

    The ``All`` column is tokenized with ``tokenize_text`` and converted with
    ``dictionary.doc2bow``.

    :param index: row label of the question in ``csv_data``
    :param dictionary: object providing ``doc2bow(tokens)``
    :return: tuple ``(bag-of-words, value of 'Id', value of 'Body')``
    """
    question_id = csv_data['Id'][index]
    body_text = csv_data['Body'][index]
    tokens = tokenize_text(csv_data['All'][index])
    return dictionary.doc2bow(tokens), question_id, body_text


def display_results(index, model, dictionary):
    """Print one question together with its real tags and the model's predictions.

    :param index: row label of the question, forwarded to ``get_bow_question``
    :param model: object indexable with a bag-of-words (``model[bow]``), yielding ``(token id, weight)`` pairs
    :param dictionary: mapping from token id to token string
    :return: None; everything is written to standard output
    """
    question_bow, question_id, text = get_bow_question(index, dictionary)
    predicted_tags = model[question_bow]
    tags = get_real_tags(question_id)
    print(f'Question {question_id}:\n{text}\nReal tags:\n{tags}\n\nPredicted tags:')
    # Plain loop: a list comprehension here would only build a throwaway list of None values.
    for tag_id, p in predicted_tags:
        print(f'Tag: {dictionary[tag_id]}, Probability: {p}')

In [73]:
def compare_n_tags(n, model, dictionary, prob=0.0):
    """Aggregate ``compare_real_predicted`` over the questions labelled ``0 .. n-1``.

    :param n: number of questions to evaluate
    :param model: model forwarded to ``compare_real_predicted``
    :param dictionary: dictionary forwarded to ``compare_real_predicted``
    :param prob: weight threshold forwarded to ``compare_real_predicted``
    :return: tuple ``(sum of first components, sum of second components)``
    """
    per_question = [compare_real_predicted(i, model, dictionary, prob) for i in range(n)]
    correct_total = sum(pair[0] for pair in per_question)
    prediction_total = sum(pair[1] for pair in per_question)
    return correct_total, prediction_total

## Setup
### CSV files

In [18]:
# Location of the tag table; get_real_tags filters it on 'Id' and reads the 'Tag' column.
PATH_TAGS = '../data/stacksample/Tags.csv'
# NOTE(review): pandas' default NA parsing turns literal cell values such as "null" or "NA" into NaN -
# confirm whether any tag is affected; keep_default_na=False would keep them as strings.
tags_df = pd.read_csv(PATH_TAGS, encoding=ENCODING)

In [7]:
# Columns actually read from the questions file.
COLUMNS_QUESTION = ['Title', 'Body', 'Id']
QUESTIONS_PATH = '../data/stacksample/Questions.csv'
# Only the first N_ROWS questions are loaded.
N_ROWS = 100

csv_data = pd.read_csv(QUESTIONS_PATH, encoding=ENCODING, usecols=COLUMNS_QUESTION, nrows=N_ROWS)
# 'All' is the text that gets tokenized. Title and body are joined with a space: without a
# separator the last title word is glued to whatever starts the body, and once the tokenizer
# strips markup the two words become one token.
csv_data['All'] = csv_data['Title'].map(str) + ' ' + csv_data['Body']

### Widgets

In [76]:
# Model files offered for selection; file names containing 'npy' are left out of the list.
model_file_name = widgets.Dropdown(options=[f for f in get_files('../data/stacksample/models') if 'npy' not in f],
                                   description='Model file: ')
dict_file_name = widgets.Dropdown(options=get_files('../data/stacksample/dictionary'), description='Dictionary file: ')

# Slider 1 selects a single row label. Labels run from 0 to len(csv_data) - 1, so the upper bound
# must be one less than the row count (max=N_ROWS allowed a label that does not exist).
text_index_slider1 = widgets.IntSlider(description='Text index: ', value=0, min=0, max=max(len(csv_data) - 1, 0))
# Slider 2 is used as a question count, so it may go up to the number of rows actually loaded.
text_index_slider2 = widgets.IntSlider(description='Text index: ', value=0, min=0, max=len(csv_data))

# Threshold a predicted weight must exceed to be counted.
probability_slider = widgets.FloatSlider(description='Tag probability: ', value=0.0, min=0.0, max=1.0, step=0.05)

# Starts disabled; apply_parameters enables it.
analyse_btn = widgets.Button(description='Analyse model!', disabled=True)

def get_dictionary(file_name):
    """Load the dictionary stored under ``file_name`` in the dictionary folder.

    :param file_name: name of a file inside ``../data/stacksample/dictionary``
    :return: the object returned by ``Dictionary.load``
    """
    dictionary_path = f'../data/stacksample/dictionary/{file_name}'
    return Dictionary.load(dictionary_path)

def get_model(file_name):
    """Load the model stored under ``file_name`` in the models folder.

    :param file_name: name of a file inside ``../data/stacksample/models``
    :return: the object returned by ``TfidfModel.load``
    """
    # NOTE(review): every file goes through TfidfModel.load, whichever model file is selected -
    # confirm that only TF-IDF models are expected here.
    model_path = f'../data/stacksample/models/{file_name}'
    return TfidfModel.load(model_path)


def apply_parameters(b):
    """Button callback: load the selected model and dictionary for ``test_example``.

    The loaded objects are published as the module-level names ``model`` and
    ``corpus_dict``. The analyse button is enabled only when a model was
    actually loaded; only paths containing ``'tfidf'`` are supported.

    :param b: the widget that triggered the callback (unused)
    """
    # Without this declaration the assignments below create locals that vanish on return,
    # and test_example fails with NameError.
    global model, corpus_dict
    model_path = f'../data/stacksample/models/{model_file_name.value}'
    if 'tfidf' not in model_path:
        # Nothing can be loaded for this file, so keep the analysis disabled.
        logging.warning('Unsupported model file (only tfidf models can be loaded): %s', model_path)
        analyse_btn.disabled = True
        return
    model = TfidfModel.load(model_path)
    corpus_dict = Dictionary.load(f'../data/stacksample/dictionary/{dict_file_name.value}')
    analyse_btn.disabled = False


def test_example(b):
    """Button callback: display the question selected on ``text_index_slider1``.

    Uses the ``model`` and ``corpus_dict`` globals set by ``apply_parameters``.

    :param b: the widget that triggered the callback (unused)
    """
    display_results(text_index_slider1.value, model, corpus_dict)

In [77]:
# Bind each loader function to its dropdown; the resulting widgets expose the loader's
# return value through their `.result` attribute, which the following cells read.
corpus_dict_widget = interactive(get_dictionary, file_name=dict_file_name)
model_widget = interactive(get_model, file_name=model_file_name)
# Render both widgets.
display(corpus_dict_widget, model_widget)

In [67]:
def display_selected_results(index):
    """Display one question using the model and dictionary that are currently selected.

    :param index: row label of the question to display
    """
    # Read `.result` at call time. Passing fixed(model_widget.result) would freeze whatever value
    # the attribute held when this cell ran, so a later dropdown change would be ignored.
    display_results(index, model_widget.result, corpus_dict_widget.result)


interact_manual(display_selected_results, index=text_index_slider1)

<function __main__.display_results>

In [78]:
def compare_selected_n_tags(n, prob):
    """Run ``compare_n_tags`` with the model and dictionary that are currently selected.

    :param n: number of questions to evaluate
    :param prob: weight threshold forwarded to ``compare_n_tags``
    :return: the tuple returned by ``compare_n_tags``
    """
    # Read `.result` at call time. Passing fixed(model_widget.result) would freeze whatever value
    # the attribute held when this cell ran, so a later dropdown change would be ignored.
    return compare_n_tags(n, model_widget.result, corpus_dict_widget.result, prob)


interact_manual(compare_selected_n_tags, n=text_index_slider2, prob=probability_slider)

<function __main__.compare_n_tags>

In [None]:
def get_all_predicted_tags(model, dictionary):
    """Collect the predictions for every row of the module-level ``csv_data``.

    :param model: model forwarded to ``get_predicted_tags``
    :param dictionary: mapping from token id to token string
    :return: dict with two parallel lists: ``'Id'`` (question ids) and ``'Tags_p'``
             (for each question, a list of ``(token, weight)`` pairs)
    """
    question_ids = []
    tags_with_weight = []
    for row in range(len(csv_data)):
        predicted, question_id = get_predicted_tags(row, model, dictionary)
        tags_with_weight.append([(dictionary[tag_id], p) for tag_id, p in predicted])
        question_ids.append(question_id)
    return {'Id': question_ids, 'Tags_p': tags_with_weight}