In [6]:
# import everything as needed

import re 
import sys 
import os 

nb_dir = os.getcwd()
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from plotlib.loaders import *
from plotlib.plotters import *

%matplotlib inline

from phdconf import config 

# import math
# import string

In [7]:
queries = load_queries(config.AUS_TOPIC_PATH)

In [4]:
broad, specific = load_query_types(queries)

In [5]:
qrel_paths = [config.AUS_QREL_PATH, config.AUS_QREL_PATH]
index_names = ['flattened', 'flatstem']
rel_levels = [config.AUS_REL_LEVEL, config.AUS_REL_LEVEL]

In [6]:
start = 300
end = 3050
inc = 50 

dir_path = os.path.join(config.BASE_DIR,'dirichlet_prior')
model_name = '{0}-unigram_dir_mu_{1:.2f}.run'

In [6]:
dfs = load_1d_dfs(index_names, qrel_paths, dir_path, model_name, rel_levels, start, end, inc)
broad_dfs = load_1d_dfs(index_names, qrel_paths, dir_path, model_name, rel_levels, start, end, inc, filtered=broad)
specific_dfs = load_1d_dfs(index_names, qrel_paths, dir_path, model_name, rel_levels, start, end, inc, filtered=specific)

In [8]:
fig = plot_tune_1d_comp(['unstemmed', 'stemmed'], config.METRIC_NAMES, dfs, start, end, inc)

<Figure size 1152x720 with 5 Axes>

In [9]:
fig2 = plot_tune_1d_comp(['broad', 'broad-stem', 'specific', 'specific-stem'], config.METRIC_NAMES, broad_dfs + specific_dfs, start, end, inc)

<Figure size 1152x720 with 5 Axes>

In [6]:
query_dfs = load_1d_dfs(index_names, qrel_paths, dir_path, model_name, rel_levels, start, end, inc, per_query=True)

In [7]:
ind = int((2400 - start) / inc)
plot_diff(query_dfs[0][ind], query_dfs[1][ind], config.METRIC_NAMES, queries)

<Figure size 1152x720 with 5 Axes>

In [8]:
plot_diff_query_ind(query_dfs[0], query_dfs[1], ind, config.METRIC_NAMES, queries)

<Figure size 1152x720 with 1 Axes>

<Figure size 1152x720 with 1 Axes>

<Figure size 1152x720 with 1 Axes>

<Figure size 1152x720 with 1 Axes>

<Figure size 1152x720 with 1 Axes>

In [8]:
stopwords = load_stopwords(config.AUS_STOPWORD_PATH)

In [9]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
# nltk.download('wordnet')
    

stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [10]:
for topic_id, topic in queries.items():
    terms = set(re.sub('[^0-9a-zA-Z]+', ' ', topic['topic']).lower().split())
    query = {term for term in terms if term not in stopwords}
    print(topic_id, ['{0}-->{1}'.format(t, stemmer.stem(t)) for t in query])
    print(topic_id, ['{0}-->{1}'.format(t, lemmatizer.lemmatize(t)) for t in query])

1 ['liquidation-->liquid', 'money-->money', 'recovered-->recov', 'reinstating-->reinstat', 'regards-->regard']
1 ['liquidation-->liquidation', 'money-->money', 'recovered-->recovered', 'reinstating-->reinstating', 'regards-->regard']
2 ['writing-->write', 'settlement-->settlement', 'variation-->variat']
2 ['writing-->writing', 'settlement-->settlement', 'variation-->variation']
3 ['degree-->degre', 'control-->control', 'champerty-->champerti', 'requisite-->requisit', 'maintenance-->mainten']
3 ['degree-->degree', 'control-->control', 'champerty-->champerty', 'requisite-->requisite', 'maintenance-->maintenance']
4 ['boat-->boat', 'fees-->fee', 'agency-->agenc', 'effective-->effect', 'cause-->caus', 'sale-->sale']
4 ['boat-->boat', 'fees-->fee', 'agency-->agency', 'effective-->effective', 'cause-->cause', 'sale-->sale']
5 ['damages-->damag', 'reduced-->reduc', 'represents-->repres', 'betterment-->better', 'sum-->sum', 'according-->accord']
5 ['damages-->damage', 'reduced-->reduced', 'rep

In [3]:
for x in ['organised', 'organise', 'organisation', 'organising']:
    print(stemmer.stem(x))

organis
organis
organis
organis
