Skip to content

Commit

Permalink
work on upload parsed corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
interrogator committed Aug 19, 2019
1 parent c8a6509 commit 39a74db
Show file tree
Hide file tree
Showing 7 changed files with 180 additions and 87 deletions.
80 changes: 54 additions & 26 deletions buzzword/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
# where downloadable CSVs get stored
if not os.path.isdir("csv"):
os.makedirs("csv")
# where uploaded corpora are stored
if not os.path.isdir("uploads"):
os.makedirs("uploads")


def _get_layout():
Expand All @@ -32,41 +35,66 @@ def _get_layout():


def _make_explore_layout(slug, name):
"""
Simulate globals and generate layout for explore page
"""
corpus = CORPORA[slug]
SEARCHES = OrderedDict({name: corpus})
TABLES = OrderedDict({"initial": INITIAL_TABLES[slug]})
return _make_tabs(SEARCHES, TABLES, slug, **CONFIG)
searches = OrderedDict({name: corpus})
tables = OrderedDict({"initial": INITIAL_TABLES[slug]})
return _make_tabs(searches, tables, slug, **CONFIG)


def _populate_explore_layouts():
    """
    Build the explore layout for every corpus in CORPUS_META up front.

    Meant to be run at startup so that the layouts are already stored in
    LAYOUTS (keyed by slug) when an explore page is first requested.
    """
    for corpus_name, details in CORPUS_META.items():
        key = details["slug"]
        layout = _make_explore_layout(key, corpus_name)
        LAYOUTS[key] = layout


def _get_explore_layout(slug):
    """
    Return the explore layout for this slug, generating it on first request.

    The corpus name is the first CORPUS_META key whose entry carries this
    slug; when no entry matches, the slug itself is used as the name.
    Generated layouts are kept in LAYOUTS so each one is only built once.
    """
    # resolve the human-readable corpus name for the slug
    name = slug
    for corpus_name, meta in CORPUS_META.items():
        if meta["slug"] == slug:
            name = corpus_name or slug
            break
    # build and remember the default explore layout if we have not yet
    if slug not in LAYOUTS:
        LAYOUTS[slug] = _make_explore_layout(slug, name)
    return LAYOUTS[slug]


@app.callback(Output("page-content", "children"), [Input("url", "pathname")])
def _choose_correct_page(pathname):
"""
When the URL changes, get correct page and populate page-content with it
"""
pages = dict(
about=about.layout,
guide=guide.layout,
building=building.layout,
start=start.layout,
depgrep=depgrep.layout,
)
pathname = pathname
if pathname is None:
raise PreventUpdate
if pathname == "/about":
return about.layout
if pathname == "/guide":
return guide.layout
if pathname == "/building":
return building.layout
if pathname == "/depgrep":
return depgrep.layout
if pathname.startswith("/explore"):
if not pathname:
return start.layout
if pathname in pages:
return pages[pathname]
if pathname.startswith("explore"):
slug = pathname.rstrip("/").split("/")[-1]
# if corpus not found, redirect
if slug not in CORPORA:
pathname = "/"
else:
gen = (k for k, v in CORPUS_META.items() if v["slug"] == slug)
name = next(gen, None)
name = name or slug
if slug in LAYOUTS:
layout = LAYOUTS[slug]
else:
layout = _make_explore_layout(slug, name)
LAYOUTS[slug] = layout
# app.title = "buzzword: {}".format(name)
return layout
if pathname in {"", "/", "/start"}:
# app.title = "buzzword: home"
return start.layout
# find corpus name by slug
return _get_explore_layout(slug)
if pathname in {"", "/"}:
return start.layout
else:
return "404"
Expand Down
24 changes: 24 additions & 0 deletions buzzword/parts/assets/custom.css
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,27 @@ p, ul, ol {
padding-bottom: 10px !important;
padding-top: 10px !important;
}

/* "flasher" animation: the background starts red and fades to the inherited
   background colour. The same keyframes are declared once per vendor prefix,
   followed by the unprefixed standard form. */
@-webkit-keyframes flasher {
from { background-color: red; }
to { background-color: inherit; }
}
@-moz-keyframes flasher {
from { background-color: red; }
to { background-color: inherit; }
}
@-o-keyframes flasher {
from { background-color: red; }
to { background-color: inherit; }
}
@keyframes flasher {
from { background-color: red; }
to { background-color: inherit; }
}

/* Elements with class "flash" run the flasher animation on a 3 second cycle
   that repeats indefinitely. Prefixed properties come first so the standard
   `animation` declaration wins where it is supported. */
.flash {
-webkit-animation: flasher 3s infinite; /* Safari 4+ */
-moz-animation: flasher 3s infinite; /* Fx 5+ */
-o-animation: flasher 3s infinite; /* Opera 12+ */
animation: flasher 3s infinite; /* IE 10+ */
}
17 changes: 4 additions & 13 deletions buzzword/parts/nav.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,13 @@
("About", "https://buzzword.readthedocs.io/en/latest/about/"),
]

hrefs = [html.Li([html.A(name, target="_blank", href=url)]) for name, url in LINKS]

navbar = html.Div(
[
html.Img(
src="../assets/bolt.jpg", height=42, width=38, style=style.BLOCK_MIDDLE_35
),
html.Img(src="../assets/bolt.jpg", height=42, width=38, style=style.NAV_HEADER),
dcc.Link("buzzword", href="/", style=style.NAV_HEADER),
html.Div(
html.Ul(
[
html.Li([html.A(name, target="_blank", href=url)])
for name, url in LINKS
],
className="nav navbar-nav",
),
className="pull-right",
),
html.Div(html.Ul(hrefs, className="nav navbar-nav"), className="pull-right"),
],
className="navbar navbar-default navbar-static-top",
)
107 changes: 68 additions & 39 deletions buzzword/parts/start.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,34 @@
import base64
import os
import traceback
from datetime import date

import dash_core_components as dcc
import dash_html_components as html
from buzz.constants import SPACY_LANGUAGES
from buzz.corpus import Corpus
from buzzword.parts.main import app, CORPORA, INITIAL_TABLES, CORPUS_META, CONFIG
from buzzword.parts.strings import _slug_from_name
from buzzword.parts.strings import _slug_from_name, _make_description
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
from buzzword.parts.nav import navbar
from buzzword.parts import style


def _make_row(row_data, index=None):
def _make_row(row_data, index=None, upload=False):
"""
Make row for corpus table
"""
clas = "flash" if upload else "normal-row"
row = [html.Td(children=index)]
for j, value in enumerate(row_data):
if not j:
cell = html.Td(html.A(href=row_data[4], children=value))
cell = html.Td(html.A(href=row_data[4], children=value, className=clas))
elif j == 4:
hyper = html.A(href=value, children="ⓘ", target="_blank")
cell = html.Td(className="no-underline", children=hyper)
cell = html.Td(children=hyper, className=clas)
else:
cell = html.Td(children=value)
cell = html.Td(children=value, className=clas)
row.append(cell)
return html.Tr(row)

Expand Down Expand Up @@ -108,30 +110,43 @@ def _make_upload_parse_space():
)


def _store_corpus(contents, filenames, corpus_name):
def _store_corpus(contents, filenames, slug):
"""
From content and filenames, build a corpus and return the path to it
"""
extensions = set()
if not os.path.isdir("uploads"):
os.makedirs("uploads")
store_at = os.path.join(CONFIG["root"], "uploads", corpus_name)
corpus_size = 0
is_parsed = all(i.endswith(("conll", "conllu")) for i in filenames)
if is_parsed:
slug = slug + "-parsed"
store_at = os.path.join(CONFIG["root"], "uploads", slug)
os.makedirs(store_at)
for content, filename in zip(contents, filenames):
extensions.add(os.path.splitext(filename)[-1])
if len(extensions) > 1:
break
content_type, content_string = content.split(",")
content_type, content_string = content.split(",", 1)
decoded = base64.b64decode(content_string)
corpus_size += len(decoded)
outpath = os.path.join(store_at, filename)
with open(outpath, "wb") as fo:
fo.write(decoded)
if not len(extensions):
raise ValueError("No file extensions provided")
elif len(extensions) > 1:
raise ValueError("Multiple extensions provided: {}".format(extensions))
is_parsed = all(i.endswith(("conll", "conllu")) for i in filenames)
return store_at, is_parsed
return store_at, is_parsed, corpus_size


def _validate_input(contents, names, corpus_name, slug):
"""
Check that uploaded corpus-to-be is valid
"""
endings = set([os.path.splitext(i)[-1] for i in names])
if not endings:
return "File extension not provided."
if len(endings) > 1:
return "All uploaded files need to have the same extension."
allowed = {".conll", ".conllu", ".txt"}
if endings.pop() not in allowed:
allowed = ", ".join(allowed)
return "Uploaded file extension must be one of: {}".format(allowed)
up_dir_exists = os.path.isdir(os.path.join(CONFIG["root"], "uploads", slug))
if corpus_name in CORPUS_META or up_dir_exists:
return f"A corpus named '{corpus_name}' already exists. Try a different name."
return ""


@app.callback(
Expand All @@ -157,28 +172,37 @@ def _upload_files(n_clicks, contents, names, corpus_lang, corpus_name, table_row

if n_clicks is None:
raise PreventUpdate
msg = ""
try:
path, is_parsed = _store_corpus(contents, names, corpus_name)
corpus = Corpus(path)
if not is_parsed:

slug = _slug_from_name(corpus_name)
msg = _validate_input(contents, names, corpus_name, slug)

if msg:
return bool(msg), msg, table_rows

path, is_parsed, size = _store_corpus(contents, names, slug)
corpus = Corpus(path)
if not is_parsed:
try:
corpus = corpus.parse(cons_parser=None, language=corpus_lang)
except Exception as error:
msg = str(error)
raise
if not msg:
slug = _slug_from_name(corpus_name)
CORPORA[slug] = corpus.load()
CORPUS_META[corpus_name] = dict(slug=slug)
INITIAL_TABLES[slug] = CORPORA[slug].table(show="p", subcorpora="file")
except Exception as error:
msg = f"Problem when parsing the corpus: {str(error)}"
traceback.print_exc()
return bool(msg), msg, table_rows

CORPORA[slug] = corpus.load()
CORPUS_META[corpus_name] = dict(slug=slug)
INITIAL_TABLES[slug] = CORPORA[slug].table(show="p", subcorpora="file")
slug = _slug_from_name(corpus_name)
href = "/explore/{}".format(slug)
index = len(CORPUS_META)
date = date.today().strftime("%d.%m.%Y")
desc = "User-uploaded data"
desc = _make_description(names, size)
toks = len(CORPORA[slug])
row_data = [corpus_name, date, corpus_lang, desc, href, toks]
row = _make_row(row_data, index=index)
# get long name for language
long_lang = next(k for k, v in SPACY_LANGUAGES.items() if v == corpus_lang)
long_lang = long_lang.capitalize()
row_data = [corpus_name, date, long_lang, desc, href, toks]
row = _make_row(row_data, index=index, upload=True)
table_rows.append(row)
return bool(msg), msg, table_rows

Expand All @@ -194,7 +218,10 @@ def show_uploaded(contents, filenames):
"""
if not contents:
raise PreventUpdate
markdown = "* " + "\n* ".join([i for i in filenames])
markdown = "* " + "\n* ".join([i for i in filenames[:10]])
if len(filenames) > 10:
rest = len(filenames) - 10
markdown += f"\n* and {rest} more ..."
return dcc.Markdown(markdown)


Expand All @@ -217,10 +244,12 @@ def show_uploaded(contents, filenames):
)
uphead = html.H3("Upload data", style=style.VERTICAL_MARGINS)

upload_text = html.P(
# target of the "Creating corpora" link shown above the upload form
link = "https://buzzword.readthedocs.io/en/latest/building/"
md = (
"You can upload either CONLL-U files, or plaintext with optional annotations. "
"See 'Creating corpora' for an explanation of possible data formats."
"See [`Creating corpora`]({}) for more information.".format(link)
)
upload_text = dcc.Markdown(md)

upload = _make_upload_parse_space()

Expand Down
24 changes: 24 additions & 0 deletions buzzword/parts/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@
from buzz.constants import SHORT_TO_LONG_NAME


def _make_description(names, size):
    """
    Build the description string shown for a user-uploaded corpus

    Gives the formatted upload size, the number of files, and the first
    three filenames (followed by "..." when there are more than three).
    """
    n_files = len(names)
    shown = ", ".join(names[:3])
    ellipsis = "..." if n_files > 3 else ""
    suffix = "" if n_files == 1 else "s"
    template = "User-uploaded data, {}. {} file{}: {}"
    return template.format(_format_size(size), n_files, suffix, shown + ellipsis)


def _make_table_name(history):
"""
Generate a table name from its history
Expand All @@ -33,6 +45,18 @@ def _make_table_name(history):
return f"{basic} -- from search #{parent}"


def _format_size(size):
"""
Format size in bytes, kb, or mb
"""
if size < 1000:
return f"{size} bytes"
if size >= 1000000:
return f"{size/1000000:.2f} MB"
if size >= 1000:
return f"{size/1000:.2f} kB"


def _make_search_name(history, size):
"""
Generate a search name from its history
Expand Down
11 changes: 4 additions & 7 deletions buzzword/parts/style.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,14 @@
"paddingLeft": "5px",
"paddingRight": "5px",
}
INLINE = {
"display": "inline-block",
"paddingLeft": "5px",
"paddingRight": "5px",
}
INLINE = {"display": "inline-block", "paddingLeft": "5px", "paddingRight": "5px"}
NAV_HEADER = {
"display": "inline-block",
"verticalAlign": "middle",
"color": "#555555",
"text-decoration": "none",
"font-size": 32,
"textDecoration": "none",
"fontSize": 32,
"paddingTop": "12px",
}
MARGIN_5_MONO = {"marginLeft": 5, "marginRight": 5, "fontFamily": "monospace"}
BOLD_DARK = {"fontWeight": "bold", "color": "#555555"}
Expand Down

0 comments on commit 39a74db

Please sign in to comment.