Skip to content

Commit

Permalink
Merge pull request #24 from l0rb/master
Browse files Browse the repository at this point in the history
storing corpora meta-data in the db
  • Loading branch information
interrogator committed Dec 2, 2019
2 parents be80744 + fce6289 commit 1916ca2
Show file tree
Hide file tree
Showing 11 changed files with 213 additions and 40 deletions.
10 changes: 10 additions & 0 deletions accounts/urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from django.urls import path

from accounts import views as accounts_views


# URL routes for the accounts app: each path maps to a view in accounts.views
# and is given a name ('signup', 'logout', 'login') for reverse lookups.
urlpatterns = [
path('signup/', accounts_views.signup, name='signup'),
path('logout/', accounts_views.logout_view, name='logout'),
path('login/', accounts_views.login_view, name='login'),
]
5 changes: 1 addition & 4 deletions buzzword/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,10 @@
from django.conf import settings
from django.conf.urls.static import static

from accounts import views as accounts_views

urlpatterns = [
path('', include("start.urls")),
path('signup/', accounts_views.signup, name='signup'),
path('logout/', accounts_views.logout_view, name='logout'),
path('login/', accounts_views.login_view, name='login'),
path('', include("accounts.urls")),
path('explore/', include('explore.urls')),
path('admin/', admin.site.urls),
path('django_plotly_dash/', include('django_plotly_dash.urls')),
Expand Down
4 changes: 2 additions & 2 deletions corpora.json.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"language": "English",
"path": "dtrt/do-the-right-thing-parsed",
"desc": "Script of Spike Lee's 1989 drama, with various speaker and scene annotations",
"len": 17713,
"length": 17713,
"drop_columns": ["voice_over", "camera_angle"],
"disabled": false,
"date": "1989",
Expand All @@ -16,7 +16,7 @@
"language": "English",
"path": "unabomber/manifesto-parsed",
"desc": "Text of the Unabomber's manifesto",
"len": 37347,
"length": 37347,
"add_governor": false,
"disabled": true,
"date": "1995",
Expand Down
38 changes: 38 additions & 0 deletions explore/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Generated by Django 2.2.7 on 2019-12-02 13:18

from django.db import migrations, models
import django.db.models.deletion


# Initial migration for the explore app: creates the Corpus and DropColumn models.
class Migration(migrations.Migration):

initial = True

dependencies = [
]

operations = [
# Corpus: one row of metadata per corpus.
migrations.CreateModel(
name='Corpus',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('slug', models.SlugField(max_length=255)),
('language', models.CharField(max_length=255)),
('path', models.TextField()),
('desc', models.TextField(default='')),
# 'len' is renamed to 'length' by 0003_auto_20191202_2136.
('len', models.BigIntegerField()),
# NOTE: 'diabled' is a misspelling of 'disabled'; it is left as generated
# here and renamed by the RenameField in 0002_auto_20191202_1431.
('diabled', models.BooleanField(default=False)),
('date', models.DateField()),
('load', models.BooleanField(default=True)),
('url', models.URLField(max_length=255)),
],
),
# DropColumn: one row per column name attached to a Corpus; rows are
# removed together with their corpus (on_delete=CASCADE).
migrations.CreateModel(
name='DropColumn',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('column_name', models.CharField(max_length=255)),
('corpus', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='explore.Corpus')),
],
),
]
49 changes: 49 additions & 0 deletions explore/migrations/0002_auto_20191202_1431.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Generated by Django 2.2.7 on 2019-12-02 14:31

from django.db import migrations, models


# Second migration for the explore app's Corpus model: fixes the misspelled
# 'diabled' field, adds 'add_governor' and 'name', relaxes several fields to
# allow NULL, and makes 'slug' unique.
class Migration(migrations.Migration):

dependencies = [
('explore', '0001_initial'),
]

operations = [
# Correct the field-name typo introduced in 0001_initial.
migrations.RenameField(
model_name='corpus',
old_name='diabled',
new_name='disabled',
),
# Nullable boolean, so "not set" can be told apart from False.
migrations.AddField(
model_name='corpus',
name='add_governor',
field=models.BooleanField(null=True),
),
# 'name' has no default on the model; default='name' with
# preserve_default=False is only used to fill rows that already exist.
migrations.AddField(
model_name='corpus',
name='name',
field=models.CharField(default='name', max_length=255),
preserve_default=False,
),
# date, len and url become optional (NULL allowed).
migrations.AlterField(
model_name='corpus',
name='date',
field=models.DateField(null=True),
),
migrations.AlterField(
model_name='corpus',
name='len',
field=models.BigIntegerField(null=True),
),
# slug must now be unique across corpora.
migrations.AlterField(
model_name='corpus',
name='slug',
field=models.SlugField(max_length=255, unique=True),
),
migrations.AlterField(
model_name='corpus',
name='url',
field=models.URLField(max_length=255, null=True),
),
]
18 changes: 18 additions & 0 deletions explore/migrations/0003_auto_20191202_2136.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 2.2.7 on 2019-12-02 21:36

from django.db import migrations


# Third migration for the explore app: renames Corpus.len to Corpus.length.
class Migration(migrations.Migration):

dependencies = [
('explore', '0002_auto_20191202_1431'),
]

operations = [
# Pure rename of the existing field; no other field options are touched.
migrations.RenameField(
model_name='corpus',
old_name='len',
new_name='length',
),
]
58 changes: 58 additions & 0 deletions explore/models.py
Original file line number Diff line number Diff line change
@@ -1 +1,59 @@
import datetime
import logging

from django.db import models

from explorer.parts.strings import _slug_from_name

class Corpus(models.Model):
    """
    Metadata for a single corpus, stored in the database

    Rows are normally created via ``from_json``. The names of columns to drop
    for a corpus are stored in the related ``DropColumn`` model.
    """

    # this can't be null because a name needs to exist
    slug = models.SlugField(max_length=255, unique=True)
    name = models.CharField(max_length=255)
    # probably turn this into a model later
    language = models.CharField(max_length=255)
    path = models.TextField()
    desc = models.TextField(default="")
    length = models.BigIntegerField(null=True)
    # nullable: None means "not configured for this corpus"
    add_governor = models.BooleanField(null=True)
    # drop_columns is a list in the JSON, so it is kept as DropColumn rows
    disabled = models.BooleanField(default=False)
    date = models.DateField(null=True)
    load = models.BooleanField(default=True)
    url = models.URLField(max_length=255, null=True)

    @classmethod
    def from_json(cls, jsondata, corpus_name):
        """
        Return the Corpus for one corpora.json entry, creating it if needed

        jsondata: dict of settings for a single corpus
        corpus_name: stored as ``name``, and used to derive the slug when
            jsondata has no "slug" key

        If a row with the same slug already exists it is returned unchanged;
        nothing from jsondata is written to it.

        Raises ValueError when "path" or "language" is missing/empty, or when
        "date" does not match the '%Y' format.
        """
        slug = jsondata.get("slug", _slug_from_name(corpus_name))
        try:
            return cls.objects.get(slug=slug)
        except cls.DoesNotExist:
            pass

        language = jsondata.get("language")
        path = jsondata.get("path")

        # Report every missing required key before failing, so that a broken
        # entry can be fixed in one go.
        has_error = False
        if not path:
            has_error = True
            logging.error('no path = no good')
        if not language:
            has_error = True
            logging.error('language missing')
        if has_error:
            raise ValueError('some problem with loading corpus from json. check error log')

        # A missing date is stored as 1 January 1900.
        date = datetime.datetime.strptime(jsondata.get("date", '1900'), '%Y').date()

        corp = cls(
            name=corpus_name,
            slug=slug,
            language=language,
            path=path,
            desc=jsondata.get("desc", ""),
            length=jsondata.get("length"),
            # carry the per-corpus setting over; None when the key is absent
            add_governor=jsondata.get("add_governor"),
            disabled=jsondata.get("disabled", False),
            date=date,
            load=jsondata.get("load", True),
            url=jsondata.get("url"),
        )
        corp.save()

        # The corpus must be saved first so the foreign key has an id to point at.
        for drop_col in jsondata.get('drop_columns', []):
            DropColumn.objects.create(corpus=corp, column_name=drop_col)

        return corp

# A single column name attached to a Corpus. Corpus.from_json creates one row
# per entry of the "drop_columns" list in the corpus' JSON settings.
class DropColumn(models.Model):
# deleting the corpus deletes its DropColumn rows as well
corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE)
column_name = models.CharField(max_length=255)

11 changes: 8 additions & 3 deletions explorer/parts/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from buzz.corpus import Corpus

from .strings import _capitalize_first, _downloadable_name

from explore.models import Corpus as CorpusModel

def _get_specs_and_corpus(search_from, searches, corpora, slug):
"""
Expand Down Expand Up @@ -227,8 +227,7 @@ def register_callbacks():
"""
from . import callbacks


def _get_corpora_meta(corpora_file):
def _get_corpora_json_contents(corpora_file):
"""
Get the contents of corpora.json, or an empty dict
"""
Expand All @@ -239,6 +238,12 @@ def _get_corpora_meta(corpora_file):
with open(corpora_file, "r") as fo:
return json.loads(fo.read())

def _get_corpora_meta(corpora_file):
    """
    Get a list of Corpus model instances, one per entry in corpora.json
    """
    json_contents = _get_corpora_json_contents(corpora_file)
    return [
        CorpusModel.from_json(settings_for_corpus, name)
        for name, settings_for_corpus in json_contents.items()
    ]

def _special_search(df, col, search_string, skip):
"""
Expand Down
45 changes: 19 additions & 26 deletions explorer/parts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,17 @@
LAYOUTS = dict()


def _get_corpus_config(local_conf, global_conf, name):
def _get_corpus_config(corpus, global_conf):
"""
Return global conf plus individual settings for corpus
"""
conf = {**global_conf}
settings = {"max_dataset_rows", "drop_columns", "add_governor", "load", "slug"}
for setting in settings:
loc = local_conf.get(setting)
loc = getattr(corpus, setting, None)
if loc is not None:
conf[setting] = loc
else:
if setting == "slug":
conf[setting] = _slug_from_name(name)
conf["corpus_name"] = name
conf["corpus_name"] = corpus.name
return conf


Expand All @@ -47,26 +44,24 @@ def _get_corpora(corpus_meta):
corpora = dict()
tables = dict()
corpora_config = dict()
for i, (corpus_name, metadata) in enumerate(corpus_meta.items(), start=1):
if metadata.get("disabled"):
print("Skipping corpus because it is disabled: {}".format(corpus_name))
for corpus in corpus_meta:
if corpus.disabled:
print("Skipping corpus because it is disabled: {}".format(corpus.name))
continue
slug = metadata.get("slug", _slug_from_name(corpus_name))
corpus = Corpus(metadata["path"])
conf = _get_corpus_config(metadata, GLOBAL_CONFIG, corpus_name)
buzz_corpus = Corpus(corpus.path)
conf = _get_corpus_config(corpus, GLOBAL_CONFIG)
if conf["load"]:
print("Loading corpus into memory: {} ...".format(corpus_name))
corpus = corpus.load(add_governor=conf["add_governor"])
corpus = _preprocess_corpus(corpus, **conf)
print("Loading corpus into memory: {} ...".format(corpus.name))
buzz_corpus = buzz_corpus.load(add_governor=conf["add_governor"])
buzz_corpus = _preprocess_corpus(buzz_corpus, **conf)
else:
print("NOT loading corpus into memory: {} ...".format(corpus_name))
initial_table = corpus.table(show="p", subcorpora="file")
corpora[slug] = corpus
tables[slug] = initial_table
corpora_config[slug] = conf
print("NOT loading corpus into memory: {} ...".format(corpus.name))
initial_table = buzz_corpus.table(show="p", subcorpora="file")
corpora[corpus.slug] = buzz_corpus
tables[corpus.slug] = initial_table
corpora_config[corpus.slug] = conf
return corpora, tables, corpora_config


CORPUS_META = _get_corpora_meta(GLOBAL_CONFIG.get("corpora_file"))

CORPORA, INITIAL_TABLES, CORPORA_CONFIGS = _get_corpora(CORPUS_META)
Expand Down Expand Up @@ -98,8 +93,6 @@ def load_layout(slug, set_and_register=True):
# before the pages are visited. comes at expense of some memory,
# but the app should obviously be able to handle all datasets in use
if GLOBAL_CONFIG["load_layouts"]:
for corpus_name, metadata in CORPUS_META.items():
if metadata.get("disabled"):
continue
slug = metadata.get("slug", _slug_from_name(corpus_name))
load_layout(slug, set_and_register=False)
for corpus in CORPUS_META:
if not corpus.disabled:
load_layout(corpus.slug, set_and_register=False)
8 changes: 5 additions & 3 deletions start/templates/start/start.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
On this page will be a navbar, introduction, table of corpora, and potentially the upload/parse space
</p>
<h2>Corpora</h2>
<a href="explore/do-the-right-thing">Do the right thing</a>
<br>
<a href="explore/manifesto">Unabomber manifesto</a>

{% for corpus in corpora %}
<a href="explore/{{ corpus.slug }}">{{ corpus.name }}</a>
<br>
{% endfor %}

{% endblock %}

7 changes: 5 additions & 2 deletions start/views.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from django.http import HttpResponse
from django.shortcuts import render

import explore.models

def start(request):
return render(request, 'start/start.html')
context = {
'corpora': explore.models.Corpus.objects.filter(disabled=False,load=True)
}
return render(request, 'start/start.html', context)

0 comments on commit 1916ca2

Please sign in to comment.