Merge pull request #24 from l0rb/master

storing corpora meta-data in the db
interrogator · Dec 2, 2019 · 1916ca2 · 1916ca2
2 parents be80744 + fce6289
commit 1916ca2
Show file tree

Hide file tree

Showing 11 changed files with 213 additions and 40 deletions.
diff --git a/accounts/urls.py b/accounts/urls.py
@@ -0,0 +1,10 @@
+from django.urls import path
+
+from accounts import views as accounts_views
+
+
+# Account-related routes (signup, logout, login). Each route is named so it
+# can be looked up by name instead of by hard-coded path.
+urlpatterns = [
+    path('signup/', accounts_views.signup, name='signup'),
+    path('logout/', accounts_views.logout_view, name='logout'),
+    path('login/', accounts_views.login_view, name='login'),
+]
diff --git a/buzzword/urls.py b/buzzword/urls.py
@@ -3,13 +3,10 @@
 from django.conf import settings
 from django.conf.urls.static import static
 
-from accounts import views as accounts_views
 
 urlpatterns = [
     path('', include("start.urls")),
-    path('signup/', accounts_views.signup, name='signup'),
-    path('logout/', accounts_views.logout_view, name='logout'),
-    path('login/', accounts_views.login_view, name='login'),
+    path('', include("accounts.urls")),
     path('explore/', include('explore.urls')),
     path('admin/', admin.site.urls),
     path('django_plotly_dash/', include('django_plotly_dash.urls')),

diff --git a/corpora.json.example b/corpora.json.example
@@ -4,7 +4,7 @@
     "language": "English",
     "path": "dtrt/do-the-right-thing-parsed",
     "desc": "Script of Spike Lee's 1989 drama, with various speaker and scene annotations",
-    "len": 17713,
+    "length": 17713,
     "drop_columns": ["voice_over", "camera_angle"],
     "disabled": false,
     "date": "1989",
@@ -16,7 +16,7 @@
     "language": "English",
     "path": "unabomber/manifesto-parsed",
     "desc": "Text of the Unabomber's manifesto",
-    "len": 37347,
+    "length": 37347,
     "add_governor": false,
     "disabled": true,
     "date": "1995",

diff --git a/explore/migrations/0001_initial.py b/explore/migrations/0001_initial.py
@@ -0,0 +1,38 @@
+# Generated by Django 2.2.7 on 2019-12-02 13:18
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    # First migration of the explore app: creates the Corpus model and the
+    # DropColumn model, whose rows each reference one Corpus.
+
+    initial = True
+
+    dependencies = [
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Corpus',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('slug', models.SlugField(max_length=255)),
+                ('language', models.CharField(max_length=255)),
+                ('path', models.TextField()),
+                ('desc', models.TextField(default='')),
+                # renamed to 'length' by migration 0003
+                ('len', models.BigIntegerField()),
+                # misspelling of 'disabled'; renamed by migration 0002
+                ('diabled', models.BooleanField(default=False)),
+                ('date', models.DateField()),
+                ('load', models.BooleanField(default=True)),
+                ('url', models.URLField(max_length=255)),
+            ],
+        ),
+        migrations.CreateModel(
+            name='DropColumn',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('column_name', models.CharField(max_length=255)),
+                # CASCADE: deleting a Corpus also deletes its DropColumn rows
+                ('corpus', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='explore.Corpus')),
+            ],
+        ),
+    ]
diff --git a/explore/migrations/0002_auto_20191202_1431.py b/explore/migrations/0002_auto_20191202_1431.py
@@ -0,0 +1,49 @@
+# Generated by Django 2.2.7 on 2019-12-02 14:31
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Second explore migration: fixes the misspelled 'diabled' field name,
+    # adds 'add_governor' and 'name', makes 'slug' unique, and allows NULL
+    # for 'date', 'len' and 'url'.
+
+    dependencies = [
+        ('explore', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='corpus',
+            old_name='diabled',
+            new_name='disabled',
+        ),
+        migrations.AddField(
+            model_name='corpus',
+            name='add_governor',
+            field=models.BooleanField(null=True),
+        ),
+        # 'name' has no default on the model; the placeholder below only
+        # fills rows that already exist (preserve_default=False)
+        migrations.AddField(
+            model_name='corpus',
+            name='name',
+            field=models.CharField(default='name', max_length=255),
+            preserve_default=False,
+        ),
+        migrations.AlterField(
+            model_name='corpus',
+            name='date',
+            field=models.DateField(null=True),
+        ),
+        migrations.AlterField(
+            model_name='corpus',
+            name='len',
+            field=models.BigIntegerField(null=True),
+        ),
+        migrations.AlterField(
+            model_name='corpus',
+            name='slug',
+            field=models.SlugField(max_length=255, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='corpus',
+            name='url',
+            field=models.URLField(max_length=255, null=True),
+        ),
+    ]
diff --git a/explore/migrations/0003_auto_20191202_2136.py b/explore/migrations/0003_auto_20191202_2136.py
@@ -0,0 +1,18 @@
+# Generated by Django 2.2.7 on 2019-12-02 21:36
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    # Third explore migration: renames Corpus.len to Corpus.length.
+
+    dependencies = [
+        ('explore', '0002_auto_20191202_1431'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='corpus',
+            old_name='len',
+            new_name='length',
+        ),
+    ]
diff --git a/explore/models.py b/explore/models.py
@@ -1 +1,59 @@
+import datetime
 from django.db import models
+
+from explorer.parts.strings import _slug_from_name
+
+class Corpus(models.Model):
+    """
+    Metadata for one corpus, as read from its entry in corpora.json
+    """
+    slug = models.SlugField(max_length=255, unique=True) # this can't be null because a name needs to exist
+    name = models.CharField(max_length=255)
+    language = models.CharField(max_length=255) # probably turn this into a model later
+    path = models.TextField()
+    desc = models.TextField(default="")
+    length = models.BigIntegerField(null=True)
+    add_governor = models.BooleanField(null=True)
+    #drop_columns = array -> needs to be relation
+    disabled = models.BooleanField(default=False)
+    date = models.DateField(null=True)
+    load = models.BooleanField(default=True)
+    url = models.URLField(max_length=255, null=True)
+
+    @classmethod
+    def from_json(cls, jsondata, corpus_name):
+        """
+        Return the Corpus for one corpora.json entry, creating it if needed
+
+        jsondata: dict of settings for the corpus
+        corpus_name: name of the corpus; also the source of the slug when
+            jsondata has no "slug"
+
+        If a row with the slug already exists it is returned untouched, so
+        later edits to the json entry are not picked up. Otherwise the new
+        row is saved along with one DropColumn per "drop_columns" entry.
+
+        Raises Exception when "path" or "language" is missing or empty.
+        """
+        # local import: this module does not import logging at the top
+        import logging
+
+        slug = jsondata.get("slug", _slug_from_name(corpus_name))
+        try:
+            return cls.objects.get(slug=slug)
+        except cls.DoesNotExist:
+            pass
+        language = jsondata.get("language")
+        path = jsondata.get("path")
+        desc = jsondata.get("desc", "")
+        length = jsondata.get("length")
+        # None when the entry does not set it; the field is nullable
+        add_governor = jsondata.get("add_governor")
+        disabled = jsondata.get("disabled", False)
+        # "date" is parsed as a four-digit year and stored as 1 January of it
+        date = datetime.datetime.strptime(jsondata.get("date", '1900'), '%Y').date()
+        load = jsondata.get("load", True)
+        url = jsondata.get("url")
+
+        # log every missing field before raising, not only the first one
+        has_error = False
+        if not path:
+            has_error = True
+            logging.error('no path = no good')
+        if not language:
+            has_error = True
+            logging.error('language missing')
+        if has_error:
+            raise Exception('some problem with loading corpus from json. check error log')
+
+        corp = cls(
+            name=corpus_name,
+            slug=slug,
+            language=language,
+            path=path,
+            desc=desc,
+            length=length,
+            add_governor=add_governor,
+            disabled=disabled,
+            date=date,
+            load=load,
+            url=url,
+        )
+        corp.save()
+
+        for drop_col in jsondata.get('drop_columns', []):
+            DropColumn(corpus=corp, column_name=drop_col).save()
+
+        return corp
+
+class DropColumn(models.Model):
+    """
+    One column name attached to a corpus; removed together with that corpus
+    """
+    corpus = models.ForeignKey(
+        Corpus,
+        on_delete=models.CASCADE,
+    )
+    column_name = models.CharField(
+        max_length=255,
+    )
+
diff --git a/explorer/parts/helpers.py b/explorer/parts/helpers.py
@@ -10,7 +10,7 @@
 from buzz.corpus import Corpus
 
 from .strings import _capitalize_first, _downloadable_name
-
+from explore.models import Corpus as CorpusModel
 
 def _get_specs_and_corpus(search_from, searches, corpora, slug):
     """
@@ -227,8 +227,7 @@ def register_callbacks():
     """
     from . import callbacks
 
-
-def _get_corpora_meta(corpora_file):
+def _get_corpora_json_contents(corpora_file):
     """
     Get the contents of corpora.json, or an empty dict
     """
@@ -239,6 +238,12 @@ def _get_corpora_meta(corpora_file):
     with open(corpora_file, "r") as fo:
         return json.loads(fo.read())
 
+def _get_corpora_meta(corpora_file):
+    """
+    Get a list of Corpus model instances, one per entry in corpora.json
+    """
+    entries = _get_corpora_json_contents(corpora_file)
+    return [CorpusModel.from_json(meta, name) for name, meta in entries.items()]
 
 def _special_search(df, col, search_string, skip):
     """

diff --git a/explorer/parts/main.py b/explorer/parts/main.py
@@ -21,20 +21,17 @@
 LAYOUTS = dict()
 
 
-def _get_corpus_config(local_conf, global_conf, name):
+def _get_corpus_config(corpus, global_conf):
     """
     Return global conf plus individual settings for corpus
+
+    corpus is a Corpus model instance (or any object exposing the same
+    attributes). Attributes that are missing or None leave the value from
+    global_conf in place. The result also carries "corpus_name".
     """
     conf = {**global_conf}
     settings = {"max_dataset_rows", "drop_columns", "add_governor", "load", "slug"}
     for setting in settings:
-        loc = local_conf.get(setting)
+        loc = getattr(corpus, setting, None)
         if loc is not None:
             conf[setting] = loc
-        else:
-            if setting == "slug":
-                conf[setting] = _slug_from_name(name)
-    conf["corpus_name"] = name
+    # the model has no drop_columns field; the names are stored as DropColumn
+    # rows, so collect them through the reverse relation instead
+    drop_rows = getattr(corpus, "dropcolumn_set", None)
+    if drop_rows is not None:
+        drop_names = list(drop_rows.values_list("column_name", flat=True))
+        # no stored names: keep whatever global_conf provides
+        if drop_names:
+            conf["drop_columns"] = drop_names
+    conf["corpus_name"] = corpus.name
     return conf
 
 
@@ -47,26 +44,24 @@ def _get_corpora(corpus_meta):
     corpora = dict()
     tables = dict()
     corpora_config = dict()
-    for i, (corpus_name, metadata) in enumerate(corpus_meta.items(), start=1):
-        if metadata.get("disabled"):
-            print("Skipping corpus because it is disabled: {}".format(corpus_name))
+    for corpus in corpus_meta:
+        if corpus.disabled:
+            print("Skipping corpus because it is disabled: {}".format(corpus.name))
             continue
-        slug = metadata.get("slug", _slug_from_name(corpus_name))
-        corpus = Corpus(metadata["path"])
-        conf = _get_corpus_config(metadata, GLOBAL_CONFIG, corpus_name)
+        buzz_corpus = Corpus(corpus.path)
+        conf = _get_corpus_config(corpus, GLOBAL_CONFIG)
         if conf["load"]:
-            print("Loading corpus into memory: {} ...".format(corpus_name))
-            corpus = corpus.load(add_governor=conf["add_governor"])
-            corpus = _preprocess_corpus(corpus, **conf)
+            print("Loading corpus into memory: {} ...".format(corpus.name))
+            buzz_corpus = buzz_corpus.load(add_governor=conf["add_governor"])
+            buzz_corpus = _preprocess_corpus(buzz_corpus, **conf)
         else:
-            print("NOT loading corpus into memory: {} ...".format(corpus_name))
-        initial_table = corpus.table(show="p", subcorpora="file")
-        corpora[slug] = corpus
-        tables[slug] = initial_table
-        corpora_config[slug] = conf
+            print("NOT loading corpus into memory: {} ...".format(corpus.name))
+        initial_table = buzz_corpus.table(show="p", subcorpora="file")
+        corpora[corpus.slug] = buzz_corpus
+        tables[corpus.slug] = initial_table
+        corpora_config[corpus.slug] = conf
     return corpora, tables, corpora_config
 
-
 CORPUS_META = _get_corpora_meta(GLOBAL_CONFIG.get("corpora_file"))
 
 CORPORA, INITIAL_TABLES, CORPORA_CONFIGS = _get_corpora(CORPUS_META)
@@ -98,8 +93,6 @@ def load_layout(slug, set_and_register=True):
 # before the pages are visited. comes at expense of some memory,
 # but the app should obviously be able to handle all datasets in use
 if GLOBAL_CONFIG["load_layouts"]:
-    for corpus_name, metadata in CORPUS_META.items():
-        if metadata.get("disabled"):
-            continue
-        slug = metadata.get("slug", _slug_from_name(corpus_name))
-        load_layout(slug, set_and_register=False)
+    for corpus in CORPUS_META:
+        if not corpus.disabled:
+            load_layout(corpus.slug, set_and_register=False)
diff --git a/start/templates/start/start.html b/start/templates/start/start.html
@@ -9,9 +9,11 @@
     On this page will be a navbar, introduction, table of corpora, and potentially the upload/parse space
 </p>
 <h2>Corpora</h2>
-<a href="explore/do-the-right-thing">Do the right thing</a>
-<br>
-<a href="explore/manifesto">Unabomber manifesto</a>
+
+{% for corpus in corpora %}
+    <a href="explore/{{ corpus.slug }}">{{ corpus.name }}</a>
+    <br>
+{% endfor %}
 
 {% endblock %}
 
diff --git a/start/views.py b/start/views.py
@@ -1,6 +1,9 @@
 from django.http import HttpResponse
 from django.shortcuts import render
-
+import explore.models
 
 def start(request):
-    return render(request, 'start/start.html')
+    """
+    Render the start page, listing corpora that are enabled and set to load
+    """
+    visible = explore.models.Corpus.objects.filter(disabled=False, load=True)
+    return render(request, 'start/start.html', {'corpora': visible})