Lepszy import stron/numeru/tomu

iplweb · May 7, 2020 · 84b7483
1 parent c6969b2
commit 84b7483
Show file tree

Hide file tree

Showing 5 changed files with 34 additions and 10 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -2,6 +2,12 @@
 Historia zmian
 ==============
 
+202005.36
+---------
+
+* poprawki importu rekordów z plików DBF oraz procedur wycinających
+  dane na temat numeru i tomu (#845)
+
 202004.35
 ---------
 

diff --git a/src/bpp/models/abstract.py b/src/bpp/models/abstract.py
@@ -719,8 +719,8 @@ def ret(res):
 parsed_informacje_regex = re.compile(
     r"(\[online\])?\s*"
     r"(?P<rok>\d\d+)"
-    r"(\s*(vol|t|r|bd)\.*\s*(?P<tom>\d+))?"
-    r"(\s*(iss|nr|z|h)\.*\s*(?P<numer>\d+\w*(\/\d*\w*)?))?",
+    r"(\s*(vol|t|r|bd)\.*\s*\[?(?P<tom>\d+)\]?)?"
+    r"(\s*(iss|nr|z|h|suppl|supl)\.*\s*(?P<numer>\d+\w*(\/\d*\w*)?))?",
     flags=re.IGNORECASE,
 )
 
@@ -732,11 +732,6 @@ def parse_informacje_as_dict(
     if not informacje:
         return {}
 
-    # matches = re.search(parsed_informacje_regex, informacje)
-    # if matches:
-    #     return matches.groupdict()
-    # return {}
-
     p = parsed_informacje_regex.search(informacje)
     if p is not None:
         return p.groupdict()

diff --git a/src/bpp/tests/test_models/test_abstract.py b/src/bpp/tests/test_models/test_abstract.py
@@ -378,6 +378,8 @@ def test_eksport_pbn_open_access(wydawnictwo_zwarte, openaccess_data):
         ("1960 T. 8 nr 2", "1960", "8", "2"),
         ("1960 T.8nr2", "1960", "8", "2"),
         ("1960 T.8 nr 2", "1960", "8", "2"),
+        ("2018 Vol.77 suppl.2", "2018", "77", "2"),
+        ("2020 T. [59] supl.", "2020", "59", None),
     ],
 )
 def test_parse_informacje(input, exp_rok, exp_tom, exp_nr):

diff --git a/src/import_dbf/Makefile b/src/import_dbf/Makefile
@@ -32,16 +32,20 @@ importuj-dyscypliny:
 	$(MANAGE) importuj_dyscypliny -v0 2019 "/Volumes/Dane zaszyfrowane/UMWroclaw/bpp-assets/dyscypliny/2019.xlsx"
 	$(MANAGE) importuj_dyscypliny -v0 2020 "/Volumes/Dane zaszyfrowane/UMWroclaw/bpp-assets/dyscypliny/2020.xlsx"
 
-integruj-dbf-publikacje: get-assets
+integruj-dbf-publikacje-pre:
 	$(MANAGE) integruj_dbf --enable-charakter-kbn-jezyk --charaktery-enrichment-xls $(ASSETS_DIR)/bg-umw/charaktery_formalne.xlsx
 	$(MANAGE) integruj_dbf --enable-zrodlo
 	$(MANAGE) rozszerz_skroty_zrodel $(ASSETS_DIR)/bg-umw/zrodla.xlsx
 	$(PG_DUMP) > pre-integruj-dbf-enable-publikacja.sql
+
+integruj-dbf-publikacje-post:
 	$(MANAGE) integruj_dbf --enable-publikacja > integruj-dbf-publikacja-log.txt
 	$(MANAGE) integruj_dbf --enable-zatwierdz-podwojne-przypisania
 	$(MANAGE) integruj_dbf --enable-b-a
 	$(MANAGE) integruj_dbf --enable-przypisz-jednostki
 
+integruj-dbf-publikacje: get-assets integruj-dbf-publikacje-pre integruj-dbf-publikacje-post
+
 przypisz-dyscypliny:
 	$(MANAGE) przypisz_dyscypliny -v0 2017 --ustawiaj-pierwsza-gdy-dwie --disable-cache
 	$(MANAGE) przypisz_dyscypliny -v0 2018 --ustawiaj-pierwsza-gdy-dwie --disable-cache
@@ -100,6 +104,8 @@ restore-import-dbf-state: dropdb-createdb
 restore-pre-integruj-publikacje-state: dropdb-createdb
 	cat pre-integruj-dbf-enable-publikacja.sql | $(PSQL)
 
+restart-integruj-publikacje: restore-pre-integruj-publikacje-state integruj-dbf-publikacje-post
+
 integruj-liste-wydawcow: get-assets
 	$(MANAGE) import_wydawca_poziom $(ASSETS_DIR)/poziomy-wydawcow-2017-2020.xlsx
 	$(MANAGE) import_wydawca_alias $(ASSETS_DIR)/bg-umw/aliasy_wydawcow.xlsx

diff --git a/src/import_dbf/util.py b/src/import_dbf/util.py
@@ -19,6 +19,7 @@
     const,
     parse_informacje,
     wez_zakres_stron,
+    parse_informacje_as_dict,
 )
 from bpp.system import User
 from bpp.util import pbar
@@ -947,11 +948,25 @@ def integruj_publikacje(offset=None, limit=None):
                 # E: bibliogr. poz
 
                 kw["szczegoly"] = elem.get("a")
+
+                # A: moze byc to 'rok tom' lub 'rok numer' lub 'rok numer tom'
+                pi = parse_informacje_as_dict(elem.get("a"))
+                if pi.get("rok") and (pi.get("numer") or pi.get("tom")):
+                    assert not kw.get("tom")
+                    assert not kw.get("nr_zeszytu")
+
+                    kw["tom"] = pi.get("tom")
+                    kw["nr_zeszytu"] = pi.get("numer")
+
                 kw["informacje"] = exp_combine(kw.get("informacje"), elem.get("b"))
 
                 if elem.get("b"):
-                    assert not kw.get("tom")
-                    kw["tom"] = elem.get("b")
+                    if elem.get("b").startswith("nr "):
+                        assert not kw.get("nr_zeszytu")
+                        kw["nr_zeszytu"] = elem.get("b").replace("nr ", "")
+                    else:
+                        assert not kw.get("tom")
+                        kw["tom"] = elem.get("b")
 
                 kw["szczegoly"] = exp_combine(kw["szczegoly"], elem.get("c"))