update

lingpy · Nov 12, 2018 · 410f166 · 410f166
1 parent 594f00f
commit 410f166
Show file tree

Hide file tree

Showing 8 changed files with 7,422 additions and 0 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,4 @@
+include *.md LICENSE
+graft src
+global-exclude *.py[co]
+
diff --git a/pip-requirements.txt b/pip-requirements.txt
@@ -0,0 +1,2 @@
+clldutils
+segments
diff --git a/setup.py b/setup.py
@@ -0,0 +1,29 @@
+#import distribute_setup
+#distribute_setup.use_setuptools()
+
+from setuptools import setup, find_packages,Extension
+import codecs
+# setup package name etc as a default
+pkgname = 'poepy'
+
+
+setup(
+        name=pkgname,
+        description="A Python library for handling annotated rhymes.",
+        version='0.1.0',
+        packages=find_packages(where='src'),
+        package_dir={'': 'src'},
+        zip_safe=False,
+        license="GPL",
+        include_package_data=True,
+        install_requires=['lingpy'],
+        url='https://github.com/lingpy/poepy',
+        long_description=codecs.open('README.md', 'r', 'utf-8').read(),
+        long_description_content_type='text/markdown',
+        #entry_points={
+        #    'console_scripts': ['sinopy=sinopy.cli:main'],
+        #},
+        author='Johann-Mattis List',
+        author_email='list@shh.mpg.de',
+        keywords='Chinese linguistics, historical linguistics, computer-assisted language comparison'
+        )
diff --git a/src/poepy/__init__.py b/src/poepy/__init__.py
@@ -0,0 +1 @@
+from .poepy import *
diff --git a/src/poepy/conf/poems.rc b/src/poepy/conf/poems.rc
@@ -0,0 +1,18 @@
+# Basic namespaces
+counterpart	str	counterpart
+doculect	str	language,doculect,dialect,taxon,taxa
+concept	str	gloss,concept,concepts
+iso	str	iso,isocode
+tokens	lambda x:x.split()	tokens,tokenized_counterpart,ipatokens
+segments	lambda x:x.split()	segments
+
+alignment	lambda x: x.split()	alignment
+line_order	int	line_order
+
+# specific names for alignments
+conceptid	int	conceptid
+
+# Namespace for rhyme identifiers (space-separated integers)
+rhymeids	lambda x: [int(s) for s in x.split()]	crossids
+
+
diff --git a/src/poepy/data/Wang1980.tsv b/src/poepy/data/Wang1980.tsv
diff --git a/src/poepy/data/references.bib b/src/poepy/data/references.bib
@@ -0,0 +1,15 @@
+
+@Book{Wang1980,
+  Title                    = {{S}hījīng {Y}ùndú},
+  Address                  = {Shànghǎi 上海},
+  Author                   = {Wáng Lì 王力},
+  Publisher                = {Shànghǎi Gǔjī 上海古籍},
+  Year                     = {1980},
+
+  Owner                    = {mattis},
+  Shortauthor              = {Wáng},
+  Timestamp                = {2016.01.09},
+  Usera                    = {Rhyme readings in the Book of Odes},
+  Userb                    = {詩經韻讀}
+}
+
diff --git a/src/poepy/poepy.py b/src/poepy/poepy.py
@@ -0,0 +1,66 @@
+from clldutils.path import Path, remove, path_component
+from segments import Tokenizer
+from lingpy import *
+import networkx as nx
+from tqdm import tqdm
+from itertools import combinations
+from tabulate import tabulate
+
+def poepy_path(*comps):
+    return Path(__file__).parent.joinpath(*comps).as_posix()
+
+
+class Poems(Alignments):
+
+    def __init__(self, infile, ref='rhymeids', line='line', poem='poem',
+            stanza='stanza', alignment='alignment', fuzzy=True,
+            conf=poepy_path('conf', 'poems.rc'), **keywords):
+
+        Alignments.__init__(self, infile, col=poem, row=stanza, conf=conf,
+                segments=line, ref=ref, alignment=alignment, fuzzy=fuzzy,
+                transcription='line_in_source')
+
+    def stats(self):
+        print('Poems:       {0}'.format(len(self.cols)))
+        print('Stanzas:     {0}'.format(len(self.rows)))
+        print('Lines:       {0}'.format(len(self)))
+        print('Rhyme words: {0}'.format(sum([len(self.msa[self._ref][key]['ID']) for key
+            in self.msa[self._ref]])))
+        print('Rhymes:      {0}'.format(len(self.msa[self._ref])))
+        print('Words:       {0}'.format(sum([len(self[idx, 'line']) for idx in
+            self])))
+
+    def get_rhyme_network(self, ref='rhymeids'):
+        G = nx.Graph()
+
+        for key, msa in tqdm(self.msa[ref].items()):
+            for idx, seq in zip(msa['ID'], msa['seqs']):
+                node = ' '.join(seq)
+                try:
+                    G.node[node]['weight'] += 1
+                    G.node[node]['occurrences'] += [str(idx)]
+                except KeyError:
+                    G.add_node(node, weight=1, occurrences=[str(idx)])
+
+            for (idxA, seqA), (idxB, seqB) in combinations(
+                    zip(msa['ID'], msa['seqs']), r=2):
+                nodeA, nodeB = ' '.join(seqA), ' '.join(seqB)
+                try:
+                    G[nodeA][nodeB]['weight'] += 1
+                    G[nodeA][nodeB]['stanza'] += [self[idx, 'stanza']]
+                except KeyError:
+                    G.add_edge(nodeA, nodeB, weight=1, stanza=[self[idx,
+                    'stanza']])
+        self.G = G
+
+    def get_connected_components(self):
+        if not hasattr(self, 'G'):
+            raise ValueError('compute the rhyme network first')
+        self.comps = {}
+        for i, comp in enumerate(nx.connected_components(self.G)):
+            self.comps[i+1] = list(comp)
+
+    #def pprint(self, stanzas):
+    #    table = []
+    #    for stanza in stanzas:
+