New: add dataset search by name similarity.

iamaziz · Feb 3, 2016 · 734d5bc · 734d5bc
1 parent 54d5d8e
commit 734d5bc
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 9 deletions.
diff --git a/pydataset/__init__.py b/pydataset/__init__.py
@@ -2,6 +2,7 @@
 # main interface to pydataset module
 
 from .datasets_handler import __print_item_docs, __read_csv, __datasets_desc
+from .support import find_similar
 
 
 def data(item=None, show_doc=False):
@@ -33,7 +34,7 @@ def data(item=None, show_doc=False):
             df = __read_csv(item)
             return df
         except KeyError:
-            raise Exception('Wrong dataset name! Try: data() to see available.')
+            find_similar(item)
     else:
         return __datasets_desc()
 

diff --git a/pydataset/locate_datasets.py b/pydataset/locate_datasets.py
@@ -31,10 +31,10 @@ def __get_data_folder_path():
 for dirname, dirnames, filenames in os_walk(data_path):
 
     # store item name and path to all csv files.
-    for filename in filenames:
-        if filename.endswith('.csv'):
+    for fname in filenames:
+        if fname.endswith('.csv') and not fname.startswith('.'):
             # e.g. pydataset-package/rdata/csv/boot/acme.csv
-            item_path = path_join(dirname, filename)
+            item_path = path_join(dirname, fname)
             # e.g acme.csv
             item_file = os_path.split(item_path)[1]
             # e.g. acme
@@ -43,9 +43,9 @@ def __get_data_folder_path():
             items[item] = item_path
 
     # store item name and path to all html files.
-    for filename in filenames:
-        if filename.endswith('.html'):
-            item_path = path_join(dirname, filename)
+    for fname in filenames:
+        if fname.endswith('.html') and not fname.startswith('.'):
+            item_path = path_join(dirname, fname)
             item_file = os_path.split(item_path)[1]
             item = item_file.replace('.html', '')
             docs[item] = item_path

diff --git a/pydataset/support.py b/pydataset/support.py
@@ -0,0 +1,54 @@
+
+from difflib import SequenceMatcher as SM
+from collections import Counter
+from .locate_datasets import __items_dict
+
+
+DATASET_IDS = list(__items_dict().keys())  # default candidates for search_similar(); built once at import
+ERROR = ('Not valid dataset name and no similar found! '  # raised by find_similar() when nothing matches
+         'Try: data() to see available.')
+
+
+def similarity(w1, w2, threshold=0.5):
+    """Return the case-insensitive difflib ratio of str(w1) and str(w2).
+
+    A ratio that does not exceed `threshold` is reported as 0; raise the
+    threshold if the matches look like junk.
+    """
+    left, right = str(w1).lower(), str(w2).lower()
+    score = SM(None, left, right).ratio()
+    return 0 if score <= threshold else score
+
+
+def search_similar(s1, dlist=DATASET_IDS, MAX_SIMILARS=10):
+    """Return up to MAX_SIMILARS (dataset_id, ratio) pairs from `dlist`
+    that resemble `s1`, most similar first.
+
+    Only candidates for which similarity() returns a non-zero ratio are kept.
+    """
+    # Score every candidate once so each ratio is computed a single time.
+    scored = ((s2, similarity(s1, s2)) for s2 in dlist)
+    similars = {s2: ratio for s2, ratio in scored if ratio}
+    # most_common(n) yields at most n pairs -- no "+1", to honour the limit.
+    return Counter(similars).most_common(MAX_SIMILARS)
+
+
+def find_similar(query):
+    """Print the dataset names that resemble `query`.
+
+    Raises:
+        Exception: with the ERROR message when search_similar() finds none.
+    """
+    matches = search_similar(query)
+    if not matches:
+        raise Exception(ERROR)
+
+    # Each match is a (name, ratio) pair; only the names are displayed.
+    print('Did you mean:')
+    print(', '.join(name for name, _ in matches))
+
+
+if __name__ == '__main__':
+    # Manual check: run the suggestion lookup for a sample query.
+    s = 'ansc'
+    find_similar(s)
diff --git a/setup.py b/setup.py
@@ -15,10 +15,10 @@
                  "Python (in dataframe structure)."),
     author='Aziz Alto',
     url='https://github.com/iamaziz/PyDataset',
-    download_url='https://github.com/iamaziz/PyDataset/tarball/0.1.1',
+    download_url='https://github.com/iamaziz/PyDataset/tarball/0.2.0',
     license = 'MIT',
     author_email='iamaziz.alto@gmail.com',
-    version='0.1.1',
+    version='0.2.0',
     install_requires=['pandas'],
     packages=['pydataset', 'pydataset.utils'],
     package_data={'pydataset': ['*.gz', 'resources.tar.gz']}