Skip to content

Commit

Permalink
New: add dataset search by name similarity.
Browse files Browse the repository at this point in the history
  • Loading branch information
iamaziz committed Feb 3, 2016
1 parent 54d5d8e commit 734d5bc
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 9 deletions.
3 changes: 2 additions & 1 deletion pydataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# main interface to pydataset module

from .datasets_handler import __print_item_docs, __read_csv, __datasets_desc
from .support import find_similar


def data(item=None, show_doc=False):
Expand Down Expand Up @@ -33,7 +34,7 @@ def data(item=None, show_doc=False):
df = __read_csv(item)
return df
except KeyError:
raise Exception('Wrong dataset name! Try: data() to see available.')
find_similar(item)
else:
return __datasets_desc()

Expand Down
12 changes: 6 additions & 6 deletions pydataset/locate_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def __get_data_folder_path():
for dirname, dirnames, filenames in os_walk(data_path):

# store item name and path to all csv files.
for filename in filenames:
if filename.endswith('.csv'):
for fname in filenames:
if fname.endswith('.csv') and not fname.startswith('.'):
# e.g. pydataset-package/rdata/csv/boot/acme.csv
item_path = path_join(dirname, filename)
item_path = path_join(dirname, fname)
# e.g. acme.csv
item_file = os_path.split(item_path)[1]
# e.g. acme
Expand All @@ -43,9 +43,9 @@ def __get_data_folder_path():
items[item] = item_path

# store item name and path to all html files.
for filename in filenames:
if filename.endswith('.html'):
item_path = path_join(dirname, filename)
for fname in filenames:
if fname.endswith('.html') and not fname.startswith('.'):
item_path = path_join(dirname, fname)
item_file = os_path.split(item_path)[1]
item = item_file.replace('.html', '')
docs[item] = item_path
Expand Down
54 changes: 54 additions & 0 deletions pydataset/support.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@

from difflib import SequenceMatcher as SM
from collections import Counter
from .locate_datasets import __items_dict


# Names of every known dataset: the keys of the mapping returned by
# __items_dict(), materialised once when this module is imported.
DATASET_IDS = list(__items_dict().keys())
# Message carried by the exception raised in find_similar() when the query
# has no sufficiently similar dataset name.
ERROR = ('Not valid dataset name and no similar found! '
'Try: data() to see available.')


def similarity(w1, w2, threshold=0.5):
    """Score how alike two words are, ignoring letter case.

    Both arguments are converted with ``str`` and lower-cased, then compared
    with ``difflib.SequenceMatcher``.

    Returns the matcher's ratio when it is strictly greater than
    ``threshold``; otherwise returns 0.

    NOTE: if the matches look like junk, increase ``threshold``.
    """
    left = str(w1).lower()
    right = str(w2).lower()
    score = SM(None, left, right).ratio()
    if score > threshold:
        return score
    return 0


def search_similar(s1, dlist=DATASET_IDS, MAX_SIMILARS=10):
    """Return the closest matches to ``s1`` among the names in ``dlist``.

    Args:
        s1: the query string (typically a mistyped dataset name).
        dlist: iterable of candidate names; defaults to ``DATASET_IDS``.
        MAX_SIMILARS: maximum number of matches to return.

    Returns:
        A list of ``(candidate, ratio)`` tuples ordered from most to least
        similar, containing at most ``MAX_SIMILARS`` entries. Candidates for
        which ``similarity`` returns 0 (at or below its threshold) are
        excluded, so the list may be empty.
    """
    # Score each candidate exactly once; the previous comprehension called
    # similarity() twice per candidate (once to filter, once to store).
    similars = {}
    for s2 in dlist:
        ratio = similarity(s1, s2)
        if ratio:
            similars[s2] = ratio

    # most_common(n) yields the n highest-valued entries, so pass
    # MAX_SIMILARS itself; passing MAX_SIMILARS + 1 returned one match more
    # than the documented limit.
    return Counter(similars).most_common(MAX_SIMILARS)


def find_similar(query):
    """Print dataset names that resemble ``query``.

    Candidates come from ``search_similar``. When at least one is found, a
    'Did you mean:' header is printed, followed by the candidate names
    joined with commas.

    Raises:
        Exception: carrying the module-level ``ERROR`` message when
            ``search_similar`` returns no candidates.
    """
    matches = search_similar(query)
    if not matches:
        raise Exception(ERROR)

    # Each match is a (name, ratio) pair; only the names are shown.
    suggestions = [name for name, _ratio in matches]

    print('Did you mean:')
    print(', '.join(suggestions))


if __name__ == '__main__':
    # Manual smoke check: look up suggestions for a deliberately partial name.
    find_similar('ansc')
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
"Python (in dataframe structure)."),
author='Aziz Alto',
url='https://github.com/iamaziz/PyDataset',
download_url='https://github.com/iamaziz/PyDataset/tarball/0.1.1',
download_url='https://github.com/iamaziz/PyDataset/tarball/0.2.0',
license = 'MIT',
author_email='iamaziz.alto@gmail.com',
version='0.1.1',
version='0.2.0',
install_requires=['pandas'],
packages=['pydataset', 'pydataset.utils'],
package_data={'pydataset': ['*.gz', 'resources.tar.gz']}
Expand Down

0 comments on commit 734d5bc

Please sign in to comment.