# Finding Similar Neighborhoods

## Twitter data grabber
This code listens to the Twitter stream for live Foursquare check-ins.

In [None]:
"""Retrieve checkins tweets

    We use mongodb to save the tweets
    Mongodb is hosted on local machine and at port 27017
    
    This method runs for 1 hour and then saves all the tweets found in the mongodb
"""
from timeit import default_timer as clock
from time import sleep
import TwitterAPI as twitter
import twitter_helper as th
import arguments
import ConfigParser
DB = None
SAVE = None
import CommonMongo as cm
DB = cm.connect_to_db('foursquare', 'localhost', 27017)[0]

import CheckinAPICrawler as cac
CRAWLER = cac.CheckinAPICrawler()
from Queue import Queue
from threading import Thread
# Size of a Mongo bulk insert, expressed as a multiple of cac.BITLY_SIZE:
# accumulate_checkins flushes once INSERT_SIZE*cac.BITLY_SIZE checkins wait.
INSERT_SIZE = 7
# Bounded queue between the stream reader and the accumulating thread; its
# capacity is three cac.BITLY_SIZE chunks larger than one insert batch.
CHECKINS_QUEUE = Queue((INSERT_SIZE+3)*cac.BITLY_SIZE)
# Number of stream items seen so far (incremented by read_twitter_stream).
NB_TWEETS = 0
# Number of checkins for which the crawler returned information (incremented
# by post_process).
NUM_VALID = 0


def post_process(checkins):
    """Use `CRAWLER` to follow the URL within `checkins` and update them with
    information regarding the actual Foursquare checkin.

    The `lid` of every checkin is handed to `CRAWLER.checkins_from_url`. Each
    truthy answer is unpacked as `(id, uid, vid, time)` and merged into the
    dict form of the matching checkin: `id` is replaced by `_id`, `lid` is
    overwritten by the venue id, and `uid` and `time` are set.

    Every element of `checkins` is acknowledged with
    `CHECKINS_QUEUE.task_done()` whatever the crawler answered, so that
    `CHECKINS_QUEUE.join()` cannot wait forever on items that were taken out
    of the queue but never resolved. Callers must therefore only pass items
    obtained from `CHECKINS_QUEUE.get()`, each of them once.

    Return the list of converted checkins, or None when the crawler returned
    nothing.
    """
    global NUM_VALID
    infos = CRAWLER.checkins_from_url([c.lid for c in checkins])
    finalized = []
    for checkin, info in zip(checkins, infos or []):
        if not info:
            continue
        converted = checkin._asdict()
        id_, uid, vid, time = info
        del converted['id']
        converted['_id'] = id_
        converted['uid'] = uid
        converted['lid'] = vid
        converted['time'] = time
        finalized.append(converted)
        NUM_VALID += 1
    # One acknowledgement per dequeued item, even when `infos` is empty or
    # shorter than `checkins`.
    for _ in checkins:
        CHECKINS_QUEUE.task_done()
    if not infos:
        return None
    return finalized


def accumulate_checkins():
    """Consume CHECKINS_QUEUE, saving the checkins one full batch at a time.

    Checkins are buffered until INSERT_SIZE*cac.BITLY_SIZE of them are
    waiting, then post-processed and saved with SAVE. The loop stops when a
    falsy item (the end-of-time signal) is received or when a batch could not
    be saved; whatever is still buffered is then flushed one last time.
    """
    pending = []
    while True:
        item = CHECKINS_QUEUE.get()
        if not item:
            # None signals the end of the allowed time
            CHECKINS_QUEUE.task_done()
            break
        pending.append(item)
        if len(pending) != INSERT_SIZE*cac.BITLY_SIZE:
            continue
        outcome = save_checkins(post_process(pending), SAVE)
        pending[:] = []
        # outcome is None when nothing could be saved, e.g. because
        # CRAWLER.checkins_from_url returned None
        if outcome is None:
            break
    save_checkins(post_process(pending), SAVE)


def save_checkins(complete, saving_method):
    """Save `complete` using `saving_method`.

    Return None, without calling `saving_method`, when `complete` is empty or
    None; otherwise call `saving_method(complete)` and return True.
    """
    # Call form with a single argument: it prints exactly the same text as the
    # former print statement and matches the print(...) calls used elsewhere
    # in this file.
    print('Save `complete` using `saving_method`.')
    if not complete:
        return None
    saving_method(complete)
    return True


def save_checkins_mongo(complete):
    """Bulk insert the documents of `complete` in the `checkin` collection of
    DB. Duplicate key errors are ignored and other operation failures are
    printed along with their code."""
    try:
        DB.checkin.insert(complete, continue_on_error=True)
    except cm.pymongo.errors.DuplicateKeyError:
        # some documents were already stored, nothing else to do
        pass
    except cm.pymongo.errors.OperationFailure as failure:
        print(failure, failure.code)
    else:
        print('insert {}'.format(len(complete)))


def read_twitter_stream(client, end, logging_step=60):
    """Iterate over tweets and put those matched by parse_tweet in a queue,
    until current time is more than `end`. Log info every `logging_step` new
    valid candidate.

    `client` must provide `request('statuses/filter', ...)` whose result has a
    `get_iterator()` method; `end` is compared with `clock()`.

    When the deadline is reached, None is enqueued to tell the consumer that
    no more checkins will come. `put_nowait` is used on purpose: a full queue
    raises instead of blocking, and the caller handles the exception.
    """
    global NB_TWEETS
    req = client.request('statuses/filter', {'track': '4sq,swarmapp'})
    new_tweet = 'get {}, {}/{}, {:.1f} seconds to go'
    nb_cand = 0
    for item in req.get_iterator():
        candidate = th.parse_tweet(item)
        NB_TWEETS += 1
        if candidate:
            CHECKINS_QUEUE.put_nowait(candidate)
            nb_cand += 1
            if nb_cand % logging_step == 0:
                cac.logging.info(new_tweet.format(candidate.tid,
                                                  nb_cand, NB_TWEETS,
                                                  end - clock()))
        # The deadline is checked for every item of the stream, not only for
        # valid candidates, otherwise a stream without any matching tweet
        # would never stop.
        if clock() >= end:
            CHECKINS_QUEUE.put_nowait(None)
            break

# Driver of the tweet collection: index the checkin collection, start the
# accumulating thread, then read the Twitter stream for one hour.
if DB:
    DB.checkin.ensure_index([('loc', cm.pymongo.GEOSPHERE),
                             ('lid', cm.pymongo.ASCENDING),
                             ('city', cm.pymongo.ASCENDING),
                             ('time', cm.pymongo.ASCENDING)])
# saving method used by accumulate_checkins
SAVE = save_checkins_mongo
# Twitter credentials are read from the [twitter] section of api_keys.cfg
config = ConfigParser.ConfigParser()
config.read('api_keys.cfg')
api = twitter.TwitterAPI(config.get('twitter', 'consumer_key'),
               config.get('twitter', 'consumer_secret'),
               config.get('twitter', 'access_token'),
               config.get('twitter', 'access_token_secret'))
# daemon thread consuming CHECKINS_QUEUE while the main thread fills it
accu = Thread(target=accumulate_checkins, name='AccumulateCheckins')
accu.daemon = True
accu.start()
start = clock()
end = start + 1*60*60 # 1 hour
failures = th.Failures(initial_waiting_time=2.0)
# read_twitter_stream returns either at the deadline or when the stream ends
# early; in the latter case it is called again until `end`.
while clock() < end:
    print clock()
    try:
        read_twitter_stream(api, end)
        print 'stream read'
    except (KeyboardInterrupt, SystemExit):
        # tell the accumulating thread to stop before propagating
        CHECKINS_QUEUE.put_nowait(None)
        print 'keyboard interrupt'
        raise
    except:
        msg = 'Fail to read or enqueue tweet\n'
        #print msg
        cac.logging.exception(msg)
        waiting_time = failures.fail()
        # give up when the wait would go past the deadline, after 5 recent
        # failures, or when nobody consumes the queue anymore
        if clock() + waiting_time > end or \
           failures.recent_failures >= 5 or \
           not accu.is_alive():
            CHECKINS_QUEUE.put_nowait(None)
            break
        msg = 'Will wait for {:.0f} seconds'.format(waiting_time)
        cac.logging.info(msg)
        failures.do_sleep()

# wait until every enqueued item has been acknowledged with task_done
CHECKINS_QUEUE.join()
report = 'insert {} valid checkins in {:.2f}s (out of {}).'
cac.logging.info(report.format(NUM_VALID, clock() - start, NB_TWEETS))
sleep(5)


## Fill the database with Foursquare data
Use the collected tweets and the AskFourquare module to request information about users and venues, before inserting them into a Mongo database.

We need to specify the city before we can fetch Foursquare data.

In [None]:
"""Use collected tweet and AskFourquare to fill user and venue table in the
Database."""
from time import sleep
from threading import Thread
import foursquare
import CommonMongo as cm
import Chunker
from Queue import Queue
from RequestsMonitor import RequestsMonitor
from AskFourquare import gather_all_entities_id
from AskFourquare import user_profile, venue_profile
import sys
import arguments
import ssl
import ConfigParser

config = ConfigParser.ConfigParser()
config.read('api_keys.cfg')
CLIENT_ID = config.get('foursquare', 'FOURSQUARE_ID2')
CLIENT_SECRET = config.get('foursquare', 'FOURSQUARE_SECRET2')

# Kind of Foursquare entity fetched by this cell: 'venue' or 'user'.
ENTITY_KIND = 'venue'
# For each kind: RATE is handed to RequestsMonitor, REQ names the CLIENT
# attribute used to issue requests (resolved with getattr further down),
# DB_FIELD is the checkin field passed to gather_all_entities_id and PARSE
# converts a raw answer.
if ENTITY_KIND == 'venue':
    RATE = 5000
    REQ = 'venues'
    DB_FIELD = 'lid'
    PARSE = venue_profile
elif ENTITY_KIND == 'user':
    RATE = 500
    REQ = 'users'
    DB_FIELD = 'uid'
    PARSE = user_profile
else:
    raise ValueError(ENTITY_KIND + ' is unknown')
CLIENT = foursquare.Foursquare(CLIENT_ID, CLIENT_SECRET)
# batches of ids waiting to be requested (consumed by entities_getter)
IDS_QUEUE = Queue(5)
# parsed entities waiting to be inserted (consumed by entities_putter)
ENTITIES_QUEUE = Queue(105)
LIMITOR = RequestsMonitor(RATE)
# placeholder, replaced by the Mongo collection db[ENTITY_KIND] further down
TABLE = []
# documents buffered before a bulk insert (see mongo_insertion)
TO_BE_INSERTED = []
# ids that could not be turned into a valid entity
INVALID_ID = []


def convert_entity_for_mongo(entity):
    """Return the dict form of the namedtuple `entity` in which the `id` field
    has been renamed `_id`, the primary key name used by Mongo."""
    document = entity._asdict()
    document['_id'] = document.pop('id')
    return document


def entities_getter():
    """Forever take batches of ids from IDS_QUEUE, request them from
    Foursquare in a single multi call and dispatch every answer.

    After a server or SSL error, Foursquare is considered down and the
    remaining batches are only acknowledged, without being requested.
    """
    service_down = False
    while True:
        batch = IDS_QUEUE.get()
        if service_down:
            # drain the queue so that IDS_QUEUE.join() can return
            IDS_QUEUE.task_done()
            continue
        allowed, delay = LIMITOR.more_allowed(CLIENT)
        if not allowed:
            sleep(delay + 3)
        # register every request, they are sent together by CLIENT.multi()
        for id_ in batch:
            REQ(id_, multi=True)
        answers = []
        try:
            answers = list(CLIENT.multi())
        except foursquare.ParamError as err:
            print(err)
            # the offending id is the last part of the error message; fall
            # back on one request per remaining id
            invalid = str(err).split('/')[-1].replace(' ', '+')
            answers = individual_query(batch, invalid)
        except (foursquare.ServerError, ssl.SSLError) as err:
            print(err)
            service_down = True

        for answer in answers:
            dispatch_answer(answer)
        IDS_QUEUE.task_done()


def dispatch_answer(a):
    """According to the type of answer of `a`, either enqueue it for DB
    insertion or try to save its id as invalid.

    - None is only reported.
    - A `foursquare.FoursquareException` is printed; the second word of its
      message is recorded in INVALID_ID when it is 24 characters long.
    - Any other answer is parsed with PARSE: a truthy result is put in
      ENTITIES_QUEUE, otherwise the id of the entity (if any) is recorded in
      INVALID_ID.
    """
    if a is None:
        print('None answer')
        return
    if isinstance(a, foursquare.FoursquareException):
        print(a)
        # deleted venue or server error
        words = str(a).split()
        # A message of fewer than two words carries no id; guarding here
        # avoids an IndexError that would kill the calling thread.
        if len(words) > 1 and len(words[1]) == 24:
            INVALID_ID.append(words[1])
        return
    parsed = PARSE(a[ENTITY_KIND])
    if parsed:
        ENTITIES_QUEUE.put(parsed)
        return
    # no lon and lat
    vid = a[ENTITY_KIND].get('id')
    if vid:
        INVALID_ID.append(vid)


def individual_query(batch, invalid):
    """Request the ids of `batch` one by one, skipping `invalid`.

    Return a list as long as `batch` whose elements are the answers of REQ,
    or None for `invalid` and for the ids whose request raised (the error is
    then printed).
    """
    print(batch, invalid)
    answers = []
    for id_ in batch:
        if id_ == invalid:
            answers.append(None)
            continue
        try:
            reply = REQ(id_)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            print(sys.exc_info()[1])
            reply = None
        answers.append(reply)
    assert len(answers) == len(batch)
    return answers


def entities_putter():
    """Forever move entities from ENTITIES_QUEUE to the TO_BE_INSERTED buffer,
    which is flushed to Mongo as soon as it holds at least 400 documents."""
    while True:
        document = convert_entity_for_mongo(ENTITIES_QUEUE.get())
        TO_BE_INSERTED.append(document)
        if not len(TO_BE_INSERTED) < 400:
            mongo_insertion()
        ENTITIES_QUEUE.task_done()


def mongo_insertion():
    """Bulk insert the content of TO_BE_INSERTED in TABLE, then empty the
    buffer in place. Duplicate keys are ignored and other operation failures
    are printed; the buffer is emptied in both cases."""
    global TO_BE_INSERTED
    if len(TO_BE_INSERTED) < 1:
        return
    try:
        TABLE.insert(TO_BE_INSERTED, continue_on_error=True)
    except cm.pymongo.errors.DuplicateKeyError:
        # already in the collection
        pass
    except cm.pymongo.errors.OperationFailure as failure:
        print(failure, failure.code)
    TO_BE_INSERTED[:] = []


# Driver of the Foursquare collection: find which entity ids of the checkins
# of `city` are not yet known, request them and insert them in the database.
# Turn the attribute name held by REQ into the actual CLIENT attribute.
REQ = getattr(CLIENT, REQ)
db = cm.connect_to_db('foursquare', '127.0.0.1', 27017)[0]
checkins = db['checkin']
TABLE = db[ENTITY_KIND]
if ENTITY_KIND == 'venue':
    TABLE.ensure_index([('loc', cm.pymongo.GEOSPHERE),
                        ('city', cm.pymongo.ASCENDING),
                        ('cat', cm.pymongo.ASCENDING)])
# one daemon thread queries Foursquare, another one inserts the results
t = Thread(target=entities_getter, name='Query4SQ')
t.daemon = True
t.start()
t = Thread(target=entities_putter, name='InsertDB')
t.daemon = True
t.start()
total_entities = 0

city = 'moscow' #Specify city here

chunker = Chunker.Chunker(foursquare.MAX_MULTI_REQUESTS)
# ids already stored for this city
previous = [e['_id'] for e in TABLE.find({'city': city})]
print previous
potential = gather_all_entities_id(checkins, DB_FIELD, city=city)
print('but already {} {}s in DB.'.format(len(previous), ENTITY_KIND))
import persistent as p
region = city or 'world'
invalid_filename = 'non_{}_id_{}'.format(ENTITY_KIND, region)
# ids found invalid by a previous run, if such a file exists
try:
    INVALID_ID = p.load_var(invalid_filename)
except IOError:
    pass
print('and {} {}s are invalid.'.format(len(INVALID_ID), ENTITY_KIND))
# keep only the ids that are neither stored, nor invalid, nor stored with no
# city
new_ones = set(potential).difference(set(previous))
new_ones = new_ones.difference(set(INVALID_ID))
outside = set([e['_id'] for e in TABLE.find({'city': None}, {'_id': 1})])
outside.intersection_update(new_ones)
print('and {} {}s are outside range.'.format(len(outside), ENTITY_KIND))
new_ones = new_ones.difference(outside)
print('So only {} new ones.'.format(len(new_ones)))
for batch in chunker(new_ones):
    IDS_QUEUE.put(batch)
    total_entities += len(batch)

# wait for both queues to be fully processed, then flush the last documents
IDS_QUEUE.join()
ENTITIES_QUEUE.join()
mongo_insertion()
print('{}/{} invalid id'.format(len(INVALID_ID), total_entities))
print('{}/{} requests'.format(CLIENT.rate_remaining, CLIENT.rate_limit))
p.save_var(invalid_filename, INVALID_ID)


## Maintain a tree of Foursquare categories

In [3]:
"""Maintain a tree of Foursquare categories and provide query methods."""
from collections import namedtuple
import persistent as p
import string
Category = namedtuple('Category', ['id', 'name', 'depth', 'sub'])
import enum
Field = enum.Enum('Field', 'id name')  # pylint: disable=C0103
import bidict
CAT_TO_ID = bidict.bidict({None: '0', 'Venue': '1'})
ID_TO_INDEX = bidict.bidict({None: 0, '0': 0, '1': 1})


def parse_categories(top_list, depth=0):
    """Recursively convert the raw categories of `top_list` into a list of
    Category, each holding its own sub categories.

    Every category is created with a depth of `depth`+1 and its name is
    registered in CAT_TO_ID along the way.
    """
    if len(top_list) < 1:
        return []
    parsed = []
    for raw in top_list:
        has_children = isinstance(raw, dict) and 'categories' in raw
        if has_children:
            children = parse_categories(raw['categories'], depth+1)
        else:
            children = []
        id_, name = str(raw['id']), unicode(raw['name'])
        CAT_TO_ID[name] = id_
        parsed.append(Category(id_, name, depth+1, children))
    return parsed


def get_categories(client=None):
    """Return the tree of categories, rooted at the 'Venue' category.

    Raw categories come from the Foursquare `client` when one is given (they
    are then saved on disk as 'raw_categories'), and from that saved copy
    otherwise. ID_TO_INDEX is also updated: every id except '0' and '1' is
    mapped to 100 plus its position among all the sorted ids.
    """
    if client is not None:
        payload = client.venues.categories()
        p.save_var('raw_categories', payload)
    else:
        payload = p.load_var('raw_categories')
    raw_cats = payload['categories']
    cats = Category('1', 'Venue', 0, parse_categories(raw_cats))
    # pylint: disable=E1101
    ordered_ids = sorted(CAT_TO_ID.values())
    id_index = [(id_, position + 100)
                for position, id_ in enumerate(ordered_ids)
                if id_ not in ['0', '1']]
    ID_TO_INDEX.update(id_index)
    return cats


# Category tree built once when this cell runs; without a client,
# get_categories reads the raw categories previously saved on disk.
CATS = globals()['get_categories']()


def search_categories(query, cats=CATS, field=None):
    """Look for the category equal to `query` (a name or an id) in the tree
    `cats`, depth first.

    Return the category and the list of `field` values from the root of
    `cats` down to it, or (None, None) when nothing matches. `field` is the
    index of the Category field compared with `query`; by default it is
    guessed from `query` by choose_type.
    """
    if field is None:
        field = choose_type(query)
    current = cats[field]
    if current == query:
        return cats, [query]
    for child in cats.sub:
        match, trail = search_categories(query, child, field)
        if match is None:
            continue
        return match, [current] + trail
    return None, None


def choose_type(query):
    """Return the Category field index matching `query`: 0 (the id) when its
    first character is a digit, 1 (the name) otherwise."""
    starts_with_digit = query[0] in string.digits
    return 0 if starts_with_digit else 1


def pre_traversal(cats, field):
    """Flatten the tree `cats` into the list of the `field` (0 or 1) of each
    category, in depth first order with a parent before its children."""
    assert field in [0, 1]
    flat = [cats[field]]
    for child in cats.sub or ():
        flat.extend(pre_traversal(child, field))
    return flat


def json_traversal(cats, field):
    """Convert the tree `cats` into nested dictionaries. Each node is
    {'name': its `field` value} and, only when it has sub categories, a
    'children' list of the converted sub categories."""
    node = {'name': cats[field]}
    if cats.sub:
        node['children'] = [json_traversal(child, field)
                            for child in cats.sub]
    return node


def get_subcategories(query, field=None):
    """Return the flat list made of the category `query` followed by all its
    sub categories. Each one is described by the Field member `field`, or by
    the type guessed from `query` when `field` is not given."""
    root, _ = search_categories(query)
    if field:
        # Field values start at 1 whereas Category indices start at 0
        index = field.value - 1
    else:
        index = choose_type(query)
    return pre_traversal(root, index)


# Sample queries on the category tree
cbar, bpath = search_categories('Bar')
all_college = get_subcategories('4d4b7105d754a06372d81259', Field.id)
# print(all_college)
# Export the names of the whole tree (rooted at id '1') as nested
# {'name', 'children'} dictionaries in flare.json
j = json_traversal(search_categories('1')[0], 1)
import json
import codecs
with codecs.open('flare.json', 'w', 'utf8') as f:
    json.dump(j, f)


persistent.py/load_var
CommonMongo.py/connect_to_db
CommonMongo.py/build_query
[{u'_id': u'34100109', u'checkins': 1}]

Invalid checkin id



[u'34100109']
persistent.py/load_var
persistent.py/load_var
persistent.py/load_var
[u'34100109']
Still 1 users to process.
34100109
persistent.py/save_var
persistent.py/save_var
persistent.py/save_var
1080


## Data Processing
The main step is to transform the raw data collected into a feature matrix whose rows are the venues of a city with enough visits and whose columns are the features.

In [8]:
"""Try to describe venue by various features."""
import prettyplotlib as ppl
import matplotlib.pyplot as plt
from collections import Counter, defaultdict, OrderedDict
from sklearn.neighbors import KernelDensity
import CommonMongo as cm
#import FSCategories as fsc
#import explore as xp
import numpy as np
import pandas as pd
import utils as u
import random as r
import itertools
import scipy.io as sio
import scipy.cluster.vq as cluster
from scipy.stats import multivariate_normal
import re
import string
import persistent as p
import Surrounding as s
# Matches any single whitespace or punctuation character (used by
# normalized_tag to strip them).
NOISE = re.compile(r'[\s'+string.punctuation+r']')
# Mongo database and client, set by cm.connect_to_db at the end of this cell
DB = None
CLIENT = None
# marker characters, one per class plotted by draw_classes
LEGEND = 'v^<>s*xo|8d+'
# Category names used for the distribution of categories_repartition and, in
# describe_city, as fallback level 2 categories.
CATS = ['Arts & Entertainment', 'College & University', 'Food',
        'Nightlife Spot', 'Outdoors & Recreation', 'Shop & Service',
        'Professional & Other Places', 'Residence', 'Travel & Transport']
# radius of the surrounding of a venue (presumably in meters -- TODO confirm)
RADIUS = 350
# 2D Gaussian centered on the origin with covariance (RADIUS/2.5)*identity,
# and its density at the origin, used by smoothed_location to get weights
# relative to the maximum.
SMOOTH = multivariate_normal([0, 0], (RADIUS/2.5)*np.eye(2))
SMOOTH_MAX = SMOOTH.pdf([0, 0])


def geo_project(city, entities):
    """Return {id: 2D position} for the objects of `entities`, obtained by
    applying the projection of `city` (cm.cities.GEO_TO_2D) to their
    coordinates taken in reversed order."""
    pairs = [(entity['_id'], list(reversed(entity['loc']['coordinates'])))
             for entity in entities]
    ids, loc = zip(*pairs)
    to_plane = cm.cities.GEO_TO_2D[city]
    projected = to_plane(np.array(loc))
    return dict(zip(ids, projected))


@u.memodict
def is_event(cat_id):
    """Tell whether `cat_id` is the id of the 'Event' category or of one of
    its sub categories."""
    event_ids = fsc.get_subcategories('Event', fsc.Field.id)
    return cat_id in event_ids


def global_info(city, standalone=False):
    """Gather global statistics about `city`.

    Return a list of nine elements:
    - the {id: 2D position} mappings of venues, checkins and photos,
    - the visits, the visitors and the density estimator of the city,
    - the Surrounding objects of venues, checkins and photos.

    As side effects, the global TOP_CATS is loaded from 'top_cats.my' and the
    venues surrounding is saved on disk (all three surroundings when
    `standalone` is True). `standalone` also changes which fields are stored
    in the venue and photo surroundings.
    """
    lvenues = geo_project(city, DB.venue.find({'city': city}, {'loc': 1}))
    lcheckins = geo_project(city, DB.checkin.find({'city': city}, {'loc': 1}))
    lphotos = geo_project(city, CLIENT.world.photos.find({'hint': city},
                                                         {'loc': 1}))
    local_projection = [lvenues, lcheckins, lphotos]
    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = xp.get_visitors(CLIENT, city)
    density = estimate_density(city)
    activity = [visits, visitors, density]
    # TOP_CATS is read by categories_repartition
    global TOP_CATS
    TOP_CATS = p.load_var('top_cats.my')
    infos = {'venue': [] if standalone else ['cat', 'cats'],
             'photo': ['taken'] if standalone else ['venue']}
    svenues = s.Surrounding(DB.venue, {'city': city}, infos['venue'], lvenues)
    scheckins = s.Surrounding(DB.checkin, {'city': city}, ['time'], lcheckins)
    sphotos = s.Surrounding(CLIENT.world.photos, {'hint': city},
                            infos['photo'], lphotos)
    surroundings = [svenues, scheckins, sphotos]
    # always saved, as '<city>_svenues.my'
    p.save_var('{}_s{}s.my'.format(city, 'venue'), svenues)
    if standalone:
        for name, var in zip(['venue', 'checkin', 'photo'], surroundings):
            p.save_var('{}_s{}s.my'.format(city, name), var)
    return local_projection + activity + surroundings


def describe_city(city):
    """Compute feature vector for selected venue in `city`.

    A venue is selected when it is not closed, has a category which is not an
    event, more than one user, more than 4 visits and more than one distinct
    visitor. The result is saved with scipy.io.savemat in the file
    `city`+'_fv' under the keys:
    - 'v': the matrix of features, one row per venue and 31 columns:
      0-4 likes, users, checkins, visitors entropy and density; 5 code of the
      level 2 category; 6-14 distribution of categories around the venue;
      15 photo focus; 16 photos/checkins ratio; 17 week end flag; 18-23
      frequency of its own visits; 24 time entropy; 25-30 frequency of the
      visits around it,
    - 'c': the distribution of categories over the whole city,
    - 'i': the ids of the venues, in row order,
    - 'stat': the number of distinct visitors of the city.
    """
    CATS2 = p.load_var('cat_depth_2.my')
    # a few venues don't have level 2 categories (TODO add it manually?)
    CATS2.update({cat: int(idx*1e5) for idx, cat in enumerate(CATS)})
    info = global_info(city)
    lvenues, lcheckins, lphotos = info[:3]
    visits, visitors, density = info[3:6]
    nb_visitors = np.unique(np.array([v for place in visitors.itervalues()
                                      for v in place])).size
    svenues, scheckins, sphotos = info[6:]
    categories = categories_repartition(city, svenues, lvenues, RADIUS)
    venues = DB.venue.find({'city': city, 'closed': {'$ne': True},
                            'cat': {'$ne': None}, 'usersCount': {'$gt': 1}},
                           {'cat': 1})
    chosen = [v['_id'] for v in venues
              if len(visits.get(v['_id'], [])) > 4 and
              len(np.unique(visitors.get(v['_id'], []))) > 1 and
              not is_event(v['cat'])]
    print("Chosen {} venues in {}.".format(len(chosen), city))
    info, _ = venues_info(chosen, visits, visitors, density, depth=2,
                          tags_freq=False)
    print("{} of them will be in the matrix.".format(len(info)))
    numeric = np.zeros((len(info), 31), dtype=np.float32)
    numeric[:, :5] = np.array([info['likes'], info['users'], info['checkins'],
                               info['H'], info['Den']]).T
    print('venues with no level 2 category:')
    print([info.index[i] for i, c in enumerate(info['cat'])
           if CATS2[c] % int(1e5) == 0])
    numeric[:, 5] = [CATS2[c] for c in info['cat']]
    numeric[:, 24] = np.array(info['Ht'])
    for idx, vid in enumerate(info.index):
        surrounding = full_surrounding(vid, lvenues, lphotos, lcheckins,
                                       svenues, scheckins, sphotos, city)
        cat, focus, ratio, around_visits = surrounding
        numeric[idx, 6:15] = cat
        numeric[idx, 15] = focus
        numeric[idx, 16] = ratio
        own_visits = visits[vid]
        numeric[idx, 17] = is_week_end_place(own_visits)
        daily_visits = xp.aggregate_visits(own_visits, 1, 4)[0]
        numeric[idx, 18:24] = xp.to_frequency(daily_visits)
        numeric[idx, 25:31] = xp.to_frequency(around_visits)
    # Replace NaN and infinite cells by 0 with a boolean mask. Indexing with
    # the output of np.argwhere would instead treat each (row, col) pair as
    # two row numbers and zero whole rows, including unrelated ones.
    weird = np.logical_or(np.isnan(numeric), np.isinf(numeric))
    numeric[weird] = 0.0
    sio.savemat(city+'_fv', {'v': numeric, 'c': categories,
                             'i': np.array(list(info.index)),
                             'stat': [nb_visitors]}, do_compression=True)


def venues_info(vids, visits=None, visitors=None, density=None, depth=10,
                tags_freq=True):
    """Return various info about the venue ids `vids`.

    The first returned value is a DataFrame indexed by venue id with the
    columns name, price, rating, likes, users, checkins, tags (only when
    `tags_freq`), cat (category at level `depth`), vis (number of visits),
    H (entropy of visitors), Ht (time entropy) and Den (density). The second
    one is an OrderedDict {normalized tag: count} sorted by decreasing count,
    empty when `tags_freq` is False.

    `visits`, `visitors` and `density` are computed for the city of the first
    venue when they are not provided (or are falsy). All the venues of `vids`
    must be in the DB, otherwise an AssertionError is raised.
    """
    tags = defaultdict(int)
    # the city is taken from the first venue
    city = DB.venue.find_one({'_id': vids[0]})['city']
    visits = visits or xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = visitors or xp.get_visitors(CLIENT, city)
    density = density or estimate_density(city)
    venues = list(DB.venue.find({'_id': {'$in': vids}},
                                {'cat': 1, 'name': 1, 'loc': 1,
                                 'price': 1, 'rating': 1, 'tags': 1,
                                 'likes': 1, 'usersCount': 1,
                                 'checkinsCount': 1}))

    msg = 'Asked for {} but get only {}'.format(len(vids), len(venues))
    assert len(vids) == len(venues), msg
    res = pd.DataFrame(index=[_['_id'] for _ in venues])

    def add_col(field):
        # the column is named after the field, without its 'Count' suffix
        res[field.replace('Count', '')] = [_[field] for _ in venues]
    for field in ['name', 'price', 'rating', 'likes',
                  'usersCount', 'checkinsCount']:
        add_col(field)
    if tags_freq:
        res['tags'] = [[normalized_tag(t) for t in _['tags']] for _ in venues]
    loc = [_['loc']['coordinates'] for _ in venues]
    get_cat = lambda c, d: top_category(c) if d == 1 else parenting_cat(c, d)
    res['cat'] = [get_cat(_['cat'], depth) for _ in venues]
    res['vis'] = [len(visits[id_]) for id_ in res.index]
    res['H'] = [venue_entropy(visitors[id_]) for id_ in res.index]
    res['Ht'] = [time_entropy(visits[id_]) for id_ in res.index]
    # swap the two coordinate columns before projecting, as geo_project does
    coords = np.fliplr(np.array(loc))
    points = cm.cities.GEO_TO_2D[city](coords)
    res['Den'] = density(points)
    if tags_freq:
        for venue in venues:
            for tag in venue['tags']:
                tags[normalized_tag(tag)] += 1
    return res, OrderedDict(sorted(tags.iteritems(), key=lambda x: x[1],
                                   reverse=True))


def estimate_density(city):
    """Fit a kernel density estimator on the position of the venues of `city`
    and return a function mapping an array of 2D points to their density,
    relative to the approximate maximum density of the city."""
    kde = KernelDensity(bandwidth=175, rtol=1e-4)
    surround = xp.build_surrounding(DB.venue, city, likes=-1, checkins=1)
    positions = surround.venues[:, :2]
    kde.fit(positions)
    max_density = approximate_maximum_density(kde, positions)

    def relative_density(xy):
        # score_samples returns log densities
        # pylint: disable=E1101
        return np.exp(kde.score_samples(xy))/max_density
    return relative_density


def approximate_maximum_density(kde, venues, precision=128):
    """Return the largest density found by `kde` on a regular grid of
    `precision` by `precision` points covering the bounding box of the first
    two columns of `venues`."""
    # pylint: disable=E1101
    xs, ys = venues[:, 0], venues[:, 1]
    xgrid = np.linspace(np.min(xs), np.max(xs), precision)
    ygrid = np.linspace(np.min(ys), np.max(ys), precision)
    grid_x, grid_y = np.meshgrid(xgrid, ygrid)
    points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    # score_samples returns log densities
    densities = np.exp(kde.score_samples(points))
    return np.max(densities)


def smoothed_location(loc, center, radius, city, pmapping):
    """Return one weight per id of `loc`, given by the SMOOTH 2D Gaussian
    evaluated at its position relative to `center` (divided by 20) and
    normalized by the maximum of SMOOTH.

    `pmapping` is a dictionary {id: 2D position} and `center` a 2D point.
    An empty `loc` yields an empty list and a single id always gets a weight
    of 1.0. `radius` and `city` are not used.
    """
    nb_points = len(loc)
    if nb_points < 2:
        # [] for no point, [1.0] for a single one
        return [1.0] * nb_points
    assert len(center) == 2
    # TODO: loc could directly be the subset
    positions = np.array([pmapping[point_id] for point_id in loc])
    offsets = positions - center
    return SMOOTH.pdf(offsets/20)/SMOOTH_MAX


def full_surrounding(vid, vmapping, pmapping, cmapping, svenues, scheckins,
                     sphotos, city, radius=350):
    """Describe what lies within `radius` of the venue `vid` in `city`.

    Return the distribution of venue categories, the photo focus, the
    photos/checkins ratio and the aggregated visits of the surrounding
    checkins (six ones when there is no checkin around). The mappings are
    dict({id: 2dpos}) of venues, photos and checkins.
    """
    cat_distrib = categories_repartition(city, svenues, vmapping, radius, vid)
    center = vmapping[vid]
    pids, photo_infos, _ = sphotos.around(center, radius)
    pvenue = photo_infos[0]
    cids, checkin_infos, _ = scheckins.around(center, radius)
    ctime = checkin_infos[0]
    focus = photo_focus(vid, center, pids, pvenue, radius, pmapping)
    photogeny, c_smoothed = photo_ratio(center, pids, cids, radius, pmapping,
                                        cmapping)
    nb_checkins = len(ctime)
    if nb_checkins < 5:
        print(vid + ' is anomalous because there is less than 5 check-in in a 350m radius')
    if nb_checkins != 0:
        surround_visits = xp.aggregate_visits(ctime, 1, 4, c_smoothed)[0]
    else:
        surround_visits = np.ones(6)
    return cat_distrib, focus, photogeny, surround_visits


def photo_focus(vid, center, pids, pvenue, radius, mapping):
    """Compare the photos around `vid` that are about it with those about
    another venue.

    Each photo of `pids` is weighted by its smoothed distance to `center` and
    `pvenue` gives the venue id attached to it (photos without one are
    ignored). Return the weight of the photos tagged with `vid` divided by
    the weight of the photos tagged with other venues, or 0 when the latter
    is below 1e-4.
    """
    weights = smoothed_location(pids, center, radius, None, mapping)
    own, others = 0, 0
    for tagged_venue, weight in zip(pvenue, weights):
        if not tagged_venue:
            continue
        if tagged_venue == vid:
            own += weight
        else:
            others += weight
    if others < 1e-4:
        return 0
    return own / others


def photo_ratio(center, pids, cids, radius, pmapping, cmapping):
    """Return the number of photos `pids` over the number of checkins `cids`
    around `center`, both weighted by a Gaussian, along with the weights of
    the checkins."""
    photo_weights = smoothed_location(pids, center, radius, None, pmapping)
    checkin_weights = smoothed_location(cids, center, radius, None, cmapping)
    # The sum of the checkin weights should not be 0 because for the venue to
    # exist, there must be some checkins around. NOTE: actually, there are
    # anomalous venues for which it is not the case
    ratio = np.sum(photo_weights)/np.sum(checkin_weights)
    return ratio, checkin_weights


def is_week_end_place(place_visits):
    """Return 1 if strictly more than half of the datetimes in `place_visits`
    fall in the week end, 0 otherwise. The week end is made of Friday from
    20h, the whole Saturday and Sunday until 20h included."""
    def during_week_end(hour, day):
        if day == 5:
            return True
        if day == 4:
            return hour >= 20
        if day == 6:
            return hour <= 20
        return False
    nb_week_end = sum(1 for visit in place_visits
                      if during_week_end(visit.hour, visit.weekday()))
    return int(nb_week_end > 0.5*len(place_visits))


def categories_repartition(city, svenues, vmapping, radius, vid=None):
    """Return the distribution over CATS of the categories of the venues
    within `radius` of `vid`, each venue being weighted by its smoothed
    distance to `vid`. Without `vid`, all the venues of `svenues` are used
    with the same weight."""
    if vid:
        center = vmapping[vid]
        vids, vcats, _ = svenues.around(center, radius)
        weights = smoothed_location(vids, center, radius, city, vmapping)
    else:
        vids, vcats, _ = svenues.all()
        weights = itertools.cycle([1.0])
    totals = defaultdict(int)
    for own_cats, weight in zip(vcats[0], weights):
        for cat in own_cats:
            totals[TOP_CATS[cat]] += weight
    counts = np.array([totals[name] for name in CATS])
    # Can't be zero because there is always at least the venue itself in
    # surrounding.
    return counts / np.sum(counts)


def venue_entropy(visitors):
    """Return the entropy of a venue, computed by u.compute_entropy from the
    number of visits of each of its `visitors`."""
    # pylint: disable=E1101
    visits_per_visitor = list(Counter(visitors).values())
    return u.compute_entropy(np.array(visits_per_visitor, dtype=float))


def time_entropy(visits):
    """Return the entropy of the hours of the day at which the `visits` of a
    venue occurred, divided by log(24)."""
    visit_hours = [moment.hour for moment in visits]
    per_hour = np.bincount(visit_hours, minlength=24)
    entropy = u.compute_entropy(per_hour.astype(float))
    return entropy/np.log(24.0)


def normalized_tag(tag):
    """Return `tag` in lower case, without the characters matched by NOISE
    (punctuation and whitespace)."""
    stripped = NOISE.sub('', tag)
    return stripped.lower()


def count_tags(tags):
    """Return a Counter of the normalized form of every tag found in `tags`,
    a list of lists of tags."""
    occurrences = Counter()
    for oneset in tags:
        for tag in oneset:
            occurrences[normalized_tag(tag)] += 1
    return occurrences


@u.memodict
def top_category(cat):
    """Return the ancestor of `cat` at the first level of the category tree,
    as computed by parenting_cat."""
    return parenting_cat(cat, depth=1)


def parenting_cat(cat, depth=1):
    """Return the name of the ancestor of `cat` (a category id or name) found
    at level `depth` of the category tree, or of `cat` itself when it is not
    that deep. Return None for an empty `cat`."""
    if not cat:
        return None
    _, path = fsc.search_categories(cat)
    given_as_name = fsc.choose_type(cat)
    if len(path) > depth:
        answer = path[depth]
    else:
        answer = path[-1]
    if not given_as_name:
        # the path is made of ids: look up CAT_TO_ID the other way round
        # (presumably id -> name -- TODO confirm the bidict slice semantics)
        return fsc.CAT_TO_ID[:answer]
    return answer


def get_loc(vid):
    """Return the coordinates of the venue `vid`, or None when it is not in
    the DB."""
    found = DB.venue.find_one({'_id': vid}, {'loc': 1})
    return u.get_nested(found, ['loc', 'coordinates']) if found else None


def get_venue(vid, depth=1):
    """Return the tuple (category at level `depth`, name, id) describing the
    venue `vid`, or None when it is not in the DB. An unknown category is
    replaced by '???'."""
    record = DB.venue.find_one({'_id': vid}, {'cat': 1, 'name': 1})
    if not record:
        return None
    record['cat'] = parenting_cat(record.get('cat'), depth) or '???'
    return (record['cat'], record['name'], vid)


def photos_around(id_, centroid, offset, daily, radius=200):
    """Gather the timestamps of the photos taken within `radius` of the venue
    `id_` and compare their time pattern with every row of `centroid`.

    Return the number of photos, their time pattern (the one selected by
    `daily` among those computed with `offset`), its euclidean distance to
    each centroid and the index of the closest one.
    """
    center = get_loc(id_)
    photos = xp.get_visits(CLIENT, xp.Entity.photo, ball=(center, radius))
    patterns = xp.aggregate_visits(photos.values(), offset)
    kind = xp.to_frequency(patterns[daily])
    nb_class = centroid.shape[0]
    # pylint: disable=E1101
    repeated = np.tile(kind, (nb_class, 1))
    classes = np.linalg.norm(repeated - centroid, axis=1)
    closest = np.argmin(classes)
    return len(photos), kind, classes, closest


def named_ticks(kind, offset=0, chunk=3):
    """Return ticks label for kind in ('day', 'week', 'mix').

    - 'day': one 'start--end' label per period of `chunk` hours, the first
      one starting at `offset` (hours are taken modulo 24),
    - 'week': the seven abbreviated day names,
    - 'mix': every day name followed by 1, 2 and 3.

    Raise ValueError for any other `kind`.
    """
    # `kind` is compared with ==: identity (`is`) only worked for strings
    # that happened to be interned by the interpreter.
    if kind == 'day':
        period = lambda i: '{}--{}'.format(i % 24, (i+chunk) % 24)
        return [period(i) for i in range(0+offset, 24+offset, chunk)]
    days = 'mon tue wed thu fri sat sun'.split()
    if kind == 'week':
        return days
    if kind == 'mix':
        period = '1 2 3'.split()
        return [d+''+p for d in days for p in period]
    raise ValueError('`kind` argument is not valid')


def draw_classes(centroid, offset, chunk=3):
    """Plot every row of `centroid` (a time pattern) with its own marker and
    color, then label the x axis with periods of the day when there is one
    column per `chunk` hours, and with the 21 'mix' labels otherwise."""
    nb_classes = centroid.shape[0]
    for i, marker in enumerate(LEGEND[:nb_classes]):
        ppl.plot(centroid[i, :], marker+'-', ms=9, c=ppl.colors.set1[i])
    if centroid.shape[1] != 24/chunk:
        plt.xticks(range(7*3), named_ticks('mix'))
    else:
        plt.xticks(range(24/chunk), named_ticks('day', offset, chunk))

def get_distorsion(ak, kl, sval):
    """Return the sum over the rows of `sval` of their euclidean distance to
    the row of `ak` designated by the matching label of `kl`."""
    assigned = ak[kl, :]
    distances = np.linalg.norm(assigned - sval, axis=1)
    return np.sum(distances)


import arguments
city = 'moscow'
# The port is given as an integer, like in the other calls to connect_to_db
# of this file. The first returned value is the database, the second one is
# used as the Mongo client.
DB, CLIENT = cm.connect_to_db('foursquare', '127.0.0.1', 27017)
# k-means with 20 iterations, initialized from `k` points of `val`
do_cluster = lambda val, k: cluster.kmeans2(val, k, 20, minit='points')

def getclass(c, kl, visits):
    """Return {id: time pattern} of the venues in class `c` of `kl`, the
    labels of `kl` being in the same order as the items of `visits`."""
    members = {}
    for (venue_id, pattern), label in zip(visits.iteritems(), kl):
        if label == c:
            members[venue_id] = pattern
    return members

def peek_at_class(c, kl, visits, k=15):
    """Return a table of at most `k` randomly chosen venues in class `c` of
    `kl`, with the columns 'cat', 'name' and 'id'.

    Venues that get_venue cannot describe (it returns None for ids missing
    from the DB) are skipped, and the whole class is returned when it has
    fewer than `k` venues instead of raising ValueError.
    """
    described = [get_venue(i) for i in getclass(c, kl, visits).keys()]
    described = [venue for venue in described if venue is not None]
    # random.sample refuses a sample larger than the population
    sample = r.sample(described, min(k, len(described)))
    return pd.DataFrame({'cat': [_[0] for _ in sample],
                         'name': [_[1] for _ in sample],
                         'id': [_[2] for _ in sample]})
# Compute and save the feature matrix of every city of cm.cities.SHORT_KEY,
# except New York.
for c in cm.cities.SHORT_KEY:
    if c == 'newyork':
        continue
    describe_city(c)


utils.py/memodict
utils.py/memodict


"import arguments\ncity = 'moscow'\nDB, CLIENT = cm.connect_to_db('foursquare', '127.0.0.1', '27017')\ndo_cluster = lambda val, k: cluster.kmeans2(val, k, 20, minit='points')\n\ndef getclass(c, kl, visits):\n    #Return {id: time pattern} of the venues in class `c` of\n    #`kl`.\n    return {v[0]: v[1] for v, k in zip(visits.iteritems(), kl) if k == c}\n\ndef peek_at_class(c, kl, visits, k=15):\n    #Return a table of `k` randomly chosen venues in class `c` of\n    #`kl`.\n    sample = r.sample([get_venue(i)\n                       for i in getclass(c, kl, visits).keys()], k)\n    return pd.DataFrame({'cat': [_[0] for _ in sample],\n                         'name': [_[1] for _ in sample],\n                         'id': [_[2] for _ in sample]})\nfor c in cm.cities.SHORT_KEY:\n    if c == 'newyork':\n        continue\n    describe_city(c)\n"

## Computation

Defines query_in_one_city, which performs a single similarity query from a GeoJSON polygon in one city to another city

In [23]:
"""Find closest region in every other cities in the world."""
import cities as c
import json
import itertools
from scipy.spatial import cKDTree, ConvexHull
import approx_emd as app
import numpy as np
import neighborhood as nb
import persistent as p
import shapely.geometry as sgeo
from operator import itemgetter


# Load, for every city, the stored venue locations and the feature
# description, then derive the per-city lookup structures used by the queries.
print('start loading city info')
cities = set(c.SHORT_KEY)
cities_venues_raw = {name: p.load_var(name+'_svenues.my') for name in cities}
cities_desc = {
    name: nb.cn.gather_info(name, raw_features=True, hide_category=True)
    for name in cities
}
# All three dictionaries below are keyed by city name:
#   cities_venues: one 2-column row of coordinates per indexed venue
#   cities_index:  venue id -> row position in the arrays above
#   cities_kdtree: spatial index built over the coordinates
cities_venues, cities_index, cities_kdtree = {}, {}, {}
for city in cities:
    vids, _, locs = cities_venues_raw[city].all()
    vindex = cities_desc[city]['index']
    coordinates = cities_venues[city] = np.zeros((len(vindex), 2))
    positions = cities_index[city] = {vid: row
                                      for row, vid in enumerate(vindex)}
    for vid, loc in itertools.izip(vids, locs):
        pos = positions.get(vid)
        if pos is None:
            # venue without features: its location is not needed
            continue
        coordinates[pos, :] = loc
    cities_kdtree[city] = cKDTree(coordinates)
print('done')


def retrieve_closest_venues(query_venues, query_city, target_city, k=60):
    """Find candidate venues of `target_city` resembling the query venues.

    `query_venues` holds ids of venues of `query_city`; their feature rows are
    compared with all the feature rows of `target_city` by
    `app.get_candidates_venues`, using `k` as the neighbourhood size
    (default 60, the value previously hard-coded).

    Return a `(candidates, threshold)` tuple: `candidates` is the result of
    `app.get_candidates_venues` and `threshold` is the number of venues in
    `target_city` scaled by the fraction of `query_city` venues that belong to
    the query, truncated to an integer.
    """
    query_index = cities_desc[query_city]['index']
    # rows of the query city's feature matrix that belong to the query
    mask = np.where(np.in1d(query_index, query_venues))[0]
    query_features = cities_desc[query_city]['features'][mask, :]
    all_target_features = cities_desc[target_city]['features']
    tindex = cities_desc[target_city]['index']
    candidates = app.get_candidates_venues(query_features,
                                           all_target_features, k=k)
    # the 1.0 factor forces a float division under Python 2
    threshold = int(len(tindex)*1.0*len(query_venues) / len(query_index))
    return candidates, threshold


def query_in_one_city(source, target, region):
    """Search `target` for the areas most similar to `region` of `source`.

    `source` and `target` are city names and `region` is a JSON polygon.
    Candidate venues of `target` are clustered, each cluster is turned into
    one or more venue areas, and every area is scored with the distance
    function returned by `nb.interpret_query` for the 'emd' metric.

    Return a list of at most five dictionaries `{'venues': set of venue ids,
    'dst': distance}`, ordered by increasing distance and sharing no venue
    with one another.
    """
    infos = nb.interpret_query(source, target, region, 'emd')
    _, right, _, regions_distance, vids, _ = infos
    vindex = np.array(right['index'])
    vloc = cities_venues[target]
    candidates, _ = retrieve_closest_venues(vids, source, target)
    print(source, target)

    # clustering parameters: the second one depends on the size of the target
    eps = 250
    mpts = 10 if len(vloc) < 5000 else 40
    clusters = app.good_clustering(vloc, list(sorted(candidates)), eps, mpts)

    areas = []
    for group in clusters:
        venues_areas = app.cluster_to_venues(group, vloc,
                                             cities_kdtree[target], 4)
        if not len(venues_areas):
            continue
        for members in venues_areas:
            member_ids = vindex[members]
            features = right['features'][members, :]
            weights = nb.weighting_venues(features[:, 1])
            dst = regions_distance(features.tolist(), weights)
            areas.append({'venues': set(member_ids), 'dst': dst})

    # Greedy selection: walk the areas from closest to farthest and keep only
    # those that do not overlap an area already kept.
    distances = [area['dst'] for area in areas]
    raw_result = []
    venues_so_far = set()
    for idx in np.argsort(distances):
        area = areas[idx]
        if venues_so_far.intersection(area['venues']):
            continue
        venues_so_far.update(area['venues'])
        raw_result.append(area)
        if len(raw_result) >= 5:
            break
    return raw_result


def venues_to_geojson(vids, city):
    """Return the GeoJSON polygon of the convex hull of venues `vids`.

    `vids` are venue ids of `city`. Their stored coordinates are looked up,
    reduced to the vertices of their convex hull, converted with
    `c.euclidean_to_geo`, and the two coordinate columns are swapped before
    the polygon is built.
    """
    positions = itemgetter(*vids)(cities_index[city])
    coordinates = cities_venues[city][positions, :]
    hull_vertices = ConvexHull(coordinates).vertices
    geohull = c.euclidean_to_geo(city, coordinates[hull_vertices, :])
    polygon = sgeo.Polygon(np.fliplr(geohull))
    return sgeo.mapping(polygon)


from collections import namedtuple, defaultdict

# One query per region of the input file: the region's origin city is
# compared against every other known city.
Query = namedtuple('Query', 'origin targets name geo'.split())
with open('thirdworld.json') as inf:
    regions = json.load(inf)
queries = []
for region in regions:
    properties = region['properties']
    origin = properties['origin']
    name = properties['name']
    geo = region['geometry']
    targets = cities - {origin}
    queries.append(Query(origin=origin, targets=targets, name=name, geo=geo))

# Run every query against each of its target cities and collect, under the
# query name, one (city, distance, venue ids) tuple per region returned.
results = defaultdict(list)
for query in queries:
    for city in query.targets:
        for res in query_in_one_city(query.origin, city, query.geo):
            results[query.name].append((city, res['dst'], res['venues']))


start loading city info


KeyError: 'houston'