In [1]:
import json
import s2sphere as s2
from copy import deepcopy

In [2]:
with open('collegeonly.json', 'r', encoding='utf8') as read_file:
    collegecells = json.load(read_file)

In [3]:
with open('univonly.json', 'r', encoding='utf8') as read_file:
    univcells = json.load(read_file)

In [4]:
with open('geonames.json', 'r', encoding='utf8') as read_file:
    geonames = json.load(read_file)

In [5]:
with open('usunivandcollegeindex.json', 'r', encoding='utf8') as read_file:
    usindex = json.load(read_file)['elements']

In [6]:
def convertToTokenDict(cell_list):
    ldict = {}
    for element in cell_list:
        for token in element['cover16s']:
            if token in ldict:
                raise ValueError
            else:
                ldict[token] = element
    return ldict

In [7]:
univldict = convertToTokenDict(univcells)

In [8]:
collegeldict = convertToTokenDict(collegecells)

In [9]:
len(univcells)

21187

In [10]:
len(collegecells)

24465

In [11]:
def findspecificcode(namelist):
    ftcode = []
    for el in namelist:
        if el['feature-code'] == 'UNIV':
            ftcode.append(el)
    return ftcode

In [12]:
univonlygeonames = findspecificcode(geonames)

In [13]:
len(univonlygeonames)

1359

In [14]:
len(usindex)

6780

In [15]:
# def linkCoordinateToBound(ldicts, namelist, numexpansions=0):
#     matched = 0
#     unmatched = 0
#     matchedindex = []
#     unmatchedindex = []
#     def expandTokens(token):
#         instcell = s2.CellId.from_token(token)
#         nbrs = set(map(lambda x: x.to_token(), list(instcell.get_edge_neighbors()) + [instcell]))
#         return nbrs
    
#     for ldict in ldicts:
#         for ltok in ldict:
#             if ldict[ltok].get('matched'):
#                 ldict[ltok].pop('matched')
    
    
#     for institution in namelist:
#         insttok = s2.CellId.from_lat_lng(s2.LatLng.from_degrees(float(institution['latitude']), float(institution['longitude']))).parent(16).to_token()
        
#         allnbrs = set([insttok])
#         allmmbrs = set()
        
#         for expindex in range(0,numexpansions+1):
#             for celltok in list(allnbrs):
#                 if celltok not in allmmbrs:
#                     if expindex != numexpansions:
#                         thenbrs = expandTokens(celltok)
#                         allnbrs.update(thenbrs)
#                     allmmbrs.add(celltok)
  
#         somematch = False
#         for ldict in ldicts:
#             for exptok in allmmbrs:
#                 coorsbound = ldict.get(exptok)
#                 if coorsbound:
#                     if ldict[exptok].get('matched'):
#                         ldict[exptok]['matched'].append(institution)
#                     else:
#                         ldict[exptok]['matched'] = [institution]
#                     somematch = True
#                     break
                    
#         if somematch:
#             matchedindex.append(institution)
#             matched += 1
#         else:
#             unmatchedindex.append(institution)
#             unmatched += 1
        
#     print(matched)
#     print(unmatched)
#     return matchedindex, unmatchedindex
        

In [16]:
# matched, unmatched = linkCoordinateToBound([univldict, collegeldict], usindex, numexpansions=2)

In [17]:
def stufff(nameindex, celllists, selectiveness=20, numexpansions=1, clean=False):

    
    def cleanSlate():
    
        from nltk.corpus import stopwords
        import string
        from string import punctuation
        import re
        
        def expandTokens(token):
            instcell = s2.CellId.from_token(token)
            nbrs = set(map(lambda x: x.to_token(), list(instcell.get_edge_neighbors()) + [instcell]))
            return nbrs
        
        #############
        #REMOVALS
        for nm in nameindex:
            for key in nm.keys():
                if key in ['regex', 'hit10s', 'hit11s', 'hit12s', 'hit13s', 'hit14s', 'hit15s', 'hit16s', 'cover16s']:
                    nm.pop(key)

        for celllist in celllists:
            for el in celllist:
                for key in el.keys():
                    if key in ['regex', 'hit10s', 'hit11s', 'hit12s', 'hit13s', 'hit14s', 'hit15s', 'hit16s']:
                        nm.pop(key)


        #####################
        #HIT TOKENS
        for celllist in celllists:
            for el in celllist:
                for i in range(16, 17):
                    b = s2.CellUnion(map(lambda x: s2.CellId.from_token(x), el.get('cover16s')))
                    b.expand(i)
                    b.normalize()
                    el['hit' + str(i) + 's'] = list(map(lambda x: x.to_token(), b.cell_ids()))

        for nm in nameindex:
            insttok = s2.CellId.from_lat_lng(s2.LatLng.from_degrees(float(nm['latitude']), float(nm['longitude']))).parent(16).to_token()    
            allnbrs = set([insttok])
            allmmbrs = set()

            for expindex in range(0,numexpansions+1):
                for celltok in list(allnbrs):
                    if celltok not in allmmbrs:
                        if expindex != numexpansions:
                            thenbrs = expandTokens(celltok)
                            allnbrs.update(thenbrs)
                        allmmbrs.add(celltok)

            for i in range(16,17):
                a = s2.CellUnion(map(lambda x: s2.CellId.from_token(x), list(allmmbrs)))
                a.expand(i)
                a.normalize()
                nm['hit' + str(i) + 's'] = list(map(lambda x: x.to_token(), a.cell_ids()))
    
        #######################
        #WORDS
        exclude = set(string.punctuation)
        exclude.update(list(stopwords.words('english')))
        exclude.add('')
        r = re.compile(r'[\s{}]+'.format(re.escape(punctuation)))
        
        for nm in nameindex:
    #         splitalias = [tt for tt in (r.split(nm['alias'].lower())) if tt not in exclude]
            splitnm = [tt for tt in (r.split(nm['institution'].lower())) if tt not in exclude]
    #         finalwords = splitalias + splitnm
            finalwords = splitnm
            nm['regex'] = finalwords

        for celllist in celllists:
            for el in celllist:
                finalwords = []
                for onename in el['allnames']:
                    finalwords.extend([sss.lower() for sss in r.split(onename) if sss.lower() not in exclude])
                el['regex'] = finalwords

    ##############
    #OPERATIONS SETUP
    def setup():
        
        for nm in nameindex:
            if 'cover16s' in nm:
                nm.pop('cover16s')
            if 'estimated' in nm:
                nm.pop('cover16s')
            if 'children' in nm:
                nm.pop('children')
                    
        from collections import Counter
        
        comprehensiveindexwords = []
        for indexel in nameindex:
            comprehensiveindexwords.extend(list(set(indexel['regex'])))

        indexcountsdict = Counter(comprehensiveindexwords)

        indexcountsvalues = list(indexcountsdict.values())
        indexcountwords = list(indexcountsdict.keys())
        indexcountsvalues, indexcountwords = zip(*sorted(zip(indexcountsvalues, indexcountwords)))

        bannedwords = set(indexcountwords[::-1][:len(indexcountwords) // selectiveness])

        
        cellnmdict = {}
        for celllist in celllists:
            for el in celllist:
                for cellregnm in el['regex']:
                    if cellregnm in cellnmdict:
                        if not el in cellnmdict[cellregnm]:
                            cellnmdict[cellregnm].append(el)
                    else:
                        cellnmdict[cellregnm] = [el]

        cellldict = {}
        for celllist in celllists:
            for el in celllist:
                for hitkey in ['hit16s']:
                    for celltok in el[hitkey]:
                        if celltok in cellldict:
                            if not el in cellldict[celltok]:
                                cellldict[celltok].append(el)
                        else:
                            cellldict[celltok] = [el]
        return bannedwords, cellldict, cellnmdict
        
    def colidebylocationidentifybyname():
        for printindex, nm in enumerate(nameindex):

    #         if printindex % 10 == 0:
    #             print("printindex")
    #             print(printindex)

            collision = []
            for hitkey in ['hit16s']:
                for nmtok in nm[hitkey]:
                    possiblecollision = cellldict.get(nmtok)
                    if possiblecollision:
                        for elfrompossible in possiblecollision:
                            if elfrompossible not in collision:
                                collision.append(elfrompossible)

            #added in
            finalchoices = collision
            if finalchoices:
                completecover16s = set()
                repcellid = s2.CellId.from_lat_lng(s2.LatLng.from_degrees(float(nm['latitude']), float(nm['longitude']))).parent(16).to_token()
                for finalchoice in finalchoices:
                    if repcellid in finalchoice['cover16s']:
                        completecover16s.update(finalchoice['cover16s'])
                nm['cover16s'] = list(completecover16s)

#             bannedwordencounters = 0
#             candidates = []
#             potentialcandidates = []

#             if collision:
#                 for regnm in nm['regex']:
#                     if regnm not in bannedwords:
#                         if regnm in cellnmdict:
#                             for innerel in cellnmdict[regnm]:
#                                 if innerel in collision:
#                                     if innerel not in candidates:
#                                         candidates.append(innerel)
#                     else:
#                         if regnm in cellnmdict:
#                             for innerel in cellnmdict[regnm]:
#                                 if innerel in collision:
#                                     if (regnm, innerel) not in potentialcandidates:
#                                         potentialcandidates.append((regnm, innerel))
#                         bannedwordencounters += 1


#                 if bannedwordencounters == len(nm['regex']):
#                     secondorderbannedencounters = 0
#                     for regnm in nm['regex']:
#                         if regnm not in secondorderbannedwords:
#                             for pcand in potentialcandidates:
#                                 if pcand[0] == regnm and pcand[1] not in candidates:
#                                     candidates.append(pcand[1])
#                         else:
#                             secondorderbannedencounters += 1

#                     if secondorderbannedencounters == bannedwordencounters:
#                         print("ALL SECOND ORDER ENCOUNTERED")
#                         print(nm['regex'])
#                         print()

#                 secondorderexactcandidates = []
#                 secondordersubsetcandidates = []
#                 nmregexset = set(nm['regex'])
#                 for cand in candidates:
#                     candregexset = set(cand['regex'])
#                     if candregexset == nmregexset:
#                         secondorderexactcandidates.append(cand)
#                     elif candregexset.issubset(nmregexset) or nmregexset.issubset(candregexset):
#                         secondordersubsetcandidates.append(cand)

#                 finalchoices = []
#                 if len(secondorderexactcandidates) == 0 and len(secondordersubsetcandidates) == 0:
#                     finalchoices = []
#                 elif len(secondorderexactcandidates) == 1:
#                     finalchoices = secondorderexactcandidates
#                 elif len(secondorderexactcandidates) > 1:
#                     finalchoices = secondorderexactcandidates
#                 elif len(secondordersubsetcandidates) == 1:
#                     finalchoices = secondordersubsetcandidates
#                 elif len(secondordersubsetcandidates) > 1:
#                     finalchoices = secondordersubsetcandidates

#                 if finalchoices:
#                     completecover16s = set()
#                     repcellid = s2.CellId.from_lat_lng(s2.LatLng.from_degrees(float(nm['latitude']), float(nm['longitude']))).parent(16).to_token()
#                     for finalchoice in finalchoices:
#                         if repcellid in finalchoice['cover16s']:
#                             completecover16s.update(finalchoice['cover16s'])
#                     nm['cover16s'] = list(completecover16s)
    
    def makesurecoversunique():
        restart = True
        while restart:
            print("repeat")
            ldictmapping = {}
            restart = False
            for inel in nameindex:
                indcover16s = inel.get('cover16s', None)
                if indcover16s:
                    for tok16 in indcover16s:
                        if ldictmapping.get(tok16, None):
                            this16s = set(inel['cover16s'])
                            ldict16s = set(ldictmapping[tok16]['cover16s'])
                            intersection16s = ldict16s.intersection(this16s)
                            if float(ldictmapping[tok16]['undergraduate']) >= float(inel['undergraduate']):
                                inel['cover16s'] = list(this16s.difference(intersection16s))
                            else:
                                ldictmapping[tok16]['cover16s'] = list(ldict16s.difference(intersection16s))
                            restart = True
                            break
                        else:
                            ldictmapping[tok16] = inel
                if restart:
                    break
                            
    def expandpoints():         
        justpointsfornow = []
        for pointnm in nameindex:
            if not pointnm.get('cover16s', None):
                justpointsfornow.append(pointnm)
        
        ldictmapping = {}
        for inel in nameindex:
            indcover16s = inel.get('cover16s', None)
            if indcover16s:
                for tok16 in indcover16s:
                    if ldictmapping.get(tok16, None):
                        print("duplicate")
                    else:
                        ldictmapping[tok16] = inel

        import math
        def get_cell_ids(lat, long, radius):
            EARTH_RADIUS = 6371000  # radius of Earth in meters
            region = s2.Cap.from_axis_angle(s2.LatLng.from_degrees(lat, long).to_point(), s2.Angle.from_degrees(360*radius/(2*math.pi*EARTH_RADIUS)))
            coverer = s2.RegionCoverer()
            coverer.min_level = 16
            coverer.max_level = 16
            coverer.max_cells = 1000
            coverer.level_mod = 1
            cells = coverer.get_covering(region)
            return [x.to_token() for x in cells]

        urbanizationcalc = {'City: Midsize': 2.380979060935266e-08,
                 'City: Small': 2.378871715446322e-08,
                 'Town: Fringe': 2.3028870479638616e-08,
                 'Rural: Remote': 1.940241468738347e-08,
                 'Suburb: Large': 1.998525951622446e-08,
                 'Town: Remote': 1.8832485911217132e-08,
                 'Town: Distant': 2.2870535568747298e-08,
                 'Suburb: Small': 3.2827424802967285e-08,
                 'City: Large': 1.7779747365296302e-08,
                 'Rural: Fringe': 1.612806927185429e-08,
                 'Rural: Distant': 1.7585095137780952e-08,
                 'Suburb: Midsize': 3.056580135032997e-08}

        tometersconstant = 40589771116743.48
        onehundreddownplusextra = [100000, 10000, 1000, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9.9, 9.8, 9.700000000000001, 9.600000000000001, 9.500000000000002, 9.400000000000002, 9.300000000000002, 9.200000000000003, 9.100000000000003, 9.000000000000004, 8.900000000000004, 8.800000000000004, 8.700000000000005, 8.600000000000005, 8.500000000000005, 8.400000000000006, 8.300000000000006, 8.200000000000006, 8.100000000000007, 8.000000000000007, 7.9000000000000075, 7.800000000000008, 7.700000000000008, 7.6000000000000085, 7.500000000000009, 7.400000000000009, 7.30000000000001, 7.20000000000001, 7.10000000000001, 7.000000000000011, 6.900000000000011, 6.800000000000011, 6.700000000000012, 6.600000000000012, 6.500000000000012, 6.400000000000013, 6.300000000000013, 6.2000000000000135, 6.100000000000014, 6.000000000000014, 5.900000000000015, 5.800000000000015, 5.700000000000015, 5.600000000000016, 5.500000000000016, 5.400000000000016, 5.300000000000017, 5.200000000000017, 5.100000000000017, 5.000000000000018, 4.900000000000018, 4.8000000000000185, 4.700000000000019, 4.600000000000019, 4.5000000000000195, 4.40000000000002, 4.30000000000002, 4.200000000000021, 4.100000000000021, 4.000000000000021, 3.9000000000000212, 3.800000000000021, 3.700000000000021, 3.600000000000021, 3.500000000000021, 3.400000000000021, 3.3000000000000207, 3.2000000000000206, 3.1000000000000205, 3.0000000000000204, 2.9000000000000203, 2.8000000000000203, 2.70000000000002, 2.60000000000002, 2.50000000000002, 2.40000000000002, 2.30000000000002, 2.2000000000000197, 2.1000000000000196, 2.0000000000000195, 1.9000000000000195, 1.8000000000000194, 1.7000000000000193, 1.6000000000000192, 1.500000000000019, 1.400000000000019, 1.300000000000019, 1.2000000000000188, 1.1000000000000187, 1]
#         onehundreddownplusextra = [100000, 10000, 1000, 100, 50, 10, 5, 2, 1.5, 1]
        newcover16set = set(list(ldictmapping.keys()))

        for incrementaldown in onehundreddownplusextra:
            print(incrementaldown)
            for nm in justpointsfornow:
                if nm.get('urbanization', None) and nm.get('urbanization', None) in urbanizationcalc:
                    radiustocover = (math.sqrt((tometersconstant*urbanizationcalc[nm['urbanization']]) / (math.pi))) / (incrementaldown)
                else:
                    radiustocover = 508.14570704546776 / incrementaldown
                    
                new16s = get_cell_ids(float(nm['latitude']), float(nm['longitude']), radiustocover)

                
                newcover16set.difference_update(nm.get('cover16s', set()))

                if set(new16s).isdisjoint(newcover16set):
                    newcover16set.update(nm.get('cover16s', set()))
                    nm['cover16s'] = list(set(new16s))
                    nm['estimated'] = True
                    newcover16set.update(set(new16s))
                else:
                    newcover16set.update(nm.get('cover16s', set()))
    
    ########################################################
    #MAIN
    if clean:
        cleanSlate()
    
    bannedwords, cellldict, cellnmdict = setup()
    
#     secondorderbannedwords = set(['college', 'university', 'school', 'institute', 'community', 'academy', 'technical', 'state', 'center', 'career', 'new', 'campus', 'county', 'american', 'san', 'city', 'education', 'international', 'south', 'valley', 'north', 'central', 'national', 'southern', 'st', 'saint', 'professional', 'western', 'west', 'careers', 'schools', 'graduate', 'northwest', 'los', 'eastern', 'de', 'east', 'area', 'pacific', 'main', 'southwest', 'district', 'america', 'studies', 'santa', 'river', 'regional', 'northern', 'united', 'mexico', 'la', 'junior', 'hills', 'bay', 'southwestern', 'mountain', 'mount', 'mary', 'le', 'colleges', 'universal', 'springs', 'lakes', 'island'])
    secondorderbannedwords = set(['college', 'university', 'school', 'institute', 'community', 'academy', 'campus', 'colleges'])
    
    colidebylocationidentifybyname()
    makesurecoversunique()
    expandpoints()
    

In [18]:
stufff(usindex, [collegecells, univcells],clean=True)

repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
repeat
100000
10000
1000
100
99
98
97
96
95
94
93
92
91
90
89
88
87
86
85
84
83
82
81
80
79
78
77
76
75
74
73
72
71
70
69
68


In [4]:
with open('newusindex8.json', 'w', encoding='utf8') as write_file:
    json.dump(usindex, write_file, indent=4)

In [1]:
import json
import s2sphere as s2
from copy import deepcopy

In [2]:
with open('newusindex7.json', 'r', encoding='utf8') as read_file:
    usindex = json.load(read_file)

In [3]:
for el in usindex:
    if 'marked' in el:
        el.pop('marked')
    if 'hit16s' in el:
        el.pop('hit16s')
    if el.get('cover16s') == []:
        el.pop('cover16s')

In [4]:
finalldictmapping = {}
for inel in usindex:
    indcover16s = inel.get('cover16s', None)
    if indcover16s:
        for tok16 in indcover16s:
            if finalldictmapping.get(tok16, None):
                print("duplicate")
            else:
                finalldictmapping[tok16] = inel

In [5]:
[el['institution'] for el in usindex if not el.get('cover16s')]

['American Baptist Seminary of the West',
 'American Conservatory Theater',
 'Associated Technical College-Los Angeles',
 'Church Divinity School of the Pacific',
 'Claremont Graduate University',
 'Claremont McKenna College',
 'Western University of Health Sciences',
 'Fuller Theological Seminary',
 'Graduate Theological Union',
 'Harvey Mudd College',
 "Lyle's College of Beauty",
 'Modern Beauty Academy',
 'North-West College-Pasadena',
 'Hope International University',
 'Pacific School of Religion',
 'Pacific States University',
 'Pitzer College',
 'Starr King School for the Ministry',
 'Marshall B Ketchum University',
 'Claremont School of Theology',
 'Berkeley City College',
 'The Wright Institute',
 'Community College of Denver',
 'Iliff School of Theology',
 'Naropa University',
 'Charter Oak State College',
 'Pontifical Faculty of the Immaculate Conception at the Dominican House of Studies',
 'Strayer University-District of Columbia',
 'Trinity Washington University',
 'Embry-R

In [9]:
[innn for inddd, innn in enumerate([el for el in usindex if not el.get('cover16s')]) if inddd in [177,220,294]]

[{'unitid': '245892',
  'institution': 'Antioch University-Midwest',
  'alias': 'Antioch University McGregor',
  'city': 'Yellow Springs',
  'state': 'Ohio',
  'zipcode': '45387',
  'urbanization': 'Rural: Fringe',
  'countycode': 'Greene County, OH',
  'countyname': 'Greene County',
  'longitude': '-83.908684',
  'latitude': '39.80321',
  'active': 'Yes',
  'headcount': '239',
  'undergraduate': '54',
  'fulltime': '114',
  'hit16s': ['8840a014d',
   '8840a0153',
   '8840a0155',
   '8840a0401',
   '8840a0403',
   '8840a041d',
   '8840a06a4',
   '8840a06ac',
   '8840a06b4',
   '8840a06b9',
   '8840a06bb',
   '8840a06bd'],
  'regex': ['antioch', 'university', 'midwest'],
  'cover16s': []},
 {'unitid': '442392',
  'institution': 'Antioch University-PhD Program in Leadership and Change',
  'alias': '',
  'city': 'Yellow Springs',
  'state': 'Ohio',
  'zipcode': '45387',
  'urbanization': 'Rural: Fringe',
  'countycode': 'Greene County, OH',
  'countyname': 'Greene County',
  'longitude': 

In [4]:
[(indd, ini) for indd, ini in enumerate([finalldictmapping[s2.CellId.from_lat_lng(s2.LatLng.from_degrees(float(el['latitude']), float(el['longitude']))).parent(16).to_token()] for el in usindex if not el.get('cover16s')]) if 'Online' in ini['institution']]

[]

In [5]:
[inii.pop('marked') for inii in usindex if inii.get('marked', None)]

[True, True, True, True, True, True, True, True, True, True, True]

In [4]:

restart = True
while restart:
    restart = False
    
    finalldictmapping = {}
    for inel in usindex:
        indcover16s = inel.get('cover16s', None)
        if indcover16s:
            for tok16 in indcover16s:
                if finalldictmapping.get(tok16, None):
                    print("duplicate")
                else:
                    finalldictmapping[tok16] = inel


    for inel in usindex:
        indcover16s = inel.get('cover16s', None)
        repcellid = s2.CellId.from_lat_lng(s2.LatLng.from_degrees(float(inel['latitude']), float(inel['longitude']))).parent(16).to_token()
        if not indcover16s:
            collisionss = finalldictmapping[repcellid]
            if int(inel['undergraduate']) > int(collisionss['undergraduate']) and 'Program' not in inel['institution'] and 'Online' not in inel['institution']:
                inel['cover16s'] = deepcopy(collisionss['cover16s'])
                collisionss.pop('cover16s')
                restart = True
                break
        elif len(indcover16s) == 1 and not inel.get('marked', False):
            allnbrs = [inel]
            for neighb in s2.CellId.from_token(repcellid).get_all_neighbors(16):
                neighbor=finalldictmapping.get(neighb.to_token(), None)
                if neighbor and neighbor not in allnbrs:
                    allnbrs.append(neighbor)
            
            alloptions = allnbrs
            bestoption = max(alloptions, key=lambda x: int(x['undergraduate']))
            
            popped = None
            if 'Program' in bestoption['institution'] and 'Online' in bestoption['institution']:
                if len(alloptions) > 1:
                    popped = alloptions.pop(alloptions.index(bestoption))
                    bestoption = max(alloptions, key=lambda x: int(x['undergraduate']))
            
            if popped:
                alloptions.append(popped)
                
            newtotal16 = set()
            
            [newtotal16.update(el.get('cover16s', set())) for el in alloptions]
            [el.pop('cover16s') for el in alloptions if 'cover16s' in el]
            for rrr in alloptions:
                rrr['marked'] = True
            
            bestoption['cover16s'] = list(newtotal16)
            
            restart=True
            break
            
            
            
    #                 if float(neighbor['headcount']) < float(inel['headcount']) and not neighbor.get('estimated', False):
    #                     print(neighbor['headcount'])
    #                     print(inel['headcount'])

    #                     print(neighbor['cover16s'])
    #                     print(neighbor['institution'])
    #         print()
    #             inel['cover16s'] = [repcellid]

    #         collisionss = finalldictmapping[repcellid]
    #         if collisionss != inel and collisionss.get('children', None):
    #             collisionss['children'].append(inel)
    #             encounters.append(inel)
    #         elif collisionss != inel:
    #             collisionss['children'] = [inel]
    #             encounters.append(inel)


In [22]:
categories = {}
for inel in usindex:
    indcover16s = inel.get('cover16s', None)
    if indcover16s:
        if categories.get(inel['urbanization'], None):
            totalarea = reduce(lambda x, y: x + s2.Cell(s2.CellId.from_token(y)).exact_area(), indcover16s, 0)
            if 'Berkeley' in inel['institution'] and 'University of California' in inel['institution']:
                print(totalarea)
                print(s2.Cell(s2.CellId.from_token(indcover16s[0])).average_area())
                print(len(indcover16s))
            categories[inel['urbanization']].append(totalarea)
        else:
            totalarea = reduce(lambda x, y: x + s2.Cell(s2.CellId.from_token(y)).exact_area(), indcover16s, 0)
            categories[inel['urbanization']] = [totalarea]
            

NameError: name 'reduce' is not defined

In [None]:
19793.17 / 4.876393597557198e-10

In [None]:
import statistics
import numpy as np

In [None]:
def reject_outliers(data, m=3):
    return data[abs(data - np.mean(data)) < m * np.std(data)]

In [None]:
categories

In [None]:
categories

In [None]:
finalcategoriesarea = {}
for catkey in categories:
    
    evallist = reject_outliers(np.array(categories[catkey]))
    
    finalcategoriesarea[catkey] = np.mean(evallist)

In [None]:
finalcategoriesarea

In [None]:
rando16 = ['8862153d3', '886215375', '886215253', '886215343', '886215255', '88626c0c3', '88626c0df', '88626c0ab', '88626accd', '8862153c7', '88626bf53', '88621534b', '886215321', '886215311', '88621524b', '886215161', '88626c761', '88626b8bb', '88626b8ab', '88626b8a7', '88626c733', '88626bf39', '88626c74f', '88626c72b', '88626c0e5', '886215345', '886215337', '88621515d', '88626bf47', '88626c0e1', '8862153c9', '88626c0a9', '88626b8c9', '8862153a5', '88621536b', '8862153cd', '88626c717', '88626ad29', '8862153a3', '886215373', '886215313', '88626c0b5', '88626c73f', '8862153df', '8862153b1', '886215305', '88626c097', '88626c74d', '88621531d', '8862152d9', '8862153eb', '88626c769', '886215307', '88626c0d3', '886215341', '88626c0dd', '88626b8a5', '88626c76b', '8862152e3', '88621524f', '88626b8a3', '886215309', '8862152fb', '88626c0c7', '88626c737', '88626bf51', '886215249', '88626bf35', '88626c759', '886215315', '88626b8af', '8862153c3', '88626c76f', '886215339', '88626c749', '886215323', '886215303', '88626c0c9', '88626c095', '88626acd3', '8862152e7', '88626c715', '88626c719', '88626c0cb', '8862153b7', '886215165', '88626bf45', '8862152e5', '88621532f', '88626b8bd', '88626c0cd', '8862152d3', '88626c73b', '8862153d5', '88621533b', '88626c0b9', '88626c129', '88626c72f', '88621539d', '88621530f', '88626c0bf', '8862152fd', '88626c739', '88626c743', '8862153db', '8862152d7', '88626c0e7', '88626acd5', '88626b8a1', '88626c745', '8862153af', '886215251', '88626c757', '88621524d', '88626bf37', '88626c725', '88626c0b7', '88626c741', '88626b8b1', '8862153e9', '88626c735', '88626c747', '8862152f9', '88626bf49', '88626c12b', '88621515f', '886215167', '8862152e1', '886215317', '88626b8cb', '8862153b9', '88626c76d', '88621531f', '88626c0a5', '88626c127', '88626c72d', '886215163', '88626bf4b', '8862153cf', '88626c727', '88626c0d1', '88626b8b3', '88626c0ad', '88621536d', '88626c0c5', '88626ad2b', '88626c731', '88626c0db', '886215377', '88626bf4f', '88626c0bd', '88626c0a7', '88626c0d5', '88626b8cf', '886215333', '88621530b', '88621531b', '8862152db', '886215235', '88621532d', '88626c0b3', '8862153dd', '886215347', '88621530d', '88626c75d', '8862153cb', '886215349', '886215233', '886215319', '8862153bb', '88626b8b7', '8862153a1', '88626c753', '886215257', '88626c0cf', '886215325', '88626c75b', '88626c765', '88621533f', '886215237', '88626c73d', '88626c0bb', '88626bf5b', '8862153d1', '886215335', '88621534f', '88626c751', '88621532b', '88621533d', '886215329', '88626b8a9', '8862152d5', '88626bf55', '88626c755', '88626c0b1', '88626c729', '88626c0af', '8862153e1', '8862153ab', '8862153e7', '88621539f', '886215327', '88626c723', '8862153c1', '88626c0e3', '88626bf4d', '8862153d7', '886215301', '8862153b5', '8862152ff', '88626c0a3', '88626c74b', '8862153c5', '88626b8ad', '8862153b3', '88626b8b5', '88626c713', '8862153a9', '8862153d9', '88626c75f', '88626c711', '88626bf57', '88626bf59', '88626ad2d', '88626c0d7', '886215371', '88626c0d9', '88626c099', '8862152d1', '886215369', '88621536f', '886215331', '88626c0e9', '88626c767', '8862153ad', '8862153a7']

In [None]:
randocell = s2.Cell(s2.CellId.from_token('8862153d3'))

In [59]:
randocell.exact_area()

4.3541615507706104e-10

In [62]:
from functools import reduce

In [68]:
newunino = reduce(lambda x, y: x + s2.Cell(s2.CellId.from_token(y)).exact_area(), rando16, 0)

In [69]:
newunino

1.0273740971752336e-07

In [65]:
totalarea = 0
for el in newunino.cell_ids():
    print(el.level())

AttributeError: 'list' object has no attribute 'cell_ids'

In [59]:
geonames[0]

{'geonameid': '11945546',
 'name': 'Escola Notra Senyora de Meritxell',
 'asciiname': 'Escola Notra Senyora de Meritxell',
 'alternatenames': 'Escola Notra Senyora de Meritxell',
 'latitude': '42.49727',
 'longitude': '1.4995',
 'feature-class': 'S',
 'feature-code': 'SCH',
 'country-code': 'AD',
 'cc2': '',
 'admin1-code': '7',
 'admin2-code': '',
 'admin3-code': '',
 'admin4-code': '',
 'population': '0',
 'elevation': '',
 'dem': '992',
 'timezone': 'Europe/Andorra',
 'modification-date': '9/8/18',
 'hit14s': ['12a5f4c3',
  '12a5f4dd',
  '12a5f4df',
  '12a5f4e4',
  '12a5f4e9',
  '12a5f4ef',
  '12a5f51d',
  '12a5f51f',
  '12a5f521'],
 'hit15s': ['12a5f4dd4',
  '12a5f4ddc',
  '12a5f4de4',
  '12a5f4dfc',
  '12a5f4e1',
  '12a5f4e24',
  '12a5f4e3c',
  '12a5f4e44',
  '12a5f4e7'],
 'hit16s': ['12a5f4ddf',
  '12a5f4de1',
  '12a5f4de3',
  '12a5f4e05',
  '12a5f4e07',
  '12a5f4e0c',
  '12a5f4e14',
  '12a5f4e1b',
  '12a5f4e6b',
  '12a5f4e6d',
  '12a5f4e6f',
  '12a5f4e74']}

In [None]:
for colel in collegecells:
    for nnn in colel['allnames']:
        if 'Auburn' in nnn:
            print(colel['regex'])

In [25]:
for el in univcells:
    if 'matched' not in el:
        print(el['allnames'])
        print('\n')

In [24]:
for el in univcells:
    if 'matched' in el:
        if len(el['matched']) > 1:
            print(el['allnames'])
            for matchedel in el['matched']:
                print(matchedel['institution'])
            print('\n')

In [55]:
for name in formalnames:
    if 'Berkeley' in name['name'] and 'University of California' in name['name']:
        print(name)

In [59]:
s2.CellId.from_lat_lng(s2.LatLng.from_degrees(37.87215, -122.25975)).parent(16).to_token()

In [None]:
exp1 = linkCoordinateToBound(locdict, formalnames, numexpansions=1)

In [None]:
exp2 = linkCoordinateToBound(locdict, formalnames, numexpansions=2)

In [None]:
#0expansion:
# 74232
# 211192

#1expansion:
# 83678
# 201746

#3expansions
# 99244
# 186180

In [26]:
b = set()

In [39]:
len(locdict)