In [20]:
import pandas as pd
import gzip
from collections import *
from networkx.algorithms import swap
import networkx as nx
from networkx.algorithms import community
import geopandas
from shapely.geometry import Point
from geopandas.tools import sjoin

In [12]:
# Maps user id -> tuple of TSV columns 1..5 (the first two are later read as
# latitude and longitude; the last one is the country code).
uid_to_loc = {}

with open('/shared/0/projects/location-inference/data/user-to-location.inferred.with-names.tsv') as loc_file:
    for n_seen, raw_line in enumerate(loc_file, 1):
        fields = raw_line.split('\t')
        country_code = fields[5]

        # Periodic progress report (emitted before the current row is stored).
        if n_seen % 10000000 == 0:
            print('Saw %d locations; %d in US' % (n_seen, len(uid_to_loc)))

        # Only US users are retained.
        if country_code == 'US':
            user_id = int(fields[0])
            uid_to_loc[user_id] = tuple(fields[1:6])

Saw 10000000 locations; 2277844 in US
Saw 20000000 locations; 4547861 in US
Saw 30000000 locations; 6692686 in US
Saw 40000000 locations; 8635052 in US
Saw 50000000 locations; 10589652 in US
Saw 60000000 locations; 12561177 in US
Saw 70000000 locations; 14506475 in US
Saw 80000000 locations; 16400642 in US
Saw 90000000 locations; 18414424 in US
Saw 100000000 locations; 20795317 in US
Saw 110000000 locations; 23603180 in US


In [None]:
# Build the mention network restricted to users present in uid_to_loc.
#   mention_network: uid -> list of neighbouring uids (both directions recorded)
#   g:               undirected networkx graph with a 'weight' attribute per edge
#   size:            number of input lines kept (both endpoints known)
# NOTE(review): if the same pair of users appears on more than one line, the
# later weight replaces the earlier one in g while size and mention_network
# count the pair again -- confirm the input holds one line per pair.
mention_network = defaultdict(list)
g = nx.Graph()
size = 0
with gzip.open('/shared/0/projects/location-inference/data/mention-network.tsv.gz', 'rt') as f:
    for line_no, line in enumerate(f, 1):
        # Strip only the line terminator; slicing off the last character would
        # corrupt the weight on a final line that has no trailing newline.
        cols = line.rstrip('\n').split('\t')
        uid1 = int(cols[0])

        if line_no % 10000000 == 0:
            print("Saw %d edges; %d between %d users in US" % (line_no, size, len(mention_network)))

        # Skip edges where either endpoint is not a known (US) user.
        if uid1 not in uid_to_loc:
            continue
        uid2 = int(cols[1])
        if uid2 not in uid_to_loc:
            continue

        weight = int(cols[2])
        mention_network[uid1].append(uid2)
        mention_network[uid2].append(uid1)
        # g is undirected, so a single add_edge call covers both directions.
        g.add_edge(uid1, uid2, weight=weight)
        size += 1

Saw 10000000 edges; 1647868 between 1569260 users in US
Saw 20000000 edges; 3291418 between 2760048 users in US
Saw 30000000 edges; 4949515 between 3746987 users in US
Saw 40000000 edges; 6600789 between 4590032 users in US
Saw 50000000 edges; 8241438 between 5331878 users in US
Saw 60000000 edges; 9896997 between 6003730 users in US
Saw 70000000 edges; 11559807 between 6614145 users in US
Saw 80000000 edges; 13210719 between 7173757 users in US
Saw 90000000 edges; 14868118 between 7695452 users in US
Saw 100000000 edges; 16527745 between 8182358 users in US


In [8]:
# Report how many nodes the mention graph holds.
print(g.number_of_nodes())

22464131


In [9]:
# Load the boundary polygons from the tl_2017_us_cbsa shapefile.
# The notebook imports the package as `geopandas` (no `gpd` alias is defined),
# so the full module name is used here to work on a fresh kernel.
msa_gdf = geopandas.GeoDataFrame.from_file("../data/tl_2017_us_cbsa.shp")
# Preview the first two rows
msa_gdf.head(2)

Unnamed: 0,CSAFP,CBSAFP,GEOID,NAME,NAMELSAD,LSAD,MEMI,MTFCC,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,462,40340,40340,"Rochester, MN","Rochester, MN Metro Area",M1,1,G3110,6415412346,75315930,43.9499166,-92.3356986,"POLYGON ((-92.67871699999999 44.195516, -92.67..."
1,450,39580,39580,"Raleigh, NC","Raleigh, NC Metro Area",M1,1,G3110,5485063049,76967367,35.7567464,-78.4604412,"POLYGON ((-78.546414 36.021826, -78.5464059999..."


In [19]:
# Build a per-user table (user, Latitude, Longitude) from uid_to_loc and wrap it
# in a GeoDataFrame whose geometry column holds one shapely Point per user.
user_columns = defaultdict(list)
for uid, loc in uid_to_loc.items():
    try:
        # Parse both coordinates before appending anything, so a malformed
        # location cannot leave the three columns with different lengths.
        latitude = float(loc[0])
        longitude = float(loc[1])
    except ValueError:
        # Report the unparsable location and leave this user out of the table.
        print(loc)
        continue
    user_columns['user'].append(uid)
    user_columns['Latitude'].append(latitude)
    user_columns['Longitude'].append(longitude)

# Naming the columns explicitly keeps the frame well-formed even when no user
# had parsable coordinates.
user_df = pd.DataFrame(user_columns, columns=['user', 'Latitude', 'Longitude'])
# Points are (x, y) = (longitude, latitude).
user_df['Coordinates'] = list(zip(user_df.Longitude, user_df.Latitude))
user_df['Coordinates'] = user_df['Coordinates'].apply(Point)
# NOTE(review): no CRS is assigned to this frame; the spatial join that uses it
# warns about mismatching CRS -- confirm the coordinates share the shapefile's CRS.
# The notebook imports `geopandas` without a `gpd` alias, hence the full name.
user_gdf = geopandas.GeoDataFrame(user_df, geometry='Coordinates')

In [None]:
# Spatially join every user point to the polygon it intersects; the inner join
# drops users that fall inside no polygon.
msa_polygons = msa_gdf[['NAMELSAD', 'GEOID', 'geometry']]
uids_with_msa_code = sjoin(
    user_gdf,
    msa_polygons,
    how="inner",
    op='intersects',
)
uids_with_msa_code.head()

  warn('CRS of frames being joined does not match!')


In [24]:
# Row counts before and after the spatial join, one per line.
print(len(user_df), len(uids_with_msa_code), sep='\n')

24216546
23373998


In [25]:
# Preview the first five joined rows.
uids_with_msa_code.head(n=5)

Unnamed: 0,user,Latitude,Longitude,Coordinates,index_right,NAMELSAD,GEOID
0,421439774,35.220029,-97.491507,POINT (-97.49150663 35.22002857),272,"Oklahoma City, OK Metro Area",36420
11,421439708,35.340693,-97.473145,POINT (-97.47314453 35.34069293),272,"Oklahoma City, OK Metro Area",36420
150,421437993,35.630342,-97.350439,POINT (-97.35043858 35.63034219),272,"Oklahoma City, OK Metro Area",36420
219,2802,35.22102,-97.439638,POINT (-97.43963812 35.22102033),272,"Oklahoma City, OK Metro Area",36420
490,842872824,35.599118,-98.120128,POINT (-98.12012799999999 35.599118),272,"Oklahoma City, OK Metro Area",36420


In [26]:
# Persist the joined user/MSA table as a tab-separated file (index included).
msa_output_path = '../data/us-users-with-msa.tsv'
uids_with_msa_code.to_csv(msa_output_path, sep='\t')

In [34]:
# Map each joined user id to the NAMELSAD value of its matching polygon.
# Columns are selected by name rather than by tuple position, so the mapping
# stays correct if the column order of the joined frame ever changes.
user_to_msa = {}
msa_pairs = zip(uids_with_msa_code['user'], uids_with_msa_code['NAMELSAD'])
for i, (uid, msa_name) in enumerate(msa_pairs, 1):
    user_to_msa[uid] = msa_name
    # Periodic progress report.
    if i % 1000000 == 0:
        print('Loaded %d users' % len(user_to_msa))

Loaded 1000000 users
Loaded 2000000 users
Loaded 3000000 users
Loaded 4000000 users
Loaded 5000000 users
Loaded 6000000 users
Loaded 7000000 users
Loaded 8000000 users
Loaded 9000000 users
Loaded 10000000 users
Loaded 11000000 users
Loaded 12000000 users
Loaded 13000000 users
Loaded 14000000 users
Loaded 15000000 users
Loaded 16000000 users
Loaded 17000000 users
Loaded 18000000 users
Loaded 19000000 users
Loaded 20000000 users
Loaded 21000000 users
Loaded 22000000 users
Loaded 23000000 users


In [35]:
# Number of users that received an MSA label.
print('%d' % len(user_to_msa))

23373998


In [36]:
def get_msa(uid):
    """Return the MSA label stored for ``uid`` in ``user_to_msa``.

    Users without an entry get the literal string ``'None'`` (not the
    ``None`` object).
    """
    return user_to_msa.get(uid, 'None')
# Label every user row with its MSA name ('None' when the user has none).
msa_labels = user_df['user'].apply(get_msa)
user_df['MSA'] = msa_labels

In [37]:
user_df.head()

Unnamed: 0,user,Latitude,Longitude,Coordinates,MSA
0,421439774,35.220029,-97.491507,POINT (-97.49150663 35.22002857),"Oklahoma City, OK Metro Area"
1,421439767,42.483664,-83.024755,POINT (-83.02475475999999 42.48366445),"Detroit-Warren-Dearborn, MI Metro Area"
2,421439782,42.099333,-72.557598,POINT (-72.55759844000001 42.09933324),"Springfield, MA Metro Area"
3,17,37.750265,-122.202902,POINT (-122.20290184 37.75026469),"San Francisco-Oakland-Hayward, CA Metro Area"
4,21,37.735392,-122.506313,POINT (-122.50631332 37.73539224),"San Francisco-Oakland-Hayward, CA Metro Area"


In [None]:
# Map user id (column 0) to block identifier (column 1) from the aggregated
# user data file.
uid_to_block = {}
with gzip.open('/shared/0/projects/ses/agg_user_data.gz', 'rt') as f:
    for line_no, line in enumerate(f, 1):
        # Drop the line terminator before splitting so the block id cannot
        # carry a trailing newline when it is the last column on the line.
        cols = line.rstrip('\n').split('\t')
        uid = int(cols[0])
        block = cols[1]
        uid_to_block[uid] = block
        # Periodic progress report.
        if line_no % 500000 == 0:
            print('loaded %d lines' % line_no)
print(len(uid_to_block))

loaded 500000 lines
loaded 1000000 lines
loaded 1500000 lines
loaded 2000000 lines
loaded 2500000 lines
loaded 3000000 lines
loaded 3500000 lines
loaded 4000000 lines
loaded 4500000 lines
loaded 5000000 lines
loaded 5500000 lines
loaded 6000000 lines
loaded 6500000 lines
loaded 7000000 lines
loaded 7500000 lines


In [40]:
def get_block(uid):
    """Return the block id stored for ``uid`` in ``uid_to_block``.

    Users without an entry get a float NaN.
    """
    return uid_to_block.get(uid, float('nan'))
# Attach each user's block id (NaN when the user has none).
block_ids = user_df['user'].apply(get_block)
user_df['Block_ID'] = block_ids

In [48]:
# Number of users with a block id versus total users, one per line.
print(len(uid_to_block), len(user_df), sep='\n')

15548003
24216546


In [41]:
# Preview the first five user rows with the new columns.
user_df.head(n=5)

Unnamed: 0,user,Latitude,Longitude,Coordinates,MSA,Block_ID
0,421439774,35.220029,-97.491507,POINT (-97.49150663 35.22002857),"Oklahoma City, OK Metro Area",15000US400272015073
1,421439767,42.483664,-83.024755,POINT (-83.02475475999999 42.48366445),"Detroit-Warren-Dearborn, MI Metro Area",15000US260992680002
2,421439782,42.099333,-72.557598,POINT (-72.55759844000001 42.09933324),"Springfield, MA Metro Area",
3,17,37.750265,-122.202902,POINT (-122.20290184 37.75026469),"San Francisco-Oakland-Hayward, CA Metro Area",
4,21,37.735392,-122.506313,POINT (-122.50631332 37.73539224),"San Francisco-Oakland-Hayward, CA Metro Area",


In [49]:
# Write user / MSA / block id rows that have no missing value in any of the
# three columns. Users labelled with the string 'None' as MSA are not missing
# values and are therefore kept.
census_columns = ['user', 'MSA', 'Block_ID']
user_census_loc = user_df[census_columns].dropna()
user_census_loc.to_csv(
    '/shared/1/projects/ses/data/user/user-census-loc.tsv',
    sep='\t',
    index=False,
)