# Shapefile Analysis

### Importing

In [1]:
import pandas as pd
import geopandas as gp
import numpy as np
import nafot
import shapely
from shapely.geometry import Point
from shapely.ops import cascaded_union
from matplotlib import pyplot as plt
import matplotlib
import os
matplotlib.style.use('ggplot')
%matplotlib inline

### Load statistical area data and shapefile

In [2]:
# stat_area_df2 = gp.read_file('../data/_lamas/50400_stat_area_2008 - Copy/stat_2008_NEW_04Nov_1335.shp',
#                             encoding='windows-1255')
stat_area_df = nafot.borders.stat_area_wgs_df.copy()

Load extra data about the statistical areas

In [3]:
# Read the population workbook. No `encoding` argument is passed: it is not a
# documented option of `pd.read_excel`, and current pandas rejects it as an
# unexpected keyword argument.
extra_df = pd.read_excel('../data/_lamas/50401_population/Pop_Sex_Age_Religion - edited.xlsx')

# Get only the wanted columns (explicit copy, so the assignment below writes to
# an independent frame and not to a slice of the full sheet)
extra_df = extra_df[['SEMEL_YISHUV', 'STAT08','DistrictCode', 'DistrictHeb',
       'SubDistrictCode', 'SubDistrictHeb', 'MetrCode', 'MetrHeb', 'pop_thou']].copy()

# Convert STAT08 into an integer
extra_df['STAT08'] = extra_df['STAT08'].astype('int64')

In [4]:
stat_area_df.head()

Unnamed: 0,OBJECTID,SEMEL_YISH,STAT08,Shape_Area,Shape_Leng,Shem_Yis_1,Shem_Yishu,YISHUV_STA,geometry
0,1,-2,0,462282300.0,724681.884774,,ùèç ììà ùéôåè,0,(POLYGON ((35.00832616096807 31.12433357493895...
1,2,7,1,6802526.0,11591.466653,SHAHAR,ùçø,70001,"POLYGON ((34.73932928249113 31.62604186681884,..."
2,3,10,1,444515.0,2981.46522,TIROSH,úéøåù,100001,"POLYGON ((34.8878463564387 31.75143638736428, ..."
3,4,11,1,5983378.0,11808.357935,NIR HEN,ðéø çï,110001,"POLYGON ((34.72779348911466 31.60676610854767,..."
4,5,13,1,17804270.0,17954.580825,HAZEVA,çöáä,130001,"POLYGON ((35.28651769274753 30.74930282971737,..."


### Create a distinctive id for a statistical area
(By concatenating SEMEL YISHUV and STAT08)

Create the ID column in the shapefile df (and set it as an index)

In [5]:
# The id is the digits of SEMEL_YISH immediately followed by the digits of STAT08,
# read back as one integer (no zero padding between the two parts).
stat_area_df['stat_id'] = [
    int(str(yishuv_code).strip() + str(area_code).strip())
    for yishuv_code, area_code in zip(stat_area_df['SEMEL_YISH'], stat_area_df['STAT08'])
]
# verify_integrity makes set_index raise if two rows received the same id
stat_area_df.set_index('stat_id', inplace=True, verify_integrity=True)

Create the ID column in the extra data df (and set it as an index)

In [6]:
# Same id scheme as the shapefile frame: SEMEL_YISHUV digits followed by STAT08 digits
extra_df['stat_id'] = [
    int(str(yishuv_code).strip() + str(area_code).strip())
    for yishuv_code, area_code in zip(extra_df['SEMEL_YISHUV'], extra_df['STAT08'])
]
# Index by the new id (raises on duplicates) and drop the two columns it was built from
extra_df.set_index('stat_id', inplace=True, verify_integrity=True)
extra_df.drop(columns=['SEMEL_YISHUV', 'STAT08'], inplace=True)

### Join the data frames

In [7]:
stat_area_df_joined = stat_area_df.join(extra_df)

### Save as a new GeoDataFrame

In [8]:
gdf = gp.GeoDataFrame(stat_area_df_joined, geometry='geometry')

In [9]:
# Remove no jurisdiction area (stat_id -20: SEMEL_YISH -2 followed by STAT08 0).
# Re-binding with errors='ignore' keeps the cell idempotent: running it again after
# the row is already gone no longer raises KeyError.
gdf = gdf.drop(-20, errors='ignore')

## Create Aggregated Polygons

### Create Districts polygons

In [10]:
# Get all the district codes (rows whose DistrictCode is null are skipped)
district_codes = gdf.DistrictCode.dropna().unique()

# Create a dictionary of the district polygons: the union of every statistical
# area carrying that district code.
# unary_union replaces cascaded_union, which Shapely deprecated in 1.8.
district_polygons = {
    code: shapely.ops.unary_union(list(gdf.loc[gdf.DistrictCode == code, 'geometry']))
    for code in district_codes
}

### Create SubDistricts polygons

In [11]:
# Get all the subdistrict codes (rows whose SubDistrictCode is null are skipped)
subdistrict_codes = gdf.SubDistrictCode.dropna().unique()

# Create a dictionary of the subdistrict polygons: the union of every statistical
# area carrying that subdistrict code.
# unary_union replaces cascaded_union, which Shapely deprecated in 1.8.
subdistrict_polygons = {
    code: shapely.ops.unary_union(list(gdf.loc[gdf.SubDistrictCode == code, 'geometry']))
    for code in subdistrict_codes
}

### Create Yeshuvim polygons

In [12]:
# Get all the yeshuvim codes (rows whose SEMEL_YISH is null are skipped)
yeshuvim_codes = gdf.SEMEL_YISH.dropna().unique()

# Create a dictionary of the yeshuvim polygons: the union of every statistical
# area carrying that SEMEL_YISH code.
# unary_union replaces cascaded_union, which Shapely deprecated in 1.8.
yeshuvim_polygons = {
    code: shapely.ops.unary_union(list(gdf.loc[gdf.SEMEL_YISH == code, 'geometry']))
    for code in yeshuvim_codes
}

### Statistical areas without hierarchy

In [13]:
single_stat_codes = gdf[gdf.DistrictCode.isnull()].index.values

## Create GeoDataFrames (For each level)

### Create District GeoDataFrame

In [14]:
# One row per district: index = DistrictCode, single column = the merged polygon
districts_gdf = gp.GeoDataFrame(
    {'geometry': [district_polygons[code] for code in district_codes]},
    index=pd.Index(district_codes, name='DistrictCode'),
    geometry='geometry',
)

### Create SubDistrict GeoDataFrame

In [15]:
# One row per subdistrict: index = SubDistrictCode, single column = the merged polygon
subdistricts_gdf = gp.GeoDataFrame(
    {'geometry': [subdistrict_polygons[code] for code in subdistrict_codes]},
    index=pd.Index(subdistrict_codes, name='SubDistrictCode'),
    geometry='geometry',
)

### Create Yeshuvim GeoDataFrame

In [16]:
# One row per yeshuv: index = SEMEL_YISH, single column = the merged polygon
yeshuvim_gdf = gp.GeoDataFrame(
    {'geometry': [yeshuvim_polygons[code] for code in yeshuvim_codes]},
    index=pd.Index(yeshuvim_codes, name='SEMEL_YISH'),
    geometry='geometry',
)

### Create Statistical Areas GeoDataFrame

In [17]:
stat_areas_gdf = gdf[['geometry']]

## Create Hierarchical lists

In [18]:
# District code -> array of the SubDistrict codes found among its statistical areas
districts_sub = {
    code: gdf.loc[gdf.DistrictCode == code, 'SubDistrictCode'].unique()
    for code in district_codes
}

# SubDistrict code -> array of the Yeshuvim codes found among its statistical areas
subdistrict_yesuvim = {
    code: gdf.loc[gdf.SubDistrictCode == code, 'SEMEL_YISH'].unique()
    for code in subdistrict_codes
}

# Yeshuv code -> array of the ids (index values) of its statistical areas
yeshuvim_stat = {
    code: pd.unique(gdf.index.values[(gdf.SEMEL_YISH == code).values])
    for code in yeshuvim_codes
}

In [19]:
# District code -> array of the ids (index values) of its statistical areas
districts_stat = {
    code: pd.unique(gdf.index.values[(gdf.DistrictCode == code).values])
    for code in district_codes
}
# SubDistrict code -> array of the ids (index values) of its statistical areas
subdistrict_stat = {
    code: pd.unique(gdf.index.values[(gdf.SubDistrictCode == code).values])
    for code in subdistrict_codes
}

# Hierarchical Search
#### Districts -> Subdistricts -> Yeshuvim -> Statistical Areas

### First Version - For Loops

In [20]:
def get_stat_area_v1(longtitude, latitude):
    """Return the id of the statistical area containing the coordinate, or None.

    Narrows the search one level at a time (district, subdistrict, yeshuv,
    statistical area) using the loop-based `*_v1` helpers.
    """
    location = Point(longtitude, latitude)

    district_code = which_district_v1(location)
    subdistrict_code = which_sub_district_v1(location, district_code)
    yeshuv_code = which_yeshuv_v1(location, subdistrict_code)

    return which_stat_v1(location, yeshuv_code)
                  
def which_district_v1(point):
    """Return the code of the first district whose polygon contains `point`, or None."""
    # Each tuple from itertuples() is (index label, geometry): the frame has one column
    return next(
        (code for code, shape in districts_gdf.itertuples() if shape.contains(point)),
        None,
    )

def which_sub_district_v1(point, district=None):
    """Return the code of the first subdistrict containing `point`, or None.

    When `district` is truthy only that district's subdistricts are scanned;
    otherwise every subdistrict is scanned.
    """
    candidates = subdistricts_gdf.loc[districts_sub[district]] if district else subdistricts_gdf

    return next(
        (code for code, shape in candidates.itertuples() if shape.contains(point)),
        None,
    )
        
def which_yeshuv_v1(point, subdistrict=None):
    """Return the code of the first yeshuv containing `point`, or None.

    When `subdistrict` is truthy only that subdistrict's yeshuvim are scanned;
    otherwise every yeshuv is scanned.
    """
    candidates = yeshuvim_gdf.loc[subdistrict_yesuvim[subdistrict]] if subdistrict else yeshuvim_gdf

    return next(
        (code for code, shape in candidates.itertuples() if shape.contains(point)),
        None,
    )
        
def which_stat_v1(point, yeshuv=None):
    """Return the id of the first statistical area containing `point`, or None.

    When `yeshuv` is truthy only that yeshuv's statistical areas are scanned;
    otherwise every statistical area is scanned.
    """
    candidates = stat_areas_gdf.loc[yeshuvim_stat[yeshuv]] if yeshuv else stat_areas_gdf

    return next(
        (stat_id for stat_id, shape in candidates.itertuples() if shape.contains(point)),
        None,
    )

### Second Version - sliced GeoDataframes

In [21]:
def get_stat_area_v2(longtitude, latitude):
    """Return the id of the statistical area containing the coordinate, or None.

    Same level-by-level narrowing as v1, but each level is resolved by the
    `*_v2` helpers, which test a whole (sliced) GeoDataFrame at once.
    """
    location = Point(longtitude, latitude)

    district_code = which_district_v2(location)
    subdistrict_code = which_sub_district_v2(location, district_code)
    yeshuv_code = which_yeshuv_v2(location, subdistrict_code)

    return which_stat_v2(location, yeshuv_code)
                                    
def which_district_v2(point):
    """Return the code of the first district whose polygon contains `point`, or None."""
    hits = districts_gdf.loc[districts_gdf.contains(point)]

    return None if hits.empty else hits.index[0]
    
def which_sub_district_v2(point, district=None):
    """Return the code of the first subdistrict containing `point`, or None.

    A truthy `district` limits the test to that district's subdistricts
    (sliced from `subdistricts_gdf` on every call); otherwise all are tested.
    """
    candidates = subdistricts_gdf.loc[districts_sub[district]] if district else subdistricts_gdf

    hits = candidates.loc[candidates.contains(point)]

    return None if hits.empty else hits.index[0]
        
def which_yeshuv_v2(point, subdistrict=None):
    """Return the code of the first yeshuv containing `point`, or None.

    A truthy `subdistrict` limits the test to that subdistrict's yeshuvim
    (sliced from `yeshuvim_gdf` on every call); otherwise all are tested.
    """
    candidates = yeshuvim_gdf.loc[subdistrict_yesuvim[subdistrict]] if subdistrict else yeshuvim_gdf

    hits = candidates.loc[candidates.contains(point)]

    return None if hits.empty else hits.index[0]
        
def which_stat_v2(point, yeshuv=None):
    """Return the id of the first statistical area containing `point`, or None.

    A truthy `yeshuv` limits the test to that yeshuv's statistical areas
    (sliced from `stat_areas_gdf` on every call); otherwise all are tested.
    """
    candidates = stat_areas_gdf.loc[yeshuvim_stat[yeshuv]] if yeshuv else stat_areas_gdf

    hits = candidates.loc[candidates.contains(point)]

    return None if hits.empty else hits.index[0]

### Third Version - Preprocessed GeoDataframes

Create hierarchical GeoDataframes

In [22]:
# Subdistricts GeoDataframes by districts
districts_sub_gdf = {district : subdistricts_gdf.loc[districts_sub[district]].copy() for district in district_codes}

# Yeshuvim GeoDataframes by subdistricts
subdistrict_yesuvim_gdf = {subdistrict : yeshuvim_gdf.loc[subdistrict_yesuvim[subdistrict]].copy()
                           for subdistrict in subdistrict_codes}

# Statistical areas GeoDataframes by yeshuvim
yeshuvim_stat_gdf = {yeshuv : stat_areas_gdf.loc[yeshuvim_stat[yeshuv]].copy()
                           for yeshuv in yeshuvim_codes}

In [23]:
def get_stat_area_v3(longtitude, latitude):
    """Return the id of the statistical area containing the coordinate, or None.

    Same level-by-level narrowing as v2, but the `*_v3` helpers read the
    per-parent GeoDataFrames that were sliced once ahead of time.
    """
    location = Point(longtitude, latitude)

    district_code = which_district_v3(location)
    subdistrict_code = which_sub_district_v3(location, district_code)
    yeshuv_code = which_yeshuv_v3(location, subdistrict_code)

    return which_stat_v3(location, yeshuv_code)
                                    
def which_district_v3(point):
    """Return the code of the first district whose polygon contains `point`, or None."""
    inside = districts_gdf.contains(point)

    if not inside.any():
        return None
    return districts_gdf.index[inside.values][0]
    
def which_sub_district_v3(point, district=None):
    """Return the code of the first subdistrict containing `point`, or None.

    A truthy `district` selects the pre-sliced frame of that district's
    subdistricts; otherwise every subdistrict is tested.
    """
    candidates = districts_sub_gdf[district] if district else subdistricts_gdf

    inside = candidates.contains(point)

    if not inside.any():
        return None
    return candidates.index[inside.values][0]
        
def which_yeshuv_v3(point, subdistrict=None):
    """Return the code of the first yeshuv containing `point`, or None.

    A truthy `subdistrict` selects the pre-sliced frame of that subdistrict's
    yeshuvim; otherwise every yeshuv is tested.
    """
    candidates = subdistrict_yesuvim_gdf[subdistrict] if subdistrict else yeshuvim_gdf

    inside = candidates.contains(point)

    if not inside.any():
        return None
    return candidates.index[inside.values][0]
        
def which_stat_v3(point, yeshuv=None):
    """Return the id of the first statistical area containing `point`, or None.

    A truthy `yeshuv` selects the pre-sliced frame of that yeshuv's
    statistical areas; otherwise every statistical area is tested.
    """
    candidates = yeshuvim_stat_gdf[yeshuv] if yeshuv else stat_areas_gdf

    inside = candidates.contains(point)

    if not inside.any():
        return None
    return candidates.index[inside.values][0]

### Fourth Version

In [24]:
def get_stat_area_v4(longtitude, latitude):
    """Return the id of the statistical area containing the coordinate, or None.

    Two levels only: find the district with the v1 loop, then scan that
    district's statistical areas directly.
    """
    location = Point(longtitude, latitude)
    district_code = which_district_v1(location)
    return which_stat_v4(location, district_code)

def which_stat_v4(point, district=None):
    """Return the id of the first statistical area containing `point`, or None.

    When `district` is truthy only the statistical areas listed for it in
    `districts_stat` are scanned; otherwise every statistical area is scanned.
    """
    # Get the statistical areas to check
    if district:
        stat_areas = stat_areas_gdf.loc[districts_stat[district]]
    else:
        stat_areas = stat_areas_gdf

    # Go over the stat_areas; each tuple is (index label, geometry).
    # (The stray `global counter` declaration was removed: the name was never
    # assigned or read.)
    for i, poly in stat_areas.itertuples():
        if poly.contains(point):
            return i

### Fifth Version

In [25]:
def get_stat_area_v5(longtitude, latitude):
    """Return the id of the statistical area containing the coordinate, or None.

    Two levels only: find the subdistrict by scanning all subdistricts with
    the v1 loop, then scan that subdistrict's statistical areas.
    """
    location = Point(longtitude, latitude)
    subdistrict_code = which_sub_district_v1(location)
    return which_stat_v5(location, subdistrict_code)

def which_stat_v5(point, subdistrict=None):
    """Return the id of the first statistical area containing `point`, or None.

    When `subdistrict` is truthy only the statistical areas listed for it in
    `subdistrict_stat` are scanned; otherwise every statistical area is scanned.
    """
    candidates = stat_areas_gdf.loc[subdistrict_stat[subdistrict]] if subdistrict else stat_areas_gdf

    return next(
        (stat_id for stat_id, shape in candidates.itertuples() if shape.contains(point)),
        None,
    )

### Sixth Version

In [26]:
def get_stat_area_v6(longtitude, latitude):
    """Return the id of the statistical area containing the coordinate, or None.

    Two levels only: find the yeshuv by scanning all yeshuvim, then scan
    that yeshuv's statistical areas.
    """
    location = Point(longtitude, latitude)
    yeshuv_code = which_yeshuv_v6(location)
    return which_stat_v6(location, yeshuv_code)

def which_yeshuv_v6(point):
    """Return the code of the first yeshuv whose polygon contains `point`, or None."""
    # Each tuple from itertuples() is (index label, geometry): the frame has one column
    return next(
        (code for code, shape in yeshuvim_gdf.itertuples() if shape.contains(point)),
        None,
    )

def which_stat_v6(point, yeshuv=None):
    """Return the id of the first statistical area containing `point`, or None.

    When `yeshuv` is truthy that yeshuv's statistical areas are scanned;
    otherwise only the areas listed in `single_stat_codes` are scanned.
    """
    wanted_ids = yeshuvim_stat[yeshuv] if yeshuv else single_stat_codes
    candidates = stat_areas_gdf.loc[wanted_ids]

    return next(
        (stat_id for stat_id, shape in candidates.itertuples() if shape.contains(point)),
        None,
    )

### Seventh Version

In [27]:
# Parallel (ids, polygons) arrays per level, so the lookups below can walk
# plain arrays instead of going through a GeoDataFrame
subdistrict_ids = subdistricts_gdf.index.copy().values
subdistrict_polys = subdistricts_gdf.geometry.copy().values

yeshuvim_ids = yeshuvim_gdf.index.copy().values
yeshuvim_polys = yeshuvim_gdf.geometry.copy().values

stat_ids = stat_areas_gdf.index.copy().values
stat_polys = stat_areas_gdf.geometry.copy().values

# stat areas without district, in the same order as single_stat_codes
# (label lookup is equivalent to locating each code in stat_ids: the index is unique)
stat_polys_singles = stat_areas_gdf.geometry.loc[single_stat_codes].copy().values

# Yeshuv code -> polygons of its statistical areas, ordered like yeshuvim_stat[code]
yeshuvim_stat_polys = dict(
    (code, stat_areas_gdf.geometry.loc[yeshuvim_stat[code]].copy().values)
    for code in yeshuvim_codes
)

# SubDistrict code -> polygons of its statistical areas, ordered like subdistrict_stat[code]
subdistrict_stat_polys = dict(
    (code, stat_areas_gdf.geometry.loc[subdistrict_stat[code]].copy().values)
    for code in subdistrict_codes
)

In [28]:
def get_stat_area_v7(longtitude, latitude):
    """Return the id of the statistical area containing the coordinate, or None.

    Finds the yeshuv first; when no (truthy) yeshuv code is found the lookup
    stops and None is returned.
    """
    location = Point(longtitude, latitude)

    yeshuv_code = which_yeshuv_v7(location)
    if not yeshuv_code:
        return None

    return which_stat_v7(location, yeshuv_code)

def which_yeshuv_v7(point):
    """Return the code of the first yeshuv whose polygon contains `point`, or None.

    Walks `yeshuvim_polys` and answers with the entry of `yeshuvim_ids` at the
    same position.
    """
    for position, shape in enumerate(yeshuvim_polys):
        if not shape.contains(point):
            continue
        return yeshuvim_ids[position]
    return None
        
def which_stat_v7(point, yeshuv):
    """Return the id of the first statistical area of `yeshuv` containing `point`, or None.

    `yeshuvim_stat_polys[yeshuv]` holds the polygons and `yeshuvim_stat[yeshuv]`
    the matching ids at the same positions. Raises KeyError for a yeshuv code
    that is not a key of those dicts.
    """
    # Polygons of the statistical areas belonging to this yeshuv
    stat_areas = yeshuvim_stat_polys[yeshuv]

    # Go over the stat_areas (the leftover `type(poly)==str` debug print was removed)
    for i, poly in enumerate(stat_areas):
        if poly.contains(point):
            return yeshuvim_stat[yeshuv][i]

### Eighth Version

In [29]:
def get_stat_area_v8(longtitude, latitude):
    """Return the id of the first statistical area containing the coordinate, or None.

    No hierarchy at all: walks every polygon in `stat_polys` and answers with
    the entry of `stat_ids` at the same position.
    """
    location = Point(longtitude, latitude)

    for position, shape in enumerate(stat_polys):
        if not shape.contains(location):
            continue
        return stat_ids[position]
    return None

### Ninth Version

In [30]:
def get_stat_area_v9(longtitude, latitude):
    """Return the id of the statistical area containing the coordinate, or None.

    Two levels over plain arrays: find the subdistrict, then scan the
    statistical areas selected by `which_stat_v9` for that subdistrict.
    """
    location = Point(longtitude, latitude)
    subdistrict_code = which_subdistrict_v9(location)
    return which_stat_v9(location, subdistrict_code)

def which_subdistrict_v9(point):
    """Return the code of the first subdistrict whose polygon contains `point`, or None.

    Walks `subdistrict_polys` and answers with the entry of `subdistrict_ids`
    at the same position.
    """
    for position, shape in enumerate(subdistrict_polys):
        if not shape.contains(point):
            continue
        return subdistrict_ids[position]
    return None
        
def which_stat_v9(point, subdistrict=None):
    """Return the id of the first statistical area containing `point`, or None.

    When `subdistrict` is truthy, that subdistrict's polygons are scanned and
    the id comes from `subdistrict_stat[subdistrict]`. Otherwise the areas
    without a district are scanned and the id comes from `single_stat_codes`.
    """
    # Pick the polygons to check together with the ids at the same positions.
    # The fallback used to look the id up in subdistrict_stat[None], which
    # raised KeyError as soon as a district-less area matched.
    if subdistrict:
        stat_areas = subdistrict_stat_polys[subdistrict]
        stat_area_ids = subdistrict_stat[subdistrict]
    else:
        stat_areas = stat_polys_singles
        stat_area_ids = single_stat_codes

    # Go over the stat_areas
    for i, poly in enumerate(stat_areas):
        if poly.contains(point):
            return stat_area_ids[i]

### Time Testing

In [31]:
loc_data = pd.read_csv('../data/samples/sample_1+2.csv')

In [32]:
lon ,lat = loc_data[['longtitude', 'latitude']].iloc[0]
print(lon, lat)

34.7736943535 32.0803662605


#### One sample

In [33]:
lon, lat = loc_data.sample()[['longtitude', 'latitude']].iloc[0]

In [46]:
%%timeit
get_stat_area_v1(lon, lat)

100 loops, best of 3: 10.4 ms per loop


In [47]:
%%timeit
get_stat_area_v2(lon, lat)

10 loops, best of 3: 21 ms per loop


In [48]:
%%timeit
get_stat_area_v3(lon, lat)

10 loops, best of 3: 19.9 ms per loop


In [49]:
%%timeit
get_stat_area_v4(lon, lat)

100 loops, best of 3: 5.11 ms per loop


In [50]:
%%timeit
get_stat_area_v5(lon, lat)

100 loops, best of 3: 5.72 ms per loop


In [51]:
%%timeit
get_stat_area_v6(lon, lat)

100 loops, best of 3: 5.25 ms per loop


In [52]:
%%timeit
get_stat_area_v7(lon, lat)

100 loops, best of 3: 3.51 ms per loop


In [53]:
%%timeit
get_stat_area_v8(lon, lat)

100 loops, best of 3: 10.5 ms per loop


In [54]:
%%timeit
get_stat_area_v9(lon, lat)

100 loops, best of 3: 3.43 ms per loop


In [55]:
%%timeit
nafot.borders.which_stat_area_wgs(lon, lat)

10 loops, best of 3: 22.3 ms per loop


#### Few samples

In [34]:
data = loc_data.sample(100).copy()

In [57]:
%%timeit
data['stat_area'] = data.apply(lambda row: get_stat_area_v1(row.longtitude, row.latitude),axis=1)

10 loops, best of 3: 106 ms per loop


In [58]:
%%timeit
data['stat_area'] = data.apply(lambda row: get_stat_area_v2(row.longtitude, row.latitude),axis=1)

1 loop, best of 3: 208 ms per loop


In [59]:
%%timeit
data['stat_area'] = data.apply(lambda row: get_stat_area_v3(row.longtitude, row.latitude),axis=1)

1 loop, best of 3: 196 ms per loop


In [60]:
%%timeit
data['stat_area'] = data.apply(lambda row: get_stat_area_v4(row.longtitude, row.latitude),axis=1)

10 loops, best of 3: 70.7 ms per loop


In [61]:
%%timeit
data['stat_area'] = data.apply(lambda row: get_stat_area_v5(row.longtitude, row.latitude),axis=1)

10 loops, best of 3: 62.2 ms per loop


In [62]:
%%timeit
data['stat_area'] = data.apply(lambda row: get_stat_area_v6(row.longtitude, row.latitude),axis=1)

10 loops, best of 3: 79.1 ms per loop


In [38]:
%load_ext Cython

In [None]:
%%cython
# NOTE(review): this cell re-declares the three v7 functions with the same bodies as
# the pure-Python cell above; once it runs, these definitions replace those.
# NOTE(review): a %%cython cell is compiled as its own extension module, so presumably
# Point, yeshuvim_polys, yeshuvim_ids, yeshuvim_stat_polys and yeshuvim_stat from the
# notebook namespace are not visible in here. The cell has no execution count;
# confirm whether it ever compiled/ran before relying on the timing below.

def get_stat_area_v7(longtitude, latitude):
    # Returns the statistical area id for the coordinate, or None when no yeshuv matches
    point = Point(longtitude, latitude)
    
    # Get the yeshuv
    yeshuv = which_yeshuv_v7(point)
    
    if yeshuv:
        # Get the stat area
        stat_area = which_stat_v7(point, yeshuv)
        return stat_area

def which_yeshuv_v7(point):
    # Returns the id of the first yeshuv polygon containing the point (None if none does)
    # Go over the yeshuvim
    for i, poly in enumerate(yeshuvim_polys):
        if poly.contains(point):
            return yeshuvim_ids[i]
        
def which_stat_v7(point, yeshuv):
    # Polygons of the statistical areas belonging to this yeshuv
    stat_areas = yeshuvim_stat_polys[yeshuv]
    
    # Go over the stat_areas
    for i, poly in enumerate(stat_areas):
        # Debug aid: prints the yeshuv code when an entry is a string, not a geometry
        if type(poly)==str:
            print (yeshuv)
        if poly.contains(point):
            return yeshuvim_stat[yeshuv][i]


In [35]:
%%timeit
data['stat_area'] = data.apply(lambda row: get_stat_area_v7(row.longtitude, row.latitude),axis=1)

1 loop, best of 3: 534 ms per loop


In [37]:
% prun -l 4 data['stat_area'] = data.apply(lambda row: get_stat_area_v7(row.longtitude, row.latitude),axis=1)

 

In [64]:
%%timeit
data['stat_area'] = data.apply(lambda row: get_stat_area_v8(row.longtitude, row.latitude),axis=1)

10 loops, best of 3: 111 ms per loop


In [65]:
%%timeit
data['stat_area'] = data.apply(lambda row: get_stat_area_v9(row.longtitude, row.latitude),axis=1)

10 loops, best of 3: 44.2 ms per loop


In [66]:
%%timeit
data['stat_area'] = data.apply(lambda row: nafot.borders.which_stat_area_wgs(row.longtitude, row.latitude),axis=1)

1 loop, best of 3: 223 ms per loop


In [67]:
points = data[['longtitude', 'latitude']].values

In [68]:
%%timeit
areas = np.array([])
global counter
for p in points:
    area = get_stat_area(p[0], p[1])
    areas = np.append(areas, area)

NameError: name 'get_stat_area' is not defined

## Sample

In [None]:
# Get the imsi list
imsi_list = pd.unique(loc_data.imsi)

# Get only "active" users
df = loc_data[['imsi', 'halfhouridx']].groupby('imsi').count()
imsi_active = df[df.halfhouridx>100].copy().index.values

# Get a sample of users
imsi_sm = np.random.choice(imsi_active, size=100, replace=False)

In [None]:
# Get a sample of 100 users (~1.3M records)
loc_data_sample = loc_data[loc_data.imsi.isin(imsi_sm)][['imsi', 'date_stamp', 'halfhouridx', 'longtitude', 'latitude']].copy()

In [None]:
# Add stat area column
loc_data_sample['stat_area'] = loc_data_sample.apply(lambda row: get_stat_area_v7(row.longtitude, row.latitude),axis=1)

In [None]:
# Export to csv
loc_data_sample.to_csv('sample_100_imsi_with_stat.csv', index=False)

In [None]:
loc_data_sample.head()

In [None]:
a = pd.unique(loc_data_sample.stat_area)

In [None]:
ids = gdf.index.values
ids

In [None]:
count

In [None]:
count = 0
lll = []
for i,y in yeshuvim_stat.items():
    if y.size <=1:
        lll.append(i)
        count+=1

In [None]:
gdf.loc[10034]

In [None]:
print(lll)

In [None]:
for i in a:
    if i not in ids:
        print ((i-1)/10)

In [None]:
yeshuvim_stat[3784]

In [None]:
loc_data_sample[loc_data_sample.stat_area == 100341.0]

In [None]:
loc_data_sample[loc_data_sample.stat_area.isnull()].shape

In [None]:
np.save('./222', gdf.index.values)

In [None]:
gdf.index.values

# Tests

In [None]:
loc_data = pd.read_csv('../data/samples/sample_1+2.csv')

In [None]:
sm = loc_data.sample(1000)

In [None]:
sm.head()

In [None]:
# `which_district` exists only as commented-out code further down, so this line raised
# NameError. which_district_v1 takes a shapely Point built as (longtitude, latitude).
sm['DistrictCode'] = sm.apply(lambda row: which_district_v1(Point(row.longtitude, row.latitude)), axis=1)

In [None]:
sm.head()

In [None]:
# shapely Points are (x, y) = (longitude, latitude), the order used by every lookup
# in this notebook. The two values were swapped here: the polygons have x around 35
# and y around 31.
point = Point(34.873502, 31.922043)
point2 = Point(34.87234, 31.92312)
# Third polygon by position, read through the geometry column (no positional [0] on the row)
poly = stat_areas_gdf.geometry.iloc[2]

In [None]:
cascaded_union(gdf.geometry.values)

In [None]:
cascaded_union([poly for poly in district_polygons.values()])

In [None]:
cascaded_union([district_polygons[1], district_polygons[2]])

In [None]:
subdistrict_polygons[44]

In [None]:
district_polygons[2]

In [None]:
districts_gdf.loc[1:1].plot()

# OLD

In [None]:
# def which_district(longtitude, latitude):
# # convert coordinates and create Point object
#     x, y = nafot.wgs_to_itm(longtitude, latitude)
#     point = Point(x, y)
    
#     for i, row in districts_gdf.iterrows():
#         if row.geometry.contains(point):
#             return i

In [None]:
# def get_stat_area_v7(longtitude, latitude):
#     point = Point(longtitude, latitude)
    
#     # Get the yesahuv
#     yeshuv = which_yeshuv_v7(point)
    
#     if yeshuv:
#         # Get the stat area
#         stat_area = which_stat_v7(point, yeshuv)
#         return stat_area


# def which_yeshuv_v7(point):
#     # Go over the yeshuvim
#     for i, poly in enumerate(yeshuvim_polys):
#         if poly.contains(point):
#             return yeshuvim_ids[i]
        
# def which_stat_v7(point, yeshuv):
#     # Get subdistricts to check
#     stat_areas = yeshuvim_stat_polys[yeshuv]
    
#     # Go over the stat_areas
#     for i, poly in enumerate(stat_areas):
#         if type(poly)==str:
#             print (yeshuv)
#         if poly.contains(point):
#             return yeshuvim_stat[yeshuv][i]

# Hierarchical Search - digits after decimal point

Add a representative point for each statistical area (the centroid)

In [None]:
# Compute every centroid once with the vectorised GeoSeries accessor, replacing
# three row-by-row apply passes, then store the point and its coordinates
centroids = gdf.geometry.centroid
gdf['centroid'] = centroids
gdf['centroid_lon'] = centroids.x
gdf['centroid_lat'] = centroids.y

In [None]:
# gdf['mindist'] = gdf.apply(lambda row: min([row.centroid.distance(other)
#                                                   for other in gdf.centroid.drop(row.name)]), axis=1)

In [None]:
# gdf.mindist.quantile(0.001)

In [None]:
# gdf_wgs = nafot.borders.stat_area_wgs_df.copy()
# gdf_wgs['centroid'] = gdf_wgs.apply(lambda row: row.geometry.centroid, axis=1)
# gdf_wgs['centroid_lon'] = gdf_wgs.apply(lambda row: row.centroid.x, axis=1)
# gdf_wgs['centroid_lat'] = gdf_wgs.apply(lambda row: row.centroid.y, axis=1)

In [None]:
# stat_list = []
# prec = 0.0005
# for row in gdf_small.itertuples():
#     if (abs(lon - row.centroid_lon) < prec) and (abs(lat - row.centroid_lat) < prec):
#             stat_list.append(row.Index)
# stat_list       

In [None]:
# stat_list = []
# prec = 3
# for row in gdf_small.itertuples():
#     if ((round(lon, prec) == round(row.centroid_lon, prec)) and (round(lat, prec) - round(row.centroid_lat, prec))):
#             stat_list.append(row.Index)
# stat_list        

In [None]:
# lon, lat = loc_data.sample()[['longtitude', 'latitude']].iloc[0]

In [None]:
# %%timeit
# stat_list = []
# for row in gdf.itertuples():
#     if (abs(round(lon, prec) - round(row.centroid_lon, prec) <= 0.001) and 
#         abs(round(lat, prec) - round(row.centroid_lat, prec)) <= 0.001):
#             stat_list.append(row.Index)
# len(stat_list)

In [None]:
# def get_stat_area_list(prec=3)
#     stat_list = []
#     for row in gdf.itertuples():
#         if (abs(round(lon, prec) - round(row.centroid_lon, prec) <= 0.01) and 
#             abs(round(lat, prec) - round(row.centroid_lat, prec)) <= 0.01):
#                 stat_list.append(row.Index)
#     return stat_list

In [None]:
# def get_stat_area_v7(longtitude, latitude):
#     point = Point(longtitude, latitude)
    
#     # Get the yeshuv
#     yeshuv = which_yeshuv_v7(point)
    
#     if yeshuv:
#         if yeshuvim_stat[yeshuv].size > 1:
#             # Get the stat area
#             stat_area = which_stat_v7(point, yeshuv)
#             return stat_area
#         else:
#             return yeshuv*10 +1

# def which_yeshuv_v7(point):
#     # Go over the yeshuvim
#     for i, poly in enumerate(yeshuvim_polys):
#         if poly.contains(point):
#             return yeshuvim_ids[i]
        
# def which_stat_v7(point, yeshuv):
#     # Get subdistricts to check
#     stat_areas = yeshuvim_stat_polys[yeshuv]
    
#     # Go over the stat_areas
#     for i, poly in enumerate(stat_areas):
#         if type(poly)==str:
#             print (yeshuv)
#         if poly.contains(point):
#             return yeshuvim_stat[yeshuv][i]

In [None]:
# gdf.to_file('./ss')