In [1]:
# This notebook removes spatially overlapping villages from a state annotation CSV.
# A village is dropped when it lies closer than the distance threshold `r` (km) to
# a village that belongs to a different district; the remaining villages are then
# re-labelled train/val/test according to their district.
# To run on a new state CSV, just update the two paths below.

# Annotation CSV that still contains the overlapping villages.
input_file = "../data/annos_csv/state24_paths_density_labels_13k_Feb22-Overlap.csv"
# Destination for the filtered CSV (only written once the final to_csv call is uncommented).
output_file = "../data/annos_csv/state24_paths_density_labels_13k_Feb26-NoOverlapDistrict.csv"

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
import time
import pickle
import numpy as np
import pandas as pd
import math
from six.moves import xrange
import tensorflow as tf
import gdal
from matplotlib import pyplot as plt

# Load the village annotations; the first CSV column becomes the frame index.
df_annos = pd.read_csv(input_file, index_col=0)
# Overlap threshold in km: two villages closer than this are treated as overlapping.
# NOTE(review): the rationale for 2.25 is not recorded here - confirm before reusing.
r = 2.25

def distance(origin, destination):
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula on a sphere of radius 6371 km.

    Args:
        origin: (latitude, longitude) pair in decimal degrees.
        destination: (latitude, longitude) pair in decimal degrees.

    Returns:
        The distance along the sphere's surface in km, as a float. NaN
        coordinates propagate to a NaN result.
    """
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371  # km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # Explicit comparisons (rather than min/max) leave NaN untouched.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c
    
def get_latlongs():
    """Collect every village's ID and coordinates from ``df_annos``.

    Reads the module-level ``df_annos`` frame, which must provide the
    ``village_id``, ``latitude`` and ``longitude`` columns.

    Returns:
        A list of ``(village_id, (latitude, longitude))`` tuples, one per
        row of ``df_annos`` and in row order.
    """
    # Zipping the three columns avoids building a Series per row (iterrows)
    # and only touches the columns that are actually needed.
    return [
        (village_id, (lat, lon))
        for village_id, lat, lon in zip(
            df_annos['village_id'], df_annos['latitude'], df_annos['longitude']
        )
    ]

# Materialize the (village_id, (lat, lon)) list once; the pairwise overlap scan iterates over it.
elems = get_latlongs()

In [None]:
# Village-to-district lookup table; the code below reads its `village_id` and `district_id` columns.
df_annos_district = pd.read_csv("../data/annos_csv/India_pov_pop_Feb25.csv", index_col=0)
def different_districts(v_id, v_id2):
    """Tell whether two villages belong to different districts.

    Looks both IDs up in the module-level ``df_annos_district`` frame
    (columns ``village_id`` and ``district_id``) and compares the district
    of the first matching row for each.

    Args:
        v_id: village ID of the first village.
        v_id2: village ID of the second village.

    Returns:
        True if both villages are found and their district IDs differ;
        False if the districts match or either village is missing from
        the lookup table.
    """
    try:
        d1 = df_annos_district[df_annos_district['village_id'] == v_id].district_id.iloc[0]
        d2 = df_annos_district[df_annos_district['village_id'] == v_id2].district_id.iloc[0]
    except IndexError:
        # No row for one of the IDs: .iloc[0] on the empty selection fails.
        # Only this case is treated as "not different"; anything else (e.g. a
        # missing column or a keyboard interrupt) is allowed to propagate.
        return False
    return bool(d1 != d2)

In [None]:
# Results of the pairwise scan over all villages.
overlapping_ids = []            # ((v_id, v_id2), (coords, coords2)) for every flagged ordered pair
km_distances = []               # distance of every ordered pair with differing IDs
km_distances_overlapping = []   # distances of the flagged pairs only
ids_to_remove = set()

countt = 0
for countt, (v_id, degrees) in enumerate(elems, 1):
    # Progress indicator: one line per 1000 outer villages.
    if countt % 1000 == 0:
        print(countt)
    for v_id2, degrees2 in elems:
        if v_id == v_id2:
            continue
        distance_km = distance(degrees, degrees2)
        km_distances.append(distance_km)
        # A village is flagged when a village from another district lies
        # closer than the threshold; the district lookup only runs for
        # pairs that are already within range.
        if distance_km < r and different_districts(v_id, v_id2):
            ids_to_remove.add(v_id)
            km_distances_overlapping.append(distance_km)
            overlapping_ids.append(((v_id, v_id2), (degrees, degrees2)))

1000


In [None]:
# Number of distinct villages flagged for removal by the overlap scan.
print(len(ids_to_remove))

In [None]:
# Reload the annotations so this cell always starts from the unfiltered file.
df_annos = pd.read_csv(input_file, index_col=0)
# Non-null count of the first column, used as the row count before filtering.
print(df_annos.count(axis=0).iloc[0])
y = list(ids_to_remove)
# `~` is the boolean-mask negation operator. Take an explicit copy so the
# partition relabelling further down writes to this frame rather than to a
# slice of the reloaded one.
df_annos = df_annos[~df_annos['village_id'].isin(y)].copy()
# Same count after dropping the flagged villages.
print(df_annos.count(axis=0).iloc[0])

In [None]:
# Rows per existing partition (non-null count of the first column), in the
# order train, val, test. `.iloc[0]` selects by position explicitly; plain
# `[0]` on a Series indexed by column names relies on a deprecated fallback.
for partition_name in ('train', 'val', 'test'):
    print(df_annos[df_annos['partition'] == partition_name].count(axis=0).iloc[0])
# Villages per district among the remaining rows, to guide the manual split.
relevant_ids = list(df_annos['village_id'])
df_temp = df_annos_district[df_annos_district['village_id'].isin(relevant_ids)]
print(df_temp.district_id.value_counts())

In [None]:
# TODO (manual step): assign each district to exactly one partition, using the
# per-district village counts printed above to reach the desired split ratios.
# The values are matched against the `district_id` column further down.
train = [410, 409, 413, 411, 420, 419, 412, 421, 418, 415, 416]
val = [423, 422, 424]
test = [417, 414]
# print(1308/(1308+2543+8827))
# print(2543/(1308+2543+8827))
# print(8827/(1308+2543+8827))

# total = (1739+1170+1078+1060+1018+1005+806+625+545+545+521+518+222+217+215+27)
# print((217+1060)/total)
# print((1170+545+518)/total)
# print((1739+1078+1005+806+625+545+215+27+521+1018+222)/total)
# print((217+1060)/total + (1170+545+518)/total + (1739+1078+1005+806+625+545+215+27+521+1018+222)/total)

In [None]:
# Row count of the filtered annotations (non-null count of the first column).
print(df_annos.count(axis=0).iloc[0])

# District rows for the villages that survived the overlap filter.
relevant_ids = list(df_annos['village_id'])
df_relevant = df_annos_district[df_annos_district['village_id'].isin(relevant_ids)]
print(df_relevant.count(axis=0).iloc[0])

# Village IDs per partition, derived from the manual district lists.
test_ids = list(df_relevant[df_relevant['district_id'].isin(test)]['village_id'])
num_test = len(test_ids)

val_ids = list(df_relevant[df_relevant['district_id'].isin(val)]['village_id'])
num_val = len(val_ids)

train_ids = list(df_relevant[df_relevant['district_id'].isin(train)]['village_id'])
num_train = len(train_ids)

# Total assigned villages, then the test / val / train fractions.
num_total = num_train + num_test + num_val
print(num_total)
print(num_test / num_total)
print(num_val / num_total)
print(num_train / num_total)

# Overwrite the partition label of every matching village. `.loc` with a
# boolean mask is the supported way to do this; `DataFrame.set_value` only
# accepts scalar labels and no longer exists in current pandas. Villages whose
# district is in none of the three lists keep their previous label, and the
# test -> val -> train order means the last assignment wins on overlap.
df_annos.loc[df_annos['village_id'].isin(test_ids), "partition"] = "test"
df_annos.loc[df_annos['village_id'].isin(val_ids), "partition"] = "val"
df_annos.loc[df_annos['village_id'].isin(train_ids), "partition"] = "train"



In [None]:
# Rows per partition after relabelling, in the order train, val, test.
# `.iloc[0]` selects the first column's count by position explicitly instead
# of relying on the deprecated positional fallback of `[0]`.
for partition_name in ('train', 'val', 'test'):
    print(df_annos[df_annos['partition'] == partition_name].count(axis=0).iloc[0])

In [None]:
# Final row count (non-null count of the first column), selected by position.
print(df_annos.count(axis=0).iloc[0])

In [None]:
#TODO uncomment to save to CSV 
#df_annos.to_csv(output_file)