In [1]:
import os
import pandas as pd
import requests
import random
import glob
from PIL import Image
import time
from io import BytesIO

In [None]:
# Read the Google Maps Platform API key from the environment so the real key
# never has to be saved inside the notebook. The placeholder stays as the
# fallback, so replacing it here by hand still works as before.
key = os.environ.get('GOOGLE_MAPS_API_KEY', 'YOUR_API_KEY')

In [3]:
# Read the full water-consumption table and preview its first rows.
consumption_csv = "../data/CS/Hubballi_Dhaward_Water_Consumption_All.csv"
data = pd.read_csv(consumption_csv)
data.head()

Unnamed: 0,rrno,pzone,freq,permonthunitskl,sl_no,dma,priority_group,property_type,latitude,longitude,...,supply_in_hours,pressure,water_quality,other_connection_needed,willingness_to_pay,water_storage_type,sewage_connection_type,rainwater_harvesting,monthly_family_income,ration_card
0,46p0857,p1p2,5.0,63.8,4,H13D1,P1P1/P1P2,Rented,15.353236,75.14063,...,24,High,Satisfactory,Yes,,Both,Under Ground Drainage,No,,
1,46po836,p1p2,,,6,H13D1,P1P1/P1P2,Rented,15.353527,75.14095,...,24,High,Satisfactory,Yes,,Both,Under Ground Drainage,No,,
2,46po820,p1p2,,,8,H13D1,P1P1/P1P2,Owned,15.354342,75.141685,...,24,High,Satisfactory,Yes,,Both,Under Ground Drainage,No,,
3,46p0855,p1p2,5.0,15.8,12,H13D1,P1P1/P1P2,Rented,15.353131,75.14144,...,5,Medium,Not Satisfactory,Yes,Yes,Both,Under Ground Drainage,No,10-20K,
4,3467,p1p2,,,14,H20D1,P1P1/P1P2,Owned,15.352086,75.14111,...,5,High,Satisfactory,Yes,,No Storage,,,,


In [4]:
# (rows, columns) of the table loaded above
data.shape

(35980, 30)

In [5]:
# Check which images already exist in the folder so they are not requested again
satellite_dir = '../data/raw/Hubballi_Satellite'
street_dir = '../data/raw/Hubballi_Original'


def list_image_ids(directory):
    """Return the file stems of the ``*.jpg`` files directly inside ``directory``.

    Uses the public ``glob.glob`` API instead of the undocumented
    ``glob.glob1`` helper. ``glob.escape`` keeps any wildcard characters in
    the directory name literal. Only the final extension is stripped, so the
    returned ids match the ``<id>.jpg`` naming used when images are saved.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).

    Returns
    -------
    set of str
        One entry per matching file, without the ``.jpg`` extension. Empty
        when the folder does not exist or holds no matching files.
    """
    pattern = os.path.join(glob.escape(directory), '*.jpg')
    return {os.path.splitext(os.path.basename(path))[0] for path in glob.glob(pattern)}


satellite_filenames_exist = list_image_ids(satellite_dir)
street_filenames_exist = list_image_ids(street_dir)

print(f"Existing satellite images: {len(satellite_filenames_exist)}")
print(f"Existing street view images: {len(street_filenames_exist)}")

Existing satellite images: 23986
Existing street view images: 17973


In [6]:
# Build one [sl_no, latitude, longitude] record per household, in that order.
data_list = [
    [household_id, latitude, longitude]
    for household_id, latitude, longitude in zip(
        data['sl_no'], data['latitude'], data['longitude']
    )
]

In [7]:
from PIL import Image
import io

def crop_image(image_content, bottom_px=20):
    """Decode encoded image bytes, trim a strip off the bottom, return an RGB image.

    Parameters
    ----------
    image_content : bytes
        Encoded image data, e.g. the ``content`` of an HTTP response.
    bottom_px : int, optional
        Number of pixel rows removed from the bottom edge. Defaults to 20,
        the value used for both satellite and street view images here
        (presumably the provider's attribution strip — TODO confirm).

    Returns
    -------
    PIL.Image.Image
        The cropped image converted to ``'RGB'`` mode.

    Raises
    ------
    ValueError
        If ``bottom_px`` is negative or not smaller than the image height.
    """
    with Image.open(io.BytesIO(image_content)) as img:
        width, height = img.size
        if not 0 <= bottom_px < height:
            raise ValueError(
                'bottom_px must be in [0, {}), got {}'.format(height, bottom_px)
            )
        # Crop and convert inside the with-block so the pixel data is read
        # before the source image is closed; the returned image is a new object.
        return img.crop((0, 0, width, height - bottom_px)).convert('RGB')

# Download a satellite tile and a street view image for every household.
# Each data_list entry is [sl_no, latitude, longitude]; both endpoints take
# the coordinates in "latitude,longitude" order.

# Request settings do not change between households, so they are set once.
# NOTE(review): the Maps Static API documents zoom levels up to about 21;
# confirm that 30 is intended and not silently clamped.
zoom = 30   # zoom for satellite image
res1 = 640  # satellite image width in pixels
res2 = 660  # satellite image height in pixels (bottom 20 px are cropped after download)
request_timeout = 30  # seconds; without it a stalled connection hangs the cell

# The loop variable must not be called `data`: that name holds the DataFrame
# loaded earlier, and overwriting it breaks re-running the previous cells.
for record in data_list:
    FID, lat, lon = record[0], record[1], record[2]
    if pd.isna(lat) or pd.isna(lon):
        # A request built from missing coordinates cannot return a usable image.
        print('Missing coordinates for {}. Image skipped.'.format(FID))
        continue
    try:
        # download satellite image
        if '{}'.format(FID) in satellite_filenames_exist:
            print('Satellite image {} has been downloaded'.format(FID))
        else:
            sat_url = 'https://maps.googleapis.com/maps/api/staticmap?center={},{}&maptype=satellite&zoom={}&size={}x{}&key={}'.format(lat, lon, zoom, res1, res2, key)
            response = requests.get(sat_url, timeout=request_timeout)
            img = response.content
            # Require a successful HTTP status as well as a non-trivial body,
            # so an error payload is never decoded and saved as an image.
            if response.ok and len(img) > 100:
                cropped_img = crop_image(img)
                cropped_img.save('../data/raw/Hubballi_Satellite/{}.jpg'.format(FID), 'JPEG')
                print('Successfully downloaded and cropped satellite image {}'.format(FID))
            else:
                print('Satellite downloading error {}'.format(FID))

        # download streetview images
        if '{}'.format(FID) in street_filenames_exist:
            print('Street view {} has been downloaded'.format(FID))
        else:
            street_url = 'https://maps.googleapis.com/maps/api/streetview?size=600x420&location={},{}&fov=90&pitch=0&key={}'.format(lat, lon, key)
            # The key must be the last thing in the URL: a trailing '=' would
            # be sent as part of the key value.
            status_url = 'https://maps.googleapis.com/maps/api/streetview/metadata?size=600x400&location={},{}&fov=90&pitch=0&key={}'.format(lat, lon, key)
            # check if streetview image exists, otherwise skip to next image
            status = requests.get(status_url, timeout=request_timeout).json().get('status')
            if status in ('ZERO_RESULTS', 'NOT_FOUND'):
                print(f"No image exists for {FID}. Image skipped.")
                continue
            if status != 'OK':
                # Key, quota or request problems will affect every following
                # household too, so stop through the handler below.
                raise RuntimeError('Street view metadata status for {}: {}'.format(FID, status))
            response = requests.get(street_url, timeout=request_timeout)
            img = response.content
            if response.ok and len(img) > 100:
                cropped_img = crop_image(img)
                cropped_img.save('../data/raw/Hubballi_Original/{}.jpg'.format(FID), 'JPEG')
                print('Successfully downloaded and cropped {}'.format(FID))
            else:
                print('Streetview downloading error {}'.format(FID))

    except Exception as e:
        # Deliberate stop-on-first-error: report the problem and end the run.
        print('Error warning. Execution stopped!')
        print(f'Error details: {str(e)}')
        break

Satellite image 4 has been downloaded
No image exists for 4. Image skipped.
Successfully downloaded and cropped satellite image 6
No image exists for 6. Image skipped.
Successfully downloaded and cropped satellite image 8
Successfully downloaded and cropped 8
Satellite image 12 has been downloaded
No image exists for 12. Image skipped.
Satellite image 14 has been downloaded
No image exists for 14. Image skipped.
Satellite image 27 has been downloaded
No image exists for 27. Image skipped.
Satellite image 28 has been downloaded
No image exists for 28. Image skipped.
Satellite image 33 has been downloaded
No image exists for 33. Image skipped.
Successfully downloaded and cropped satellite image 71
Successfully downloaded and cropped 71
Successfully downloaded and cropped satellite image 72
Successfully downloaded and cropped 72
Successfully downloaded and cropped satellite image 73
Successfully downloaded and cropped 73
Satellite image 75 has been downloaded
No image exists for 75. Image

In [11]:
# Class edges and names for the consump_class column
bins = [0, 8, 15, 25, float('inf')]
labels = ['0-8', '8-15', '15-25', '>25']

# Load the near-road table, keep only buildings with known water consumption,
# then bin permonthunitskl into left-closed intervals: [0, 8), [8, 15), [15, 25), [25, inf).
df = (
    pd.read_csv('../data/CS/Hubballi_Dhaward_Water_Consumption_All_NearRoad.csv')
    .dropna(subset=['permonthunitskl'])
    .assign(
        consump_class=lambda frame: pd.cut(
            frame['permonthunitskl'], bins=bins, labels=labels, right=False
        )
    )
)
df.shape

(24366, 33)

In [12]:
# Flag, per row, whether its satellite and street view image files are on disk
def check_image_availability(row, satellite_dir, street_dir):
    """Add 1/0 image-availability flags to ``row`` and return it.

    The image file name is ``"<sl_no>.jpg"``, built from ``row['sl_no']``.
    ``row['has_satellite_image']`` is set to 1 when that file exists in
    ``satellite_dir`` and 0 otherwise; ``row['has_street_image']`` does the
    same for ``street_dir``. ``row`` is modified in place and also returned.
    """
    image_name = f"{row['sl_no']}.jpg"
    flag_sources = (
        ('has_satellite_image', satellite_dir),
        ('has_street_image', street_dir),
    )
    for flag_column, folder in flag_sources:
        row[flag_column] = int(os.path.exists(os.path.join(folder, image_name)))
    return row

# Define image directories
satellite_dir = '../data/raw/Hubballi_Satellite'
street_dir = '../data/raw/Hubballi_Original'

# Add 1/0 flags telling whether "<sl_no>.jpg" exists in each image folder.
# Only the sl_no column is walked. A row-wise df.apply(axis=1) would rebuild
# every row of the frame, which is much slower and turns the categorical
# consump_class column into plain objects.
image_names = [f"{sl_no}.jpg" for sl_no in df['sl_no']]
df = df.assign(
    has_satellite_image=[
        int(os.path.exists(os.path.join(satellite_dir, name))) for name in image_names
    ],
    has_street_image=[
        int(os.path.exists(os.path.join(street_dir, name))) for name in image_names
    ],
)

# Count the number of rows with satellite images
satellite_count = df['has_satellite_image'].sum()

# Count the number of rows with street view images
street_view_count = df['has_street_image'].sum()

print(f"Number of rows with satellite images: {satellite_count}")
print(f"Number of rows with street view images: {street_view_count}")

Number of rows with satellite images: 24366
Number of rows with street view images: 19089
