In [2]:
import os
import glob
import sys
from lat_lon_parser import parse
import geopandas as gpd
import pandas as pd
import exif
import pillow_heif
import numpy as np

from PIL import Image

In [3]:
# define fucntion to get file exif information
def get_exif(filename):
    image = Image.open(filename)
    image.verify()
    return image.getexif().get_ifd(0x8825)

# define fucntion to get location data from exif
def get_geotagging(exif):
    geo_tagging_info = {}
    # If no file image info return location data as (N:0, W:0)
    if not exif:
        return {'GPSLatitudeRef': 'N',
                'GPSLatitude': '(0, 0, 0)',
                'GPSLongitudeRef': 'W',
                'GPSLongitude': '(0, 0, 0)'}
    else:
        gps_keys = ['GPSVersionID', 'GPSLatitudeRef', 'GPSLatitude', 'GPSLongitudeRef', 'GPSLongitude',
                    'GPSAltitudeRef', 'GPSAltitude', 'GPSTimeStamp', 'GPSSatellites', 'GPSStatus', 'GPSMeasureMode',
                    'GPSDOP', 'GPSSpeedRef', 'GPSSpeed', 'GPSTrackRef', 'GPSTrack', 'GPSImgDirectionRef',
                    'GPSImgDirection', 'GPSMapDatum', 'GPSDestLatitudeRef', 'GPSDestLatitude', 'GPSDestLongitudeRef',
                    'GPSDestLongitude', 'GPSDestBearingRef', 'GPSDestBearing', 'GPSDestDistanceRef', 'GPSDestDistance',
                    'GPSProcessingMethod', 'GPSAreaInformation', 'GPSDateStamp', 'GPSDifferential']

        for k, v in exif.items():
            try:
                geo_tagging_info[gps_keys[k]] = str(v)
            except IndexError:
                pass
        return geo_tagging_info


In [5]:
# set file path to image directory (downlaoded from one drive shared file)
path = "/Users/francescapontin/Downloads/ImagesFromLeedsLSOAs/"

# cerate empty list of dfs
dfs= []
master_df =pd.DataFrame()
paths =os.listdir(path)

# For sub-directory in the full directory
for p in paths:
    for root, dirs, files in os.walk(path+p):
        # Get the name of all files in directory
        for name in files:
            # get jpg files
            if name.endswith(".JPG"):
                file_path = os.path.join(root, name)
                image_info = get_exif(file_path)
                results = get_geotagging(image_info)
                df=pd.DataFrame(columns =['IMGname','Latitude','Longitude'])
                df.loc[0]= [name.rstrip(".JPG"),parse(results['GPSLatitude']),(parse(results['GPSLongitude'])*-1) ]
                df["file_path"] = str(file_path)
                master_df =pd.concat([master_df,df])
            # if not jpg get jpeg files
            elif name.endswith(".jpeg"):
                file_path = os.path.join(root, name)
                image_info = get_exif(file_path)
                results = get_geotagging(image_info)
                df=pd.DataFrame(columns =['IMGname','Latitude','Longitude'])
                df.loc[0]= [name.rstrip(".jpeg"),parse(results['GPSLatitude']),(parse(results['GPSLongitude'])*-1) ]
                df["file_path"] = str(file_path)
                master_df =pd.concat([master_df,df])    



In [6]:
# Get LSOA quantile and other info from file path
master_df['LSOA_naming'] =master_df['file_path'].str.split('/', expand=True)[5]
master_df[['lsoa_code', 'IMD_Q','LSOA_name']]= master_df['LSOA_naming'].str.split('_', expand =True)
master_df['LSOA_name'] = master_df['LSOA_name'].str.replace( r"([A-Z])", r" \1").str.strip()

# If no location info reaplce with Nan
master_df.loc[(master_df['Latitude']==0),'Longitude']=np.NaN
master_df.loc[(master_df['Latitude']==0),'Latitude']=np.NaN

# Convert to geodataframe 
master_gdf =gpd.GeoDataFrame(master_df,geometry=gpd.points_from_xy(master_df.Longitude, master_df.Latitude))
# Set CRS
master_gdf =master_gdf.set_crs("EPSG:4326")

  master_df['LSOA_name'] = master_df['LSOA_name'].str.replace( r"([A-Z])", r" \1").str.strip()


In [7]:
# Look at geo-data frame
master_gdf

Unnamed: 0,IMGname,Latitude,Longitude,file_path,LSOA_naming,lsoa_code,IMD_Q,LSOA_name,geometry
0,IMG_6228,53.816281,-1.640156,/Users/francescapontin/Downloads/ImagesFromLee...,E01011329_Q4_CalverleysCambridgeGardensWhiteco...,E01011329,Q4,Calverleys Cambridge Gardens Whitecote Rise,POINT (-1.64016 53.81628)
0,IMG_6229,53.815753,-1.638083,/Users/francescapontin/Downloads/ImagesFromLee...,E01011329_Q4_CalverleysCambridgeGardensWhiteco...,E01011329,Q4,Calverleys Cambridge Gardens Whitecote Rise,POINT (-1.63808 53.81575)
0,IMG_6221,53.815547,-1.634994,/Users/francescapontin/Downloads/ImagesFromLee...,E01011329_Q4_CalverleysCambridgeGardensWhiteco...,E01011329,Q4,Calverleys Cambridge Gardens Whitecote Rise,POINT (-1.63499 53.81555)
0,IMG_6220,53.818333,-1.634194,/Users/francescapontin/Downloads/ImagesFromLee...,E01011329_Q4_CalverleysCambridgeGardensWhiteco...,E01011329,Q4,Calverleys Cambridge Gardens Whitecote Rise,POINT (-1.63419 53.81833)
0,IMG_6222,53.815553,-1.634950,/Users/francescapontin/Downloads/ImagesFromLee...,E01011329_Q4_CalverleysCambridgeGardensWhiteco...,E01011329,Q4,Calverleys Cambridge Gardens Whitecote Rise,POINT (-1.63495 53.81555)
...,...,...,...,...,...,...,...,...,...
0,IMG_5267,53.850331,-1.606897,/Users/francescapontin/Downloads/ImagesFromLee...,E01011383_Q4_RaynelsTheDrive,E01011383,Q4,Raynels The Drive,POINT (-1.60690 53.85033)
0,IMG_5266,53.848183,-1.604142,/Users/francescapontin/Downloads/ImagesFromLee...,E01011383_Q4_RaynelsTheDrive,E01011383,Q4,Raynels The Drive,POINT (-1.60414 53.84818)
0,IMG_5262,53.845886,-1.601128,/Users/francescapontin/Downloads/ImagesFromLee...,E01011383_Q4_RaynelsTheDrive,E01011383,Q4,Raynels The Drive,POINT (-1.60113 53.84589)
0,IMG_5263,53.845497,-1.600775,/Users/francescapontin/Downloads/ImagesFromLee...,E01011383_Q4_RaynelsTheDrive,E01011383,Q4,Raynels The Drive,POINT (-1.60078 53.84550)


In [1]:
# # svae file info to excel doc
# master_gdf.to_excel("Image_location_info.xlsx",sheet_name='Image_locations')  

# AdvertData =pd.read_excel('/Users/francescapontin/Downloads/AdvertData.xlsx')

In [10]:
# list columns in advert data 
AdvertData.columns

Index(['Rinitials', 'Cinitials', 'Codedate', 'LSOA', 'IMGid', 'IMGdate',
       'ADid', 'ASSETid', 'Long', 'Lat', 'ADtype', 'ADsize', 'ADprodtype',
       'ADprod', 'ADbrand', 'Brandad', 'Admanagement', 'Price£',
       'Pricesourcelink', 'Unit', 'Portion', 'Totalwt', 'Wtsource',
       'Nutritionsource', 'Nutsourcelink', 'Fatdensity', 'Satfatdensity',
       'Sugardensity', 'Sodiumdensity', 'Ekcaldensity', 'Fibredensity',
       'Proteindensity', 'FVN', 'NPMscore', 'NPMstatus', 'ADcompliance'],
      dtype='object')

In [14]:
# remove file extension to get an image name
AdvertData['IMGname'] =AdvertData['IMGid'].str.strip('.JPG').str.strip('.jpg')

# Get a count of Image locaiton data from files that are in the master spreadsheet
master_gdf['IMGname'].isin(AdvertData['IMGid'].str.strip('.JPG').str.strip('.jpg')).value_counts()

In [16]:
# remove duplicates
AdvertData['duplicate_image_lsoa'] =AdvertData.duplicated(subset=['IMGname','LSOA'], keep=False)

In [17]:
AdvertData.loc[AdvertData['duplicate_image_lsoa']==True,:]

Unnamed: 0,Rinitials,Cinitials,Codedate,LSOA,IMGid,IMGdate,ADid,ASSETid,Long,Lat,...,Sodiumdensity,Ekcaldensity,Fibredensity,Proteindensity,FVN,NPMscore,NPMstatus,ADcompliance,IMGname,duplicate_image_lsoa
5,IW,IW,2023-05-17,E01011597,IMG_4660.JPG,2023-05-11,CF_GLMR_06_01,CF_GLMR_06,,,...,1033.00,312.70,1.3,17.51,0.0,20.0,1.0,1.0,IMG_4660,True
6,IW,IW,2023-05-17,E01011597,IMG_4660.JPG,2023-05-11,CF_GLMR_06_01,CF_GLMR_06,,,...,44.00,54.00,0.0,4.10,0.0,0.0,0.0,0.0,IMG_4660,True
19,IW,IW,2023-05-17,E01011597,IMG_4651.JPG,2023-05-11,CF_GLMR_14_01,CF_GLMR_14,,,...,4.00,117.00,0.6,2.80,0.0,0.0,0.0,0.0,IMG_4651,True
20,IW,IW,2023-05-17,E01011597,IMG_4651.JPG,2023-05-11,CF_GLMR_14_01,CF_GLMR_14,,,...,320.00,72.00,1.5,10.70,0.0,-3.0,0.0,0.0,IMG_4651,True
21,IW,IW,2023-05-17,E01011597,IMG_4677.JPG,2023-05-11,CF_GLMR_15_01,CF_GLMR_15,,,...,153.85,192.31,0.0,2.69,0.0,2.0,0.0,0.0,IMG_4677,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920,MM,IW,2023-06-20,E01011271,20230516_162732.jpg,2023-05-16,GR_FHHS_05_01,GR_FHHS_05,,,...,,,,,,,,,20230516_162732,True
921,MM,IW,2023-06-20,E01011271,20230516_162732.jpg,2023-05-16,GR_FHHS_05_02,GR_FHHS_05,,,...,,,,,,,,,20230516_162732,True
922,MM,IW,2023-06-20,E01011271,20230516_162732.jpg,2023-05-16,GR_FHHS_05_03,GR_FHHS_05,,,...,,,,,,,,,20230516_162732,True
923,MM,IW,2023-06-20,E01011271,20230516_165621.jpg,2023-05-16,GR_FHHS_06_01,GR_FHHS_06,,,...,,,,,,,,,20230516_165621,True


In [18]:
# Add in validation step once data pre-processing complete
#a.merge(master_gdf, left_on=['IMGname','LSOA'], right_on=['IMGname','lsoa_code'], how='left',validate="1:1")

In [18]:
Add_gdf =gpd.GeoDataFrame(AdvertData.merge(master_gdf, left_on=['IMGname','LSOA'], right_on=['IMGname','lsoa_code'], how='left', indicator=True))

In [19]:
# re-format datetime to avoid explore() potting issues
Add_gdf_plot =Add_gdf
Add_gdf_plot['Codedate'] =Add_gdf['Codedate'].astype(str)
Add_gdf_plot['IMGdate'] =Add_gdf['IMGdate'].astype(str)

In [20]:
# For now only plot images with locaiton info
Add_gdf_plot=Add_gdf_plot.dropna(subset=['geometry'])

In [21]:
Add_gdf_plot.to_excel("Advert_image_location_info.xlsx",sheet_name='Image_locations')  



In [22]:
Add_gdf_plot.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 548 entries, 0 to 875
Data columns (total 47 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Rinitials             548 non-null    object  
 1   Cinitials             548 non-null    object  
 2   Codedate              548 non-null    object  
 3   LSOA                  548 non-null    object  
 4   IMGid                 548 non-null    object  
 5   IMGdate               548 non-null    object  
 6   ADid                  548 non-null    object  
 7   ASSETid               548 non-null    object  
 8   Long                  0 non-null      float64 
 9   Lat                   0 non-null      float64 
 10  ADtype                547 non-null    float64 
 11  ADsize                547 non-null    float64 
 12  ADprodtype            542 non-null    float64 
 13  ADprod                543 non-null    object  
 14  ADbrand               535 non-null    object  
 15

In [23]:
# Also save as geojson
Add_gdf_plot.drop(columns='_merge').to_file("Advert_image_location_info.geojson", driver='GeoJSON')