In [6]:
import geopandas as gpd
import pandas as pd
from fiona import crs
import rasterio as rio
from rasterio import features
from rasterio.merge import merge
from rasterio.transform import Affine
from rasterio.crs import CRS
import numpy as np
import os
import random
from shapely.geometry import Point
from osgeo import gdal
# from gdalconst import GA_ReadOnly
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, accuracy_score, balanced_accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
import csv
import datetime
import concurrent.futures
import logging
import warnings
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from filelock import FileLock


random.seed(0)
np.random.seed(0) 
%autosave 10
%matplotlib inline
# -*- coding: utf-8 -*-


##Supressing warnings
import warnings
import pandas as pd
from pyproj import CRS

# Suppress FutureWarning from pyproj
warnings.filterwarnings("ignore", category=FutureWarning, module="pyproj.crs.crs")

# Suppress SettingWithCopyWarning from pandas
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

Autosaving every 10 seconds


### Define some global variables

In [7]:
# dist_img = r'../Processing/green_space_classification\dist_road\dist_%s.tif'%ux
bands=['ndti','ndre','ndvi','ndwi','mndwi','glcm','B2','B3','B4','B8'] #define the band names in the img

# read in all the OSM features that may have vegetation
veges = pd.read_excel(r'osm_vegetation_classes.xlsx')

foi = ['ndti','ndre','ndvi','ndwi','mndwi','glcm']
ind_selected = [bands.index(x) for x in foi]
bands = foi

accuracy_out = r'../Processing/accuracy_classification.csv'
# fail_out = r'D:\Work/Processing/green_space_classification\fail_classification.csv'
# print(ind_selected)

### Define some global functions

In [8]:
def sample_pnts(row):
    # define a function to align samples with raster cell
    geometry = row['geometry']
    bounds = geometry.bounds
    xmin, ymin, xmax, ymax = bounds[0], bounds[1], bounds[2], bounds[3]
    x,y= np.mgrid[xmin:xmax+10:10,ymin:ymax+10:10]
    x,y = np.vstack([x.ravel(), y.ravel()])
    p = pd.DataFrame(list(zip(x,y)))
    p[0]=np.floor((p[0]-row['xmin'])/row['xres'])*row['xres']+row['xmin']+row['xres']/2
    p[1]=np.floor((p[1]-row['ymin'])/row['yres'])*row['yres']+row['ymin']+row['yres']/2
    p['pnt'] = list(set(zip(p[0],p[1])))
    p['pnt']  = p['pnt'].apply(Point)
    p = gpd.GeoDataFrame(p['pnt'],geometry='pnt',crs=crs.from_epsg(27700))
    p = p[p.within(geometry)]
    return p['pnt'].apply(lambda x:[x.x,x.y]).values

def sample_raster(row,img_array):
    # define a function to sample the rasters
    y = int(row['y_n'])
    x = int(row['x_n'])
    if 0 <= y < img_array.shape[1] and 0 <= x < img_array.shape[2]:
        res = img_array[:, y, x]
    else:
        res = np.nan
    if np.isnan(res).any():
        res = np.nan
    return res

# def sample_raster(row,img_array):
#     # define a function to sample the rasters
#     y = int(row['y_n'])
#     x = int(row['x_n'])
#     res = img_array[:,y,x]
#     if np.isnan(res).any():
#         res = np.nan
#     return res

In [None]:
#extract the selected feature polygons
def extract_OSM_polygons(OSM, city):
    shapefile = gpd.read_file(OSM)
    shapefile= shapefile.to_crs({'init': 'epsg:27700'})
    shapefile['geometry'] = shapefile.geometry.buffer(-10)
    shapefile = shapefile[~shapefile.is_empty]
    building = shapefile[~shapefile['building'].isnull()]
    building.loc[:,'area_length']=(building.area/building.length).values
    building.loc[:,'general'] = 'bldg'
    shapefile = shapefile[shapefile['building'].isnull()]
    shapefile['FID'] = list(range(0,len(shapefile.index)))
    one_city = pd.DataFrame()
    for i in veges.index:
        sub = pd.DataFrame()
        # key = veges.loc[i,'Key']
        # value = veges.loc[i,'Value']
        # sub['geometry'] = shapefile.loc[shapefile[key]==value,'geometry']
        # sub['FID'] = shapefile.loc[shapefile[key]==value,'FID']
        sub['geometry'] = shapefile.loc[shapefile['general']=='vegetation','geometry']
        sub['FID'] = shapefile.loc[shapefile['general']=='vegetation','FID']
        sub['key'] = 'general'
        sub['value']='vegetation'
        sub['SALID1'] = OSM.split('\\')[-1].split('.')[0]
        if len(sub.index)>0:
            one_city=pd.concat([one_city,sub])
    one_city['general']='vegetation'

    if len(one_city.index)>0:
        one_city_gdf = gpd.GeoDataFrame(one_city,geometry='geometry', crs=crs.from_epsg(27700))
        one_city_gdf.loc[:,'shape_index']=(one_city_gdf.length/(4*np.sqrt(one_city_gdf.area))).values
        one_city_gdf = one_city_gdf.loc[(one_city_gdf.area<=one_city_gdf.area.quantile(0.975))&
              (one_city_gdf.area>=one_city_gdf.area.quantile(0.025))&
              (one_city_gdf['shape_index']<one_city_gdf['shape_index'].quantile(0.9))]
        
        background=shapefile.loc[~shapefile['FID'].isin(set(one_city['FID'])),['geometry','FID']]
        background['general']='other'
        background.loc[:,'shape_index']=(background.length/(4*np.sqrt(background.area))).values
        background = background.loc[(background.area<=background.area.quantile(0.975))&
          (background.area>=background.area.quantile(0.025))&
          (background['shape_index']<background['shape_index'].quantile(0.9))]
        one_city_gdf = pd.concat([one_city_gdf,building])
        one_city_gdf = pd.concat([one_city_gdf,background])
        one_city_gdf = gpd.GeoDataFrame(one_city_gdf[['general','geometry','shape_index']],geometry='geometry', crs=crs.from_epsg(27700))
    one_city_gdf.to_file(driver = 'ESRI Shapefile', filename= r"../Processing/polygon_%s.shp"%city)
    return one_city_gdf

def grid_search_wrapper(refit_score,clf,param_grid,scorers,X_train,X_test,y_train,y_test,fit_params,city):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10,random_state=0,shuffle=True)
    grid_search = GridSearchCV(clf, param_grid, scoring= ['f1','f1_weighted'], refit=refit_score,
                           cv=skf, return_train_score=False,n_jobs=4,verbose=0)
    grid_search.fit(X_train.values, y_train.values)

    # make the predictions
    y_pred = grid_search.predict(X_test.values)

    return grid_search, {'city':city,'datetime':datetime.datetime.now(),
                         'accuracy_balanced':balanced_accuracy_score(y_test,y_pred),
                        'accuracy':accuracy_score(y_test,y_pred),
                        'precision':precision_score(y_test,y_pred),
                         'recall':recall_score(y_test,y_pred),
                         'f1_score':f1_score(y_test,y_pred)}

def generate_sample(one_city_gdf, img, city):
    logging.info("Starting sample generation...")
    global bands
    # sample points to raster grid
    raster = gdal.Open(img, gdal.GA_ReadOnly)
    geoTransform = raster.GetGeoTransform()
    one_city_gdf['xmin'] = geoTransform[0]
    one_city_gdf['ymin'] = geoTransform[3]
    one_city_gdf['xres'] = geoTransform[1]
    one_city_gdf['yres'] = geoTransform[5]
    one_city_gdf['pnts']=  one_city_gdf.apply(sample_pnts,axis=1)
    # attach sample class
    all_sample = gpd.GeoDataFrame()
    for i in set(one_city_gdf['general']):
        logging.debug(f"Processing class: {i}")
        xys = one_city_gdf.loc[one_city_gdf['general']==i,'pnts'].values
        xys_flat = [item for sublist in xys for item in sublist]
        sample_df = pd.DataFrame(xys_flat)
        sample_df['coordinates'] = list(zip(sample_df[0],sample_df[1]))
        sample_gdf = gpd.GeoDataFrame(sample_df['coordinates'],
                                      geometry=gpd.points_from_xy(sample_df[0],sample_df[1]),crs="epsg:27700")
        sample_gdf['class'] = i
        logging.debug(f"Generated {len(sample_gdf)} points for class {i}.")
        all_sample = pd.concat([all_sample,sample_gdf])


    print("one_city_gdf values")
    print(one_city_gdf['general'].value_counts())
    
    print("all_sample values")
    print(all_sample['class'].value_counts())
    
    # attach sample to img grid x,y
    all_sample['x']=all_sample.geometry.x
    all_sample['y']=all_sample.geometry.y
    all_sample['x_n'] = (all_sample['x'] - geoTransform[0])/geoTransform[1]-0.5
    all_sample['y_n'] = (all_sample['y'] - geoTransform[3])/geoTransform[5]-0.5
    all_sample = all_sample.reset_index()

    print("all_sample values second")
    print(all_sample['class'].value_counts())
    
    # remove overlapped samples
    land_sample = all_sample[all_sample['class']!='bldg'].copy()
    print("land_sample")
    print(len(land_sample))
    print(len(land_sample.drop_duplicates(subset=['x','y'], keep=False, inplace=False)))
    # land_sample.drop_duplicates(subset=['x','y'], keep=False, inplace=True)
    land_sample.drop_duplicates(subset=['x','y'], keep='first', inplace=True)
    clean_sample = pd.concat([land_sample,all_sample[all_sample['class']=='bldg'].copy()])
    clean_sample['bldg_drop']=0
    clean_sample.loc[clean_sample['class']=='bldg','bldg_drop']=1
    clean_sample = clean_sample.sort_values('bldg_drop', ascending=True)
    clean_sample.drop_duplicates(subset=['x','y'], keep='last', inplace=True)
   
    # mask out pixels with nan in any band
    img_array= np.array(raster.ReadAsArray())
    # print(img_array.shape)
    img_array = img_array[ind_selected,:,:]
    x_d = img_array.shape[1]
    y_d = img_array.shape[2]
    n_d =  img_array.shape[0]
    img_array = img_array.reshape(n_d,x_d*y_d)
    img_array[:,np.isnan(img_array).any(axis=0)] = np.nan
    img_array = img_array.reshape(n_d,x_d,y_d)
    # use the samples to sample the img

    print("clean_sample number before ndvi filtering")
    print(clean_sample['class'].value_counts())
    
    clean_sample['sample_value'] = clean_sample.apply(lambda x:sample_raster(x,img_array),axis=1)
    #remove bad sample based on ndvi
    clean_sample = clean_sample.loc[~clean_sample['sample_value'].isnull()]
    clean_sample['mean_ndvi'] = clean_sample['sample_value'].apply(lambda x:x[bands.index('ndvi')])

    # drop any vegetation sample with NDVI less than 0.1
    # 0.1 > 0.02 
    clean_sample.loc[(clean_sample['class']=='vegetation')
                     &(clean_sample['mean_ndvi']<=0.02),'sample_value']=np.nan 
    clean_sample = clean_sample.loc[~clean_sample['sample_value'].isnull()]

    print("clean_sample number after ndvi filtering")
    print(len(clean_sample))
    print(clean_sample.head())
    print(clean_sample['class'].value_counts())
    
    # drop any non-vegetation sample with NDVI greater than median NDVI of vegetated samples
    v_median = clean_sample.loc[(clean_sample['class']=='vegetation'),'mean_ndvi'].median()
    print("v_median")
    print(v_median)
    clean_sample.loc[(clean_sample['class']!='vegetation')
                     &(clean_sample['mean_ndvi']>=v_median),'sample_value']=np.nan
    clean_sample = clean_sample.loc[~clean_sample['sample_value'].isnull()]
    clean_sample = clean_sample.drop(columns=['mean_ndvi','sample_value'],axis=1)

    print("clean_sample number before pca")
    print(len(clean_sample))
    print(clean_sample.head())
    print(clean_sample['class'].value_counts())
    
    # PCA transformation of the img
    # min-max normalization first
    for i in range(0,img_array.shape[0]):
        v = img_array[i,:,:]
        img_array[i,:,:]=(v-np.nanmin(v))/(np.nanmax(v)-np.nanmin(v))
    img_array_pca = np.copy(img_array)
    img_array_pca = img_array_pca.reshape((img_array_pca.shape[0],
                                           img_array_pca.shape[1]*img_array_pca.shape[2])).transpose()
    img_array_pca_valid = img_array_pca[~np.isnan(img_array_pca).any(axis=1)]
    pca = PCA(n_components=img_array_pca_valid.shape[1])
    pca_res = pca.fit(img_array_pca_valid)
    var=np.cumsum(np.round(pca_res.explained_variance_ratio_, decimals=3)*100)
    n_pc = sum(var<=90)+1
    pca = PCA(n_components=n_pc)
    pca_reduce = pca.fit_transform(img_array_pca_valid)
    pca_reduce = np.multiply(pca_reduce,pca_res.explained_variance_ratio_[:n_pc])

    img_reduce = np.copy(img_array[:n_pc,:,:])
    img_reduce_re = img_reduce.reshape((img_reduce.shape[0],img_reduce.shape[1]*img_reduce.shape[2])).transpose()
    img_reduce_re[~np.isnan(img_reduce_re).any(axis=1)] = pca_reduce
    img_reduce_re = img_reduce_re.transpose()
    img_reduce = img_reduce_re.reshape((img_reduce.shape[0],img_reduce.shape[1],img_reduce.shape[2]))

    img_array = np.copy(img_reduce)
    del img_array_pca,img_array_pca_valid,img_reduce_re,img_reduce,pca_reduce
    # determine outliers in the samples
    clean_sample['sample_value'] = clean_sample.apply(lambda x: sample_raster(x,img_array),axis=1)
    PCAs = list(range(0,n_pc))
    for PC in PCAs:
        i  = PC
        clean_sample[PC] = clean_sample['sample_value'].apply(lambda x:x[i])
    clean_sample= clean_sample.drop('sample_value',axis=1)

    print("clean_sample number after pca")
    print(len(clean_sample))
    print(clean_sample['class'].value_counts())
    
    for i in set(clean_sample['class']):
        X = clean_sample.loc[clean_sample['class']==i,PCAs].values
        X = np.array(X.tolist())
        clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
        y_pred = clf.fit_predict(X)
        clean_sample.loc[clean_sample['class']==i,'outlier']=y_pred
        outlier_score = clf.negative_outlier_factor_
        clean_sample.loc[clean_sample['class']==i,'outlier_score'] = (outlier_score-outlier_score.min()) / (outlier_score.max() - outlier_score.min())
    clean_sample[PCAs] = clean_sample[PCAs].astype(np.float32)
    clean_sample = clean_sample.dropna()
    clean_sample = clean_sample.loc[clean_sample['outlier']!=-1]
    
    print("clean_sample number after second pca")
    print(len(clean_sample))
    print(clean_sample['class'].value_counts())

    # random selection of samples
    n_vege = len(clean_sample.loc[clean_sample['class']=='vegetation'])
    n_other = len(clean_sample.loc[clean_sample['class']=='other'])
    n_bldg = len(clean_sample.loc[clean_sample['class']=='bldg'])
    n_sample = int(0.2*min(n_vege,n_other+n_bldg))
    print("n_sample: ",n_sample)
    print("n_vege: ",n_vege)
    print("n_other: ",n_other)
    print("n_bldg: ",n_bldg)
    
    if n_sample>=2500:
        n_sample = 2500
    if n_sample<200:
        n_sample = int(1*min(n_vege,n_other+n_bldg))
    sub_clean_sample = clean_sample.loc[clean_sample['class']=='vegetation'].sample(n=n_sample,random_state=0)
    if n_other>(n_sample/2) and n_bldg>(n_sample/2):
        sub_clean_sample = pd.concat([sub_clean_sample,clean_sample.loc[clean_sample['class']=='bldg'].sample(n=int(n_sample/2),random_state=0)])
        sub_clean_sample = pd.concat([sub_clean_sample,clean_sample.loc[clean_sample['class']=='other'].sample(n=int(n_sample/2),random_state=0)])
    else:
        if n_other > (n_sample / 2) and n_bldg < (n_sample / 2):
            sub_clean_sample = pd.concat([sub_clean_sample, clean_sample.loc[clean_sample['class'] == 'bldg'].sample(n=int(n_bldg), random_state=0)])
            sub_clean_sample = pd.concat([sub_clean_sample, clean_sample.loc[clean_sample['class'] == 'other'].sample(n=int(n_sample - n_bldg), random_state=0)])
        else:
            sub_clean_sample = pd.concat([sub_clean_sample, clean_sample.loc[clean_sample['class'] == 'bldg'].sample(n=int(n_sample - n_other), random_state=0)])
            sub_clean_sample = pd.concat([sub_clean_sample, clean_sample.loc[clean_sample['class'] == 'other'].sample(n=int(n_other), random_state=0)])

    sub_clean_sample.loc[sub_clean_sample['class']=='bldg','class']='other'
    ext=sub_clean_sample[['class','geometry']]
    ext = ext.to_crs({'init': 'epsg:27700'})
    ext.to_file(r"../Processing/all_sample_%s.shp"% city)
    # encouraging garbage collection
    raster = None
    return sub_clean_sample, img_array,PCAs

def split_sample(sub_clean_sample,PCAs,city):
    # training testing split
    ft = sub_clean_sample[PCAs]
    ft['outlier_score'] = sub_clean_sample['outlier_score']

    sub_clean_sample['class'] = sub_clean_sample['class'].apply(lambda x:1 if x=='vegetation' else 0)
    targets = sub_clean_sample['class']
    X_train, X_test, y_train, y_test = train_test_split(ft, targets, stratify=targets,random_state=0)
    X_test_weight = X_test['outlier_score']
    X_test = X_test[PCAs]
    X_train_weight = X_train['outlier_score']
    X_train = X_train[PCAs]
    train_exp = X_train.merge(sub_clean_sample,how='inner')
    test_exp = X_test.merge(sub_clean_sample,how='inner')
    gpd.GeoDataFrame(train_exp[['class','geometry']],geometry='geometry').to_file(driver = 'ESRI Shapefile',
                                filename= r"../Processing/train_sample_%s.shp"%city)
    gpd.GeoDataFrame(test_exp[['class','geometry']],geometry='geometry').to_file(driver = 'ESRI Shapefile',
                                filename= r"../Processing/test_sample_%s.shp"%(city)
    return X_train,X_test,y_train,y_test,X_train_weight

import concurrent.futures
import logging
import warnings
from filelock import FileLock

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

def process_city(city, tif_names):
    try:
        logging.info(f'Starting processing for {city}')
        OSM = r'../Code/%s_OSM_combined.shp' % city
        img = r'../Sample_image/LA_%s_2023.tif' % tif_names[city]
        
        if not os.path.exists(OSM):
            logging.error(f'OSM file does not exist: {OSM}')
            return
        if not os.path.exists(img):
            logging.error(f'Image file does not exist: {img}')
            return
        
        one_city_gdf = extract_OSM_polygons(OSM, city) # get OSM polygons
        print("one_city_gdf after extract")
        print(len(one_city_gdf))
        sub_clean_sample, img_array, PCAs = generate_sample(one_city_gdf, img, city) # generate random samples
        print("sub_clean_sample")
        print(len(sub_clean_sample))
        print(sub_clean_sample.head())
    
        X_train, X_test, y_train, y_test, X_train_weight = split_sample(sub_clean_sample, PCAs,city) # sample values, train test split
        scorers = {
            'precision_score': make_scorer(precision_score),
            'recall_score': make_scorer(recall_score),
            'accuracy_score': make_scorer(accuracy_score),
            'f1_score': make_scorer(f1_score)
        }
        fit_params = {'sample_weight': X_train_weight}
        clf = SVC()
        param_grid = {'C': [2 ** x for x in np.arange(-3, 13, dtype=float)],
                      'gamma': [2 ** x for x in np.arange(-3, 13, dtype=float)],
                      'random_state': [0],
                      'class_weight': ['balanced']}
        grid_search_clf, test_scores = grid_search_wrapper('f1_weighted', clf, param_grid, scorers, X_train, X_test, y_train, y_test, fit_params,city)
        
        # save testing accuracy
        logging.info(f'Test scores for {city}: {test_scores}')
    
        lock = FileLock(f"{accuracy_out}.lock")
        with lock:
            with open(accuracy_out, 'a') as csv_file:
                writer = csv.writer(csv_file, delimiter=',', lineterminator='\n')
                writer.writerow([f"{city}"] + list(zip(test_scores.keys(), test_scores.values())))
        
        csv_file.close()
        
        # save original img
        img_array2 = np.copy(img_array)
        img_re = img_array2.reshape((img_array2.shape[0], img_array2.shape[1] * img_array2.shape[2])).transpose()
        img_pre = np.copy(img_re[~np.isnan(img_re).any(axis=1)])
        img_pre = grid_search_clf.predict(img_pre)
        img_pre = img_pre.astype(np.int16)
        res = img_re[:, 0]
        res[~np.isnan(res)] = img_pre
        res[np.isnan(res)] = -32768
        res = res.reshape(img_array[0, :, :].shape)
        res = res.astype(np.int16)

        org_img = gdal.Open(img, gdal.GA_ReadOnly)
        meta = {
            'driver': 'GTiff',
            'dtype': 'int16',
            'nodata': -32768,
            'width': res.shape[1],
            'height': res.shape[0],
            'count': 1,
            # due to package update
            # 'crs': CRS.from_dict(init='epsg:27700'),
            'crs': CRS("EPSG:27700"),
            'transform': Affine(10, 0.0, org_img.GetGeoTransform()[0], 0, -10, org_img.GetGeoTransform()[-3]),
            'compress': 'lzw',
            'interleave': 'pixel'
        }

        # Save to a specific file 
        result_path = r'../Results/{}_{}.tif'.format(city)
        with rio.open(result_path, 'w', **meta) as dst:
            dst.write(res, 1)

        logging.info(f'Processed and saved results for {city}')

        #encouraging garbage collection
        del img_array
        org_img = None
        
    except Exception as e:
        logging.error(f'Error processing {city}: {e}', exc_info=True)

def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    tif_names = {"Greater_Manchester": 'Manchester', "Greater_London": 'London', "West_Midlands": 'Westmidlands'}
    # cities = ["London"]
    cities = ["Greater_Manchester", "West_Midlands", "Greater_London"]
    cities = ["Greater_London"]
    
    max_workers = 3  # Adjust this number based on your system's CPU cores and memory

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_city = {
            executor.submit(process_city, city, tif_names) for city in cities
        }
        for future in concurrent.futures.as_completed(future_to_city):
            city = future_to_city[future]
            try:
                future.result()  # Raises exception if any occurred during processing
                logging.info(f'Completed processing for {city}')
                # gc.collect()
            except Exception as e:
                logging.error(f'Error processing {city}: {e}', exc_info=True)

if __name__ == '__main__':
    main()


print('All jobs done')

## GI classification to 5 levels

#### Amendments Made

- application of multiclass
- various classifier
- score comparison table change

In [5]:
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

bands=['ndti','ndre','ndvi','ndwi','mndwi','glcm','B2','B3','B4','B8']
veges = pd.read_excel(r'osm_vegetation_classes.xlsx')
foi = ['ndti','ndre','ndvi','ndwi','mndwi','glcm']
ind_selected = [bands.index(x) for x in foi]
bands = foi
classification_comparison = r'../Processing/classification_comparison.csv'


def sample_pnts(row):
    geometry = row['geometry']
    bounds = geometry.bounds
    xmin, ymin, xmax, ymax = bounds[0], bounds[1], bounds[2], bounds[3]
    x, y = np.mgrid[xmin:xmax+10:10, ymin:ymax+10:10]
    x, y = np.vstack([x.ravel(), y.ravel()])
    p = pd.DataFrame(list(zip(x, y)))
    p[0] = np.floor((p[0] - row['xmin']) / row['xres']) * row['xres'] + row['xmin'] + row['xres'] / 2
    p[1] = np.floor((p[1] - row['ymin']) / row['yres']) * row['yres'] + row['ymin'] + row['yres'] / 2
    p['pnt'] = list(set(zip(p[0], p[1])))
    p['pnt'] = p['pnt'].apply(Point)
    p = gpd.GeoDataFrame(p['pnt'], geometry='pnt', crs=CRS.from_epsg(27700))
    p = p[p.within(geometry)]
    return p['pnt'].apply(lambda x: [x.x, x.y]).values


def sample_raster(row, img_array):
    y = int(row['y_n'])
    x = int(row['x_n'])
    if 0 <= y < img_array.shape[1] and 0 <= x < img_array.shape[2]:
        res = img_array[:, y, x]
    else:
        res = np.nan
    if np.isnan(res).any():
        res = np.nan
    return res


def extract_OSM_polygons(OSM, city):
    shapefile = gpd.read_file(OSM)
    shapefile = shapefile.to_crs({'init': 'epsg:27700'})
    shapefile['geometry'] = shapefile.geometry.buffer(-10)
    shapefile = shapefile[~shapefile.is_empty]
    building = shapefile[~shapefile['building'].isnull()]
    building.loc[:, 'area_length'] = (building.area / building.length).values
    # building.loc[:, 'general'] = 'bldg'
    building.loc[:, 'general'] = 5
    shapefile = shapefile[shapefile['building'].isnull()]
    shapefile['FID'] = list(range(0, len(shapefile.index)))
    one_city = pd.DataFrame()
    # for i in veges.index:
    sub = pd.DataFrame()
    sub['geometry'] = shapefile.loc[shapefile['general'] == 'vegetation', 'geometry']
    sub['FID'] = shapefile.loc[shapefile['general'] == 'vegetation', 'FID']
    sub['key'] = 'general'
    sub['value'] = 'vegetation'
    sub['SALID1'] = OSM.split('\\')[-1].split('.')[0]
    sub['tag'] = shapefile.loc[shapefile['general'] == 'vegetation', 'tag']
    if len(sub.index) > 0:
        one_city = pd.concat([one_city, sub])
    one_city['general'] = 'vegetation'
        
    one_city['general'] = one_city.apply(
        lambda x: x['tag'] if x['general'] == 'vegetation' else 5, axis=1)

    if len(one_city.index) > 0:
        one_city_gdf = gpd.GeoDataFrame(one_city, geometry='geometry', crs=CRS.from_epsg(27700))
        one_city_gdf.loc[:, 'shape_index'] = (one_city_gdf.length / (4 * np.sqrt(one_city_gdf.area))).values
        one_city_gdf = one_city_gdf.loc[
            (one_city_gdf.area <= one_city_gdf.area.quantile(0.975)) &
            (one_city_gdf.area >= one_city_gdf.area.quantile(0.025)) &
            (one_city_gdf['shape_index'] < one_city_gdf['shape_index'].quantile(0.9))
        ]

        background = shapefile.loc[~shapefile['FID'].isin(set(one_city['FID'])), ['geometry', 'FID']]
        background['general'] = 5
        background.loc[:, 'shape_index'] = (background.length / (4 * np.sqrt(background.area))).values
        background = background.loc[
            (background.area <= background.area.quantile(0.975)) &
            (background.area >= background.area.quantile(0.025)) &
            (background['shape_index'] < background['shape_index'].quantile(0.9))
        ]
        one_city_gdf = pd.concat([one_city_gdf, building])
        one_city_gdf = pd.concat([one_city_gdf, background])
        one_city_gdf = gpd.GeoDataFrame(one_city_gdf[['general', 'geometry', 'shape_index']], geometry='geometry',
                                        crs=CRS.from_epsg(27700))
    
    one_city_gdf.to_file(driver='ESRI Shapefile', filename=r"../Processing/polygon_%s.shp" % city)
    return one_city_gdf

#To debug XGB classifier's strict integer rule
custom_mapping = {0.0: 0, 1.0: 1, 2.0: 2, 4.0: 3}

def preprocess_labels(y, mapping):
    y_mapped = y.map(mapping)
    return y_mapped.astype(int)

def grid_search_wrapper(refit_score, clf, param_grid, scorers, X_train, X_test, y_train, y_test, fit_params, city):
    skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score, cv=skf, return_train_score=False,
                               n_jobs=4, verbose=0)

    #To debug XGB classifier's strict integer rule
    if isinstance(clf, XGBClassifier):
        y_train = preprocess_labels(y_train, custom_mapping)
        y_test = preprocess_labels(y_test, custom_mapping)
    
    grid_search.fit(X_train.values, y_train.values)

    y_pred = grid_search.predict(X_test.values)

    return grid_search, {'city': city, 'datetime': datetime.datetime.now(),
                         'accuracy_balanced': balanced_accuracy_score(y_test, y_pred),
                         'accuracy': accuracy_score(y_test, y_pred),
                         'precision_macro': precision_score(y_test, y_pred, average='macro'),
                         'recall_macro': recall_score(y_test, y_pred, average='macro'),
                         'f1_score_macro': f1_score(y_test, y_pred, average='macro'),
                         'precision_weighted': precision_score(y_test, y_pred, average='weighted'),
                         'recall_weighted': recall_score(y_test, y_pred, average='weighted'),
                         'f1_score_weighted': f1_score(y_test, y_pred, average='weighted')}


def generate_sample(one_city_gdf, img, city):
    logging.info("Starting sample generation...")
    global bands
    raster = gdal.Open(img, gdal.GA_ReadOnly)
    geoTransform = raster.GetGeoTransform()
    one_city_gdf['xmin'] = geoTransform[0]
    one_city_gdf['ymin'] = geoTransform[3]
    one_city_gdf['xres'] = geoTransform[1]
    one_city_gdf['yres'] = geoTransform[5]
    one_city_gdf['pnts'] = one_city_gdf.apply(sample_pnts, axis=1)
    all_sample = gpd.GeoDataFrame()
    for i in set(one_city_gdf['general']):
        logging.debug(f"Processing class: {i}")
        xys = one_city_gdf.loc[one_city_gdf['general'] == i, 'pnts'].values
        xys_flat = [item for sublist in xys for item in sublist]
        sample_df = pd.DataFrame(xys_flat)
        sample_df['coordinates'] = list(zip(sample_df[0], sample_df[1]))
        sample_gdf = gpd.GeoDataFrame(sample_df['coordinates'],
                                      geometry=gpd.points_from_xy(sample_df[0], sample_df[1]), crs="epsg:27700")
        sample_gdf['class'] = i
        logging.debug(f"Generated {len(sample_gdf)} points for class {i}.")
        all_sample = pd.concat([all_sample, sample_gdf])

    print("all_sample values before filtering")
    print(all_sample['class'].value_counts())

    value_counts = all_sample['class'].value_counts()    
    classes_to_exclude = value_counts[value_counts < 500].index
    all_sample = all_sample[~all_sample['class'].isin(classes_to_exclude)]

    print("all_sample values after filtering")
    print(all_sample['class'].value_counts())
    
    all_sample['x'] = all_sample.geometry.x
    all_sample['y'] = all_sample.geometry.y
    all_sample['x_n'] = (all_sample['x'] - geoTransform[0]) / geoTransform[1] - 0.5
    all_sample['y_n'] = (all_sample['y'] - geoTransform[3]) / geoTransform[5] - 0.5
    all_sample = all_sample.reset_index()

    print("all_sample values second")
    print(all_sample['class'].value_counts())

    land_sample = all_sample[all_sample['class'] != 5].copy()
    print("land_sample")
    print(len(land_sample))
    print(len(land_sample.drop_duplicates(subset=['x', 'y'], keep=False, inplace=False)))
    land_sample.drop_duplicates(subset=['x', 'y'], keep='first', inplace=True)
    clean_sample = pd.concat([land_sample, all_sample[all_sample['class'] != 5].copy()])
    clean_sample['bldg_drop'] = 0
    clean_sample.loc[clean_sample['class'] == 5, 'bldg_drop'] = 1
    clean_sample = clean_sample.sort_values('bldg_drop', ascending=True)
    # including green roof or other urban green infra - may need to fix
    clean_sample.drop_duplicates(subset=['x', 'y'], keep='first', inplace=True)

    img_array = np.array(raster.ReadAsArray())
    img_array = img_array[ind_selected, :, :]
    x_d = img_array.shape[1]
    y_d = img_array.shape[2]
    n_d = img_array.shape[0]
    img_array = img_array.reshape(n_d, x_d * y_d)
    img_array[:, np.isnan(img_array).any(axis=0)] = np.nan
    img_array = img_array.reshape(n_d, x_d, y_d)

    print("clean_sample number before ndvi filtering")
    print(clean_sample['class'].value_counts())

    clean_sample['sample_value'] = clean_sample.apply(lambda x: sample_raster(x, img_array), axis=1)
    clean_sample = clean_sample.loc[~clean_sample['sample_value'].isnull()]
    clean_sample['mean_ndvi'] = clean_sample['sample_value'].apply(lambda x: x[bands.index('ndvi')])

    clean_sample.loc[(clean_sample['class'] != 5)
                     & (clean_sample['mean_ndvi'] <= 0.05), 'sample_value'] = np.nan
    clean_sample = clean_sample.loc[~clean_sample['sample_value'].isnull()]

    print("clean_sample number after ndvi filtering")
    print(len(clean_sample))
    print(clean_sample.head())
    print(clean_sample['class'].value_counts())

    v_median = clean_sample.loc[(clean_sample['class'] != 5), 'mean_ndvi'].median()
    print("v_median")
    print(v_median)
    clean_sample.loc[(clean_sample['class'] == 5)
                     & (clean_sample['mean_ndvi'] >= v_median), 'sample_value'] = np.nan
    clean_sample = clean_sample.loc[~clean_sample['sample_value'].isnull()]
    clean_sample = clean_sample.drop(columns=['mean_ndvi', 'sample_value'], axis=1)

    print("clean_sample number before pca")
    print(len(clean_sample))
    print(clean_sample.head())
    print(clean_sample['class'].value_counts())

    for i in range(0, img_array.shape[0]):
        v = img_array[i, :, :]
        img_array[i, :, :] = (v - np.nanmin(v)) / (np.nanmax(v) - np.nanmin(v))
    img_array_pca = np.copy(img_array)
    img_array_pca = img_array_pca.reshape((img_array_pca.shape[0],
                                           img_array_pca.shape[1] * img_array_pca.shape[2])).transpose()
    img_array_pca_valid = img_array_pca[~np.isnan(img_array_pca).any(axis=1)]
    pca = PCA(n_components=img_array_pca_valid.shape[1])
    pca_res = pca.fit(img_array_pca_valid)
    var = np.cumsum(np.round(pca_res.explained_variance_ratio_, decimals=3) * 100)
    n_pc = sum(var <= 90) + 1
    pca = PCA(n_components=n_pc)
    pca_reduce = pca.fit_transform(img_array_pca_valid)
    pca_reduce = np.multiply(pca_reduce, pca_res.explained_variance_ratio_[:n_pc])

    img_reduce = np.copy(img_array[:n_pc, :, :])
    img_reduce_re = img_reduce.reshape((img_reduce.shape[0], img_reduce.shape[1] * img_reduce.shape[2])).transpose()
    img_reduce_re[~np.isnan(img_reduce_re).any(axis=1)] = pca_reduce
    img_reduce_re = img_reduce_re.transpose()
    img_reduce = img_reduce_re.reshape((img_reduce.shape[0], img_reduce.shape[1], img_reduce.shape[2]))

    img_array = np.copy(img_reduce)
    del img_array_pca, img_array_pca_valid, img_reduce_re, img_reduce, pca_reduce

    clean_sample['sample_value'] = clean_sample.apply(lambda x: sample_raster(x, img_array), axis=1)
    PCAs = list(range(0, n_pc))
    for PC in PCAs:
        i = PC
        clean_sample[PC] = clean_sample['sample_value'].apply(lambda x: x[i])
    clean_sample = clean_sample.drop('sample_value', axis=1)

    print("clean_sample number after pca")
    print(len(clean_sample))
    print(clean_sample['class'].value_counts())

    for i in set(clean_sample['class']):
        X = clean_sample.loc[clean_sample['class'] == i, PCAs].values
        X = np.array(X.tolist())
        clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
        y_pred = clf.fit_predict(X)
        clean_sample.loc[clean_sample['class'] == i, 'outlier'] = y_pred
        outlier_score = clf.negative_outlier_factor_
        clean_sample.loc[clean_sample['class'] == i, 'outlier_score'] = (outlier_score - outlier_score.min()) / (
                    outlier_score.max() - outlier_score.min())
    clean_sample[PCAs] = clean_sample[PCAs].astype(np.float32)
    clean_sample = clean_sample.dropna()
    clean_sample = clean_sample.loc[clean_sample['outlier'] != -1]

    print("clean_sample number after second pca")
    print(len(clean_sample))
    print(clean_sample['class'].value_counts())
    
    # Count the number of samples for each class
    class_counts = clean_sample['class'].value_counts()
    print("Class counts:", class_counts)
    
    # Determine the sample size
    n_sample = int(min(class_counts))
    if n_sample >= 2500:
        n_sample = 2500
    
    sampled_dfs = []
    for class_label in range(6):
        if class_label in class_counts:
            sampled_dfs.append(clean_sample[clean_sample['class'] == class_label].sample(n=min(n_sample, class_counts[class_label]), random_state=0))
    
    sub_clean_sample = pd.concat(sampled_dfs)
    
    ext = sub_clean_sample[['class', 'geometry']]
    ext = ext.to_crs({'init': 'epsg:27700'})
    ext.to_file(r"../Processing/all_sample_%s.shp" % city)
    raster = None
    return sub_clean_sample, img_array, PCAs


def split_sample(sub_clean_sample, PCAs, city):
    ft = sub_clean_sample[PCAs]
    ft['outlier_score'] = sub_clean_sample['outlier_score']
    targets = sub_clean_sample['class']

    X_train, X_test, y_train, y_test = train_test_split(ft, targets, stratify=targets, random_state=0)

    X_test_weight = X_test['outlier_score']
    X_test = X_test[PCAs]
    X_train_weight = X_train['outlier_score']
    X_train = X_train[PCAs]

    train_exp = X_train.merge(sub_clean_sample[['class', 'geometry']], left_index=True, right_index=True)
    test_exp = X_test.merge(sub_clean_sample[['class', 'geometry']], left_index=True, right_index=True)

    gpd.GeoDataFrame(train_exp[['class', 'geometry']], geometry='geometry').to_file(driver='ESRI Shapefile',
                                                                                     filename=r"../Processing/train_sample_%s.shp" % city)
    gpd.GeoDataFrame(test_exp[['class', 'geometry']], geometry='geometry').to_file(driver='ESRI Shapefile',
                                                                                   filename=r"../Processing/test_sample_%s.shp" % city)
    return X_train, X_test, y_train, y_test, X_train_weight


def process_city(city, tif_names):
    try:
        logging.info(f'Starting processing for {city}')
        OSM = r'../Code/%s_OSM_combined.shp' % city
        img = r'../Sample_image/LA_%s_2023.tif' % tif_names[city]

        if not os.path.exists(OSM):
            logging.error(f'OSM file does not exist: {OSM}')
            return
        if not os.path.exists(img):
            logging.error(f'Image file does not exist: {img}')
            return

        one_city_gdf = extract_OSM_polygons(OSM, city)  # get OSM polygons
        print("one_city_gdf after extract")
        print(len(one_city_gdf))
        sub_clean_sample, img_array, PCAs = generate_sample(one_city_gdf, img, city)  # generate random samples
        print("sub_clean_sample")
        print(len(sub_clean_sample))
        print(sub_clean_sample.head())

        X_train, X_test, y_train, y_test, X_train_weight = split_sample(sub_clean_sample, PCAs, city)  # sample values, train test split
        scorers = {
            'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
            'recall_macro': make_scorer(recall_score, average='macro', zero_division=0),
            'accuracy': make_scorer(accuracy_score),
            'f1_score_macro': make_scorer(f1_score, average='macro', zero_division=0),
            'precision_weighted': make_scorer(precision_score, average='weighted', zero_division=0),
            'recall_weighted': make_scorer(recall_score, average='weighted', zero_division=0),
            'f1_score_weighted': make_scorer(f1_score, average='weighted', zero_division=0)
        }
        
        fit_params = {'sample_weight': X_train_weight}

        classifiers = {
            # 'RandomForest': RandomForestClassifier(),
            # 'LogisticRegression': LogisticRegression(),
            # 'KNeighbors': KNeighborsClassifier(),
            # 'GradientBoosting': GradientBoostingClassifier(),
            'XGBoost': XGBClassifier(),
            # 'SVM': SVC()
        }
        
        param_grids = {
            # 'RandomForest': {'n_estimators': [100, 200, 300],
            #                  'max_features': [None, 'sqrt', 'log2'],
            #                  'min_samples_split': [2, 5, 10],
            #                  'random_state': [0]},
            # 'LogisticRegression': {'C': [0.1, 1, 10, 100],
            #                        'solver': ['lbfgs'],
            #                        'max_iter': [100, 200, 300],
            #                        'class_weight': ['balanced']},
            # 'KNeighbors': {'n_neighbors': [3, 5, 7, 9],
            #                'weights': ['uniform', 'distance'],
            #                'algorithm': ['ball_tree', 'kd_tree', 'brute']},
            # 'GradientBoosting': {'n_estimators': [100, 200, 300],
            #                      'learning_rate': [0.01, 0.1, 0.2],
            #                      'max_depth': [3, 4, 5],
            #                      'random_state': [0]},
            'XGBoost': {'n_estimators': [100, 200, 300],
                                'learning_rate': [0.01, 0.1, 0.2],
                                'max_depth': [3, 4, 5],
                                'random_state': [0]},
            # 'SVM': {'C': [2 ** x for x in np.arange(-3, 13, dtype=float)],
            #         'gamma': [2 ** x for x in np.arange(-3, 13, dtype=float)],
            #         'class_weight': ['balanced']}
        }

        for clf_name in classifiers:
            clf = classifiers[clf_name]
            param_grid = param_grids[clf_name]
            grid_search_clf, test_scores = grid_search_wrapper('f1_score_weighted', clf, param_grid, scorers, X_train, X_test, y_train, y_test, fit_params, city)

            # save testing accuracy
            logging.info(f'Test scores for {city} using {clf_name}: {test_scores}')

            lock = FileLock(f"{classification_comparison}.lock")
            with lock:
                with open(classification_comparison, 'a') as csv_file:
                    writer = csv.writer(csv_file, delimiter=',', lineterminator='\n')
                    writer.writerow([f"{city}"] + [clf_name] + list(test_scores.values()))

            # save original img
            img_array2 = np.copy(img_array)
            img_re = img_array2.reshape((img_array2.shape[0], img_array2.shape[1] * img_array2.shape[2])).transpose()
            img_pre = np.copy(img_re[~np.isnan(img_re).any(axis=1)])
            img_pre = grid_search_clf.predict(img_pre)
            img_pre = img_pre.astype(np.int16)
            res = img_re[:, 0]
            res[~np.isnan(res)] = img_pre
            res[np.isnan(res)] = -32768
            res = res.reshape(img_array[0, :, :].shape)
            res = res.astype(np.int16)

            org_img = gdal.Open(img, gdal.GA_ReadOnly)
            meta = {
                'driver': 'GTiff',
                'dtype': 'int16',
                'nodata': -32768,
                'width': res.shape[1],
                'height': res.shape[0],
                'count': 1,
                'crs': CRS("EPSG:27700"),
                'transform': Affine(10, 0.0, org_img.GetGeoTransform()[0], 0, -10, org_img.GetGeoTransform()[-3]),
                'compress': 'lzw',
                'interleave': 'pixel'
            }

            result_path = r'../Results/{}_{}_{}.tif'.format(city, clf_name, datetime.datetime.now().strftime('%m%d_%H'))
            with rio.open(result_path, 'w', **meta) as dst:
                dst.write(res, 1)

            logging.info(f'Processed and saved results for {city} using {clf_name}')

        del img_array
        org_img = None

    except Exception as e:
        logging.error(f'Error processing {city}: {e}', exc_info=True)


def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    tif_names = {"Greater_Manchester": 'Manchester', "Greater_London": 'London', "West_Midlands": 'Westmidlands'}
    cities = ["Greater_Manchester", "West_Midlands", "Greater_London"]

    max_workers = 3

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        # future_to_city = {
        #     executor.submit(process_city, city, tif_names) for city in cities
        # }
        future_to_city = {executor.submit(process_city, city, tif_names): city for city in cities}
        for future in concurrent.futures.as_completed(future_to_city):
            city = future_to_city[future]
            try:
                future.result()
                logging.info(f'Completed processing for {city}')
            except Exception as e:
                logging.error(f'Error processing {city}: {e}', exc_info=True)


if __name__ == '__main__':
    main()

print('All jobs done')


2024-07-27 17:08:02,198 - INFO - Starting processing for West_Midlands
2024-07-27 17:08:02,198 - INFO - Starting processing for Greater_London
2024-07-27 17:08:02,198 - INFO - Starting processing for Greater_Manchester
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


one_city_gdf after extract
27436


2024-07-27 17:08:26,705 - INFO - Starting sample generation...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


one_city_gdf after extract
27602


2024-07-27 17:08:51,383 - INFO - Starting sample generation...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


one_city_gdf after extract
12794


2024-07-27 17:10:02,489 - INFO - Starting sample generation...


all_sample values before filtering
class
1.0    990831
0.0    857661
2.0    241863
4.0    114473
5.0     58012
3.0       419
Name: count, dtype: int64
all_sample values after filtering
class
1.0    990831
0.0    857661
2.0    241863
4.0    114473
5.0     58012
Name: count, dtype: int64
all_sample values second
class
1.0    990831
0.0    857661
2.0    241863
4.0    114473
5.0     58012
Name: count, dtype: int64
land_sample
2204828
2041779
clean_sample number before ndvi filtering
class
1.0    989226
0.0    854834
2.0    224673
4.0     43495
Name: count, dtype: int64
clean_sample number after ndvi filtering
1411520
          index           coordinates                       geometry  class  \
2204821  114466  (361105.0, 407735.0)  POINT (361105.000 407735.000)    4.0   
2204822  114467  (361155.0, 407765.0)  POINT (361155.000 407765.000)    4.0   
2204823  114468  (361155.0, 407755.0)  POINT (361155.000 407755.000)    4.0   
2204824  114469  (361105.0, 407745.0)  POINT (361105.000 407745

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ft['outlier_score'] = sub_clean_sample['outlier_score']


clean_sample number after ndvi filtering
855264
          index           coordinates                       geometry  class  \
1261419  143950  (413015.0, 277465.0)  POINT (413015.000 277465.000)    4.0   
1261420  143951  (413015.0, 277455.0)  POINT (413015.000 277455.000)    4.0   
1261421  143952  (406735.0, 299035.0)  POINT (406735.000 299035.000)    4.0   
24           24  (428205.0, 278385.0)  POINT (428205.000 278385.000)    0.0   
25           25  (428285.0, 278405.0)  POINT (428285.000 278405.000)    0.0   

                x         y     x_n     y_n  bldg_drop  \
1261419  413015.0  277465.0  2691.0  2995.0          0   
1261420  413015.0  277455.0  2691.0  2996.0          0   
1261421  406735.0  299035.0  2063.0   838.0          0   
24       428205.0  278385.0  4210.0  2903.0          0   
25       428285.0  278405.0  4218.0  2901.0          0   

                                              sample_value  mean_ndvi  
1261419  [0.09119949, 0.01320692, 0.055593137, -0.06678.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ft['outlier_score'] = sub_clean_sample['outlier_score']


clean_sample number after second pca
1548105
class
0.0    541628
2.0    515801
1.0    431272
4.0     59404
Name: count, dtype: int64
Class counts: class
0.0    541628
2.0    515801
1.0    431272
4.0     59404
Name: count, dtype: int64
sub_clean_sample
10000
         index           coordinates                       geometry  class  \
75813    75813  (527025.0, 191945.0)  POINT (527025.000 191945.000)    0.0   
516178  516178  (547445.0, 171775.0)  POINT (547445.000 171775.000)    0.0   
487358  487358  (510135.0, 178115.0)  POINT (510135.000 178115.000)    0.0   
593849  593849  (531175.0, 170985.0)  POINT (531175.000 170985.000)    0.0   
336265  336265  (541345.0, 167315.0)  POINT (541345.000 167315.000)    0.0   

               x         y     x_n     y_n  bldg_drop         0         1  \
75813   527025.0  191945.0  2386.0   989.0          0  0.221069 -0.001902   
516178  547445.0  171775.0  4428.0  3006.0          0  0.229511 -0.001558   
487358  510135.0  178115.0   697.0  2372.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ft['outlier_score'] = sub_clean_sample['outlier_score']
2024-07-27 17:13:18,344 - INFO - Test scores for Greater_Manchester using XGBoost: {'city': 'Greater_Manchester', 'datetime': datetime.datetime(2024, 7, 27, 17, 13, 18, 318271), 'accuracy_balanced': 0.358, 'accuracy': 0.358, 'precision_macro': 0.3545556518442582, 'recall_macro': 0.358, 'f1_score_macro': 0.34639416133279344, 'precision_weighted': 0.35455565184425825, 'recall_weighted': 0.358, 'f1_score_weighted': 0.34639416133279355}
2024-07-27 17:13:30,540 - INFO - Processed and saved results for Greater_Manchester using XGBoost
2024-07-27 17:14:01,104 - INFO - Test scores for West_Midlands using XGBoost: {'city': 'West_Midlands', 'datetime': datetime.datetime(2024, 7, 2

TypeError: 'set' object is not subscriptable

## GI classification to 3 levels (diminished from 5)
- Including Green space, agriculture, Urban green infra
- clean_sample.drop_duplicates(subset=['x', 'y'], keep='first', inplace=True)


In [13]:
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

bands=['ndti','ndre','ndvi','ndwi','mndwi','glcm','B2','B3','B4','B8']
veges = pd.read_excel(r'osm_vegetation_classes.xlsx')
foi = ['ndti','ndre','ndvi','ndwi','mndwi','glcm']
ind_selected = [bands.index(x) for x in foi]
bands = foi
classification_comparison = r'../Processing/classification_comparison.csv'


def sample_pnts(row):
    geometry = row['geometry']
    bounds = geometry.bounds
    xmin, ymin, xmax, ymax = bounds[0], bounds[1], bounds[2], bounds[3]
    x, y = np.mgrid[xmin:xmax+10:10, ymin:ymax+10:10]
    x, y = np.vstack([x.ravel(), y.ravel()])
    p = pd.DataFrame(list(zip(x, y)))
    p[0] = np.floor((p[0] - row['xmin']) / row['xres']) * row['xres'] + row['xmin'] + row['xres'] / 2
    p[1] = np.floor((p[1] - row['ymin']) / row['yres']) * row['yres'] + row['ymin'] + row['yres'] / 2
    p['pnt'] = list(set(zip(p[0], p[1])))
    p['pnt'] = p['pnt'].apply(Point)
    p = gpd.GeoDataFrame(p['pnt'], geometry='pnt', crs=CRS.from_epsg(27700))
    p = p[p.within(geometry)]
    return p['pnt'].apply(lambda x: [x.x, x.y]).values


def sample_raster(row, img_array):
    y = int(row['y_n'])
    x = int(row['x_n'])
    if 0 <= y < img_array.shape[1] and 0 <= x < img_array.shape[2]:
        res = img_array[:, y, x]
    else:
        res = np.nan
    if np.isnan(res).any():
        res = np.nan
    return res


def extract_OSM_polygons(OSM, city):
    shapefile = gpd.read_file(OSM)
    shapefile = shapefile.to_crs({'init': 'epsg:27700'})
    shapefile['geometry'] = shapefile.geometry.buffer(-10)
    shapefile = shapefile[~shapefile.is_empty]
    building = shapefile[~shapefile['building'].isnull()]
    building.loc[:, 'area_length'] = (building.area / building.length).values
    # building.loc[:, 'general'] = 'bldg'
    building.loc[:, 'general'] = 5
    shapefile = shapefile[shapefile['building'].isnull()]
    shapefile['FID'] = list(range(0, len(shapefile.index)))
    one_city = pd.DataFrame()
    # for i in veges.index:
    sub = pd.DataFrame()
    sub['geometry'] = shapefile.loc[shapefile['general'] == 'vegetation', 'geometry']
    sub['FID'] = shapefile.loc[shapefile['general'] == 'vegetation', 'FID']
    sub['key'] = 'general'
    sub['value'] = 'vegetation'
    sub['SALID1'] = OSM.split('\\')[-1].split('.')[0]
    sub['tag'] = shapefile.loc[shapefile['general'] == 'vegetation', 'tag']
    if len(sub.index) > 0:
        one_city = pd.concat([one_city, sub])
    one_city['general'] = 'vegetation'
        
    one_city['general'] = one_city.apply(
        lambda x: x['tag'] if x['general'] == 'vegetation' else 5, axis=1)

    if len(one_city.index) > 0:
        one_city_gdf = gpd.GeoDataFrame(one_city, geometry='geometry', crs=CRS.from_epsg(27700))
        one_city_gdf.loc[:, 'shape_index'] = (one_city_gdf.length / (4 * np.sqrt(one_city_gdf.area))).values
        one_city_gdf = one_city_gdf.loc[
            (one_city_gdf.area <= one_city_gdf.area.quantile(0.975)) &
            (one_city_gdf.area >= one_city_gdf.area.quantile(0.025)) &
            (one_city_gdf['shape_index'] < one_city_gdf['shape_index'].quantile(0.9))
        ]

        background = shapefile.loc[~shapefile['FID'].isin(set(one_city['FID'])), ['geometry', 'FID']]
        background['general'] = 5
        background.loc[:, 'shape_index'] = (background.length / (4 * np.sqrt(background.area))).values
        background = background.loc[
            (background.area <= background.area.quantile(0.975)) &
            (background.area >= background.area.quantile(0.025)) &
            (background['shape_index'] < background['shape_index'].quantile(0.9))
        ]
        one_city_gdf = pd.concat([one_city_gdf, building])
        one_city_gdf = pd.concat([one_city_gdf, background])
        one_city_gdf = gpd.GeoDataFrame(one_city_gdf[['general', 'geometry', 'shape_index']], geometry='geometry',
                                        crs=CRS.from_epsg(27700))
    
    one_city_gdf.to_file(driver='ESRI Shapefile', filename=r"../Processing/polygon_%s.shp" % city)
    return one_city_gdf

#To debug XGB classifier's strict integer rule
custom_mapping = {0.0: 0, 1.0: 1, 2.0: 2, 5.0: 3}

def preprocess_labels(y, mapping):
    y_mapped = y.map(mapping)
    return y_mapped.astype(int)

def grid_search_wrapper(refit_score, clf, param_grid, scorers, X_train, X_test, y_train, y_test, fit_params, city):
    skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score, cv=skf, return_train_score=False,
                               n_jobs=4, verbose=0)

    #To debug XGB classifier's strict integer rule
    if isinstance(clf, XGBClassifier):
        y_train = preprocess_labels(y_train, custom_mapping)
        y_test = preprocess_labels(y_test, custom_mapping)
    
    grid_search.fit(X_train.values, y_train.values)

    y_pred = grid_search.predict(X_test.values)

    return grid_search, {'city': city, 'datetime': datetime.datetime.now(),
                         'accuracy_balanced': balanced_accuracy_score(y_test, y_pred),
                         'accuracy': accuracy_score(y_test, y_pred),
                         'precision_macro': precision_score(y_test, y_pred, average='macro'),
                         'recall_macro': recall_score(y_test, y_pred, average='macro'),
                         'f1_score_macro': f1_score(y_test, y_pred, average='macro'),
                         'precision_weighted': precision_score(y_test, y_pred, average='weighted'),
                         'recall_weighted': recall_score(y_test, y_pred, average='weighted'),
                         'f1_score_weighted': f1_score(y_test, y_pred, average='weighted')}


def generate_sample(one_city_gdf, img, city):
    logging.info("Starting sample generation...")
    global bands
    raster = gdal.Open(img, gdal.GA_ReadOnly)
    geoTransform = raster.GetGeoTransform()
    one_city_gdf['xmin'] = geoTransform[0]
    one_city_gdf['ymin'] = geoTransform[3]
    one_city_gdf['xres'] = geoTransform[1]
    one_city_gdf['yres'] = geoTransform[5]
    one_city_gdf['pnts'] = one_city_gdf.apply(sample_pnts, axis=1)
    all_sample = gpd.GeoDataFrame()
    for i in set(one_city_gdf['general']):
        logging.debug(f"Processing class: {i}")
        xys = one_city_gdf.loc[one_city_gdf['general'] == i, 'pnts'].values
        xys_flat = [item for sublist in xys for item in sublist]
        sample_df = pd.DataFrame(xys_flat)
        sample_df['coordinates'] = list(zip(sample_df[0], sample_df[1]))
        sample_gdf = gpd.GeoDataFrame(sample_df['coordinates'],
                                      geometry=gpd.points_from_xy(sample_df[0], sample_df[1]), crs="epsg:27700")
        sample_gdf['class'] = i
        logging.debug(f"Generated {len(sample_gdf)} points for class {i}.")
        all_sample = pd.concat([all_sample, sample_gdf])

    print("all_sample values before filtering")
    print(all_sample['class'].value_counts())

    value_counts = all_sample['class'].value_counts()    
    classes_to_exclude = value_counts[value_counts < 500].index
    all_sample = all_sample[~all_sample['class'].isin(classes_to_exclude)]

    all_sample['class'] = all_sample['class'].apply(lambda x: 2 if x in [2.0, 3.0, 4.0] else x)


    print("all_sample values after filtering")
    print(all_sample['class'].value_counts())
    
    all_sample['x'] = all_sample.geometry.x
    all_sample['y'] = all_sample.geometry.y
    all_sample['x_n'] = (all_sample['x'] - geoTransform[0]) / geoTransform[1] - 0.5
    all_sample['y_n'] = (all_sample['y'] - geoTransform[3]) / geoTransform[5] - 0.5
    all_sample = all_sample.reset_index()

    print("all_sample values second")
    print(all_sample['class'].value_counts())

    land_sample = all_sample[all_sample['class'] != 5].copy()
    print("land_sample")
    print(len(land_sample))
    print(len(land_sample.drop_duplicates(subset=['x', 'y'], keep=False, inplace=False)))
    land_sample.drop_duplicates(subset=['x', 'y'], keep=False, inplace=True)
    clean_sample = pd.concat([land_sample, all_sample[all_sample['class'] == 5].copy()])
    clean_sample['bldg_drop'] = 0
    clean_sample.loc[clean_sample['class'] == 5, 'bldg_drop'] = 1
    clean_sample = clean_sample.sort_values('bldg_drop', ascending=True)
    # including green roof or other urban green infra - may need to fix
    clean_sample.drop_duplicates(subset=['x', 'y'], keep='first', inplace=True)

    img_array = np.array(raster.ReadAsArray())
    img_array = img_array[ind_selected, :, :]
    x_d = img_array.shape[1]
    y_d = img_array.shape[2]
    n_d = img_array.shape[0]
    img_array = img_array.reshape(n_d, x_d * y_d)
    img_array[:, np.isnan(img_array).any(axis=0)] = np.nan
    img_array = img_array.reshape(n_d, x_d, y_d)

    print("clean_sample number before ndvi filtering")
    print(clean_sample['class'].value_counts())

    clean_sample['sample_value'] = clean_sample.apply(lambda x: sample_raster(x, img_array), axis=1)
    clean_sample = clean_sample.loc[~clean_sample['sample_value'].isnull()]
    clean_sample['mean_ndvi'] = clean_sample['sample_value'].apply(lambda x: x[bands.index('ndvi')])

    clean_sample.loc[(clean_sample['class'] != 5)
                     & (clean_sample['mean_ndvi'] <= 0.05), 'sample_value'] = np.nan
    clean_sample = clean_sample.loc[~clean_sample['sample_value'].isnull()]

    print("clean_sample number after ndvi filtering")
    print(len(clean_sample))
    print(clean_sample.head())
    print(clean_sample['class'].value_counts())

    v_median = clean_sample.loc[(clean_sample['class'] != 5), 'mean_ndvi'].median()
    print("v_median")
    print(v_median)
    clean_sample.loc[(clean_sample['class'] == 5)
                     & (clean_sample['mean_ndvi'] >= v_median), 'sample_value'] = np.nan
    clean_sample = clean_sample.loc[~clean_sample['sample_value'].isnull()]
    clean_sample = clean_sample.drop(columns=['mean_ndvi', 'sample_value'], axis=1)

    print("clean_sample number before pca")
    print(len(clean_sample))
    print(clean_sample.head())
    print(clean_sample['class'].value_counts())

    for i in range(0, img_array.shape[0]):
        v = img_array[i, :, :]
        img_array[i, :, :] = (v - np.nanmin(v)) / (np.nanmax(v) - np.nanmin(v))
    img_array_pca = np.copy(img_array)
    img_array_pca = img_array_pca.reshape((img_array_pca.shape[0],
                                           img_array_pca.shape[1] * img_array_pca.shape[2])).transpose()
    img_array_pca_valid = img_array_pca[~np.isnan(img_array_pca).any(axis=1)]
    pca = PCA(n_components=img_array_pca_valid.shape[1])
    pca_res = pca.fit(img_array_pca_valid)
    var = np.cumsum(np.round(pca_res.explained_variance_ratio_, decimals=3) * 100)
    n_pc = sum(var <= 90) + 1
    pca = PCA(n_components=n_pc)
    pca_reduce = pca.fit_transform(img_array_pca_valid)
    pca_reduce = np.multiply(pca_reduce, pca_res.explained_variance_ratio_[:n_pc])

    img_reduce = np.copy(img_array[:n_pc, :, :])
    img_reduce_re = img_reduce.reshape((img_reduce.shape[0], img_reduce.shape[1] * img_reduce.shape[2])).transpose()
    img_reduce_re[~np.isnan(img_reduce_re).any(axis=1)] = pca_reduce
    img_reduce_re = img_reduce_re.transpose()
    img_reduce = img_reduce_re.reshape((img_reduce.shape[0], img_reduce.shape[1], img_reduce.shape[2]))

    img_array = np.copy(img_reduce)
    del img_array_pca, img_array_pca_valid, img_reduce_re, img_reduce, pca_reduce

    clean_sample['sample_value'] = clean_sample.apply(lambda x: sample_raster(x, img_array), axis=1)
    PCAs = list(range(0, n_pc))
    for PC in PCAs:
        i = PC
        clean_sample[PC] = clean_sample['sample_value'].apply(lambda x: x[i])
    clean_sample = clean_sample.drop('sample_value', axis=1)

    print("clean_sample number after pca")
    print(len(clean_sample))
    print(clean_sample['class'].value_counts())

    for i in set(clean_sample['class']):
        X = clean_sample.loc[clean_sample['class'] == i, PCAs].values
        X = np.array(X.tolist())
        clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
        y_pred = clf.fit_predict(X)
        clean_sample.loc[clean_sample['class'] == i, 'outlier'] = y_pred
        outlier_score = clf.negative_outlier_factor_
        clean_sample.loc[clean_sample['class'] == i, 'outlier_score'] = (outlier_score - outlier_score.min()) / (
                    outlier_score.max() - outlier_score.min())
    clean_sample[PCAs] = clean_sample[PCAs].astype(np.float32)
    clean_sample = clean_sample.dropna()
    clean_sample = clean_sample.loc[clean_sample['outlier'] != -1]

    print("clean_sample number after second pca")
    print(len(clean_sample))
    print(clean_sample['class'].value_counts())
    
    # Count the number of samples for each class
    class_counts = clean_sample['class'].value_counts()
    print("Class counts:", class_counts)
    
    # Determine the sample size
    n_sample = int(min(class_counts))
    if n_sample >= 2500:
        n_sample = 2500
    
    sampled_dfs = []
    for class_label in range(6):
        if class_label in class_counts:
            sampled_dfs.append(clean_sample[clean_sample['class'] == class_label].sample(n=min(n_sample, class_counts[class_label]), random_state=0))
    
    sub_clean_sample = pd.concat(sampled_dfs)
    
    ext = sub_clean_sample[['class', 'geometry']]
    ext = ext.to_crs({'init': 'epsg:27700'})
    ext.to_file(r"../Processing/all_sample_%s.shp" % city)
    raster = None
    return sub_clean_sample, img_array, PCAs


def split_sample(sub_clean_sample, PCAs, city):
    ft = sub_clean_sample[PCAs]
    ft['outlier_score'] = sub_clean_sample['outlier_score']
    targets = sub_clean_sample['class']

    X_train, X_test, y_train, y_test = train_test_split(ft, targets, stratify=targets, random_state=0)

    X_test_weight = X_test['outlier_score']
    X_test = X_test[PCAs]
    X_train_weight = X_train['outlier_score']
    X_train = X_train[PCAs]

    train_exp = X_train.merge(sub_clean_sample[['class', 'geometry']], left_index=True, right_index=True)
    test_exp = X_test.merge(sub_clean_sample[['class', 'geometry']], left_index=True, right_index=True)

    gpd.GeoDataFrame(train_exp[['class', 'geometry']], geometry='geometry').to_file(driver='ESRI Shapefile',
                                                                                     filename=r"../Processing/train_sample_%s.shp" % city)
    gpd.GeoDataFrame(test_exp[['class', 'geometry']], geometry='geometry').to_file(driver='ESRI Shapefile',
                                                                                   filename=r"../Processing/test_sample_%s.shp" % city)
    return X_train, X_test, y_train, y_test, X_train_weight


def process_city(city, tif_names):
    try:
        logging.info(f'Starting processing for {city}')
        OSM = r'../Code/%s_OSM_combined.shp' % city
        img = r'../Sample_image/LA_%s_2023.tif' % tif_names[city]

        if not os.path.exists(OSM):
            logging.error(f'OSM file does not exist: {OSM}')
            return
        if not os.path.exists(img):
            logging.error(f'Image file does not exist: {img}')
            return

        one_city_gdf = extract_OSM_polygons(OSM, city)  # get OSM polygons
        print("one_city_gdf after extract")
        print(len(one_city_gdf))
        sub_clean_sample, img_array, PCAs = generate_sample(one_city_gdf, img, city)  # generate random samples
        print("sub_clean_sample")
        print(len(sub_clean_sample))
        print(sub_clean_sample.head())

        X_train, X_test, y_train, y_test, X_train_weight = split_sample(sub_clean_sample, PCAs, city)  # sample values, train test split
        scorers = {
            'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
            'recall_macro': make_scorer(recall_score, average='macro', zero_division=0),
            'accuracy': make_scorer(accuracy_score),
            'f1_score_macro': make_scorer(f1_score, average='macro', zero_division=0),
            'precision_weighted': make_scorer(precision_score, average='weighted', zero_division=0),
            'recall_weighted': make_scorer(recall_score, average='weighted', zero_division=0),
            'f1_score_weighted': make_scorer(f1_score, average='weighted', zero_division=0)
        }
        
        fit_params = {'sample_weight': X_train_weight}

        classifiers = {
            'SVM': SVC(),
            'RandomForest': RandomForestClassifier(),
            'LogisticRegression': LogisticRegression(),
            'KNeighbors': KNeighborsClassifier(),
            'GradientBoosting': GradientBoostingClassifier(),
            'XGBoost': XGBClassifier()
        }
        
        param_grids = {
            'SVM': {'C': [2 ** x for x in np.arange(-3, 13, dtype=float)],
                    'gamma': [2 ** x for x in np.arange(-3, 13, dtype=float)],
                    'class_weight': ['balanced']},
            'RandomForest': {'n_estimators': [100, 200, 300],
                             'max_features': [None, 'sqrt', 'log2'],
                             'min_samples_split': [2, 5, 10],
                             'random_state': [0]},
            'LogisticRegression': {'C': [0.1, 1, 10, 100],
                                   'solver': ['lbfgs'],
                                   'max_iter': [100, 200, 300],
                                   'class_weight': ['balanced']},
            'KNeighbors': {'n_neighbors': [3, 5, 7, 9],
                           'weights': ['uniform', 'distance'],
                           'algorithm': ['ball_tree', 'kd_tree', 'brute']},
            'GradientBoosting': {'n_estimators': [100, 200, 300],
                                 'learning_rate': [0.01, 0.1, 0.2],
                                 'max_depth': [3, 4, 5],
                                 'random_state': [0]},
            'XGBoost': {'n_estimators': [100, 200, 300],
                                'learning_rate': [0.01, 0.1, 0.2],
                                'max_depth': [3, 4, 5],
                                'random_state': [0]}
        }

        for clf_name in classifiers:
            clf = classifiers[clf_name]
            param_grid = param_grids[clf_name]
            grid_search_clf, test_scores = grid_search_wrapper('f1_score_weighted', clf, param_grid, scorers, X_train, X_test, y_train, y_test, fit_params, city)

            # save testing accuracy
            logging.info(f'Test scores for {city} using {clf_name}: {test_scores}')

            lock = FileLock(f"{classification_comparison}.lock")
            with lock:
                with open(classification_comparison, 'a') as csv_file:
                    writer = csv.writer(csv_file, delimiter=',', lineterminator='\n')
                    writer.writerow([f"{city}"] + [clf_name] + list(test_scores.values()))

            # save original img
            img_array2 = np.copy(img_array)
            img_re = img_array2.reshape((img_array2.shape[0], img_array2.shape[1] * img_array2.shape[2])).transpose()
            img_pre = np.copy(img_re[~np.isnan(img_re).any(axis=1)])
            img_pre = grid_search_clf.predict(img_pre)
            img_pre = img_pre.astype(np.int16)
            res = img_re[:, 0]
            res[~np.isnan(res)] = img_pre
            res[np.isnan(res)] = -32768
            res = res.reshape(img_array[0, :, :].shape)
            res = res.astype(np.int16)

            org_img = gdal.Open(img, gdal.GA_ReadOnly)
            meta = {
                'driver': 'GTiff',
                'dtype': 'int16',
                'nodata': -32768,
                'width': res.shape[1],
                'height': res.shape[0],
                'count': 1,
                'crs': CRS("EPSG:27700"),
                'transform': Affine(10, 0.0, org_img.GetGeoTransform()[0], 0, -10, org_img.GetGeoTransform()[-3]),
                'compress': 'lzw',
                'interleave': 'pixel'
            }

            result_path = r'../Results/{}_{}_{}.tif'.format(city, clf_name, datetime.datetime.now().strftime('%m%d_%H'))
            with rio.open(result_path, 'w', **meta) as dst:
                dst.write(res, 1)

            logging.info(f'Processed and saved results for {city} using {clf_name}')

        del img_array
        org_img = None

    except Exception as e:
        logging.error(f'Error processing {city}: {e}', exc_info=True)


def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    tif_names = {"Greater_Manchester": 'Manchester', "Greater_London": 'London', "West_Midlands": 'Westmidlands'}
    cities = ["Greater_Manchester", "West_Midlands", "Greater_London"]

    max_workers = 3

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        # future_to_city = {
        #     executor.submit(process_city, city, tif_names) for city in cities
        # }
        future_to_city = {executor.submit(process_city, city, tif_names): city for city in cities}
        for future in concurrent.futures.as_completed(future_to_city):
            city = future_to_city[future]
            try:
                future.result()
                logging.info(f'Completed processing for {city}')
            except Exception as e:
                logging.error(f'Error processing {city}: {e}', exc_info=True)


if __name__ == '__main__':
    main()

print('All jobs done')

2024-07-28 14:39:55,927 - INFO - Starting processing for Greater_Manchester
2024-07-28 14:39:55,928 - INFO - Starting processing for West_Midlands
2024-07-28 14:39:55,928 - INFO - Starting processing for Greater_London


one_city_gdf after extract
27436


2024-07-28 14:40:20,537 - INFO - Starting sample generation...


one_city_gdf after extract
27602


2024-07-28 14:40:45,228 - INFO - Starting sample generation...


one_city_gdf after extract
12794


2024-07-28 14:41:55,804 - INFO - Starting sample generation...


all_sample values before filtering
class
1.0    990831
0.0    857661
2.0    241863
4.0    114473
5.0     58012
3.0       419
Name: count, dtype: int64
all_sample values after filtering
class
1.0    990831
0.0    857661
2.0    356336
5.0     58012
Name: count, dtype: int64
all_sample values second
class
1.0    990831
0.0    857661
2.0    356336
5.0     58012
Name: count, dtype: int64
land_sample
2204828
2041779
clean_sample number before ndvi filtering
class
1.0    986287
0.0    842416
2.0    213076
5.0     57413
Name: count, dtype: int64
clean_sample number after ndvi filtering
1429731
          index           coordinates                       geometry  class  \
1377001  519340  (383035.0, 411895.0)  POINT (383035.000 411895.000)    1.0   
1377016  519355  (382995.0, 411835.0)  POINT (382995.000 411835.000)    1.0   
1377015  519354  (383065.0, 411845.0)  POINT (383065.000 411845.000)    1.0   
1377014  519353  (383035.0, 411835.0)  POINT (383035.000 411835.000)    1.0   
1377013  519

2024-07-28 15:20:13,227 - INFO - Test scores for Greater_Manchester using SVM: {'city': 'Greater_Manchester', 'datetime': datetime.datetime(2024, 7, 28, 15, 20, 13, 217159), 'accuracy_balanced': 0.5740000000000001, 'accuracy': 0.574, 'precision_macro': 0.5950866671163494, 'recall_macro': 0.5740000000000001, 'f1_score_macro': 0.5798548243976104, 'precision_weighted': 0.5950866671163494, 'recall_weighted': 0.574, 'f1_score_weighted': 0.5798548243976104}
2024-07-28 15:38:25,871 - INFO - Test scores for Greater_London using SVM: {'city': 'Greater_London', 'datetime': datetime.datetime(2024, 7, 28, 15, 38, 25, 860838), 'accuracy_balanced': 0.5472, 'accuracy': 0.5472, 'precision_macro': 0.5670141323933057, 'recall_macro': 0.5472, 'f1_score_macro': 0.5548575582056354, 'precision_weighted': 0.5670141323933056, 'recall_weighted': 0.5472, 'f1_score_weighted': 0.5548575582056354}
2024-07-28 15:48:55,684 - INFO - Test scores for West_Midlands using SVM: {'city': 'West_Midlands', 'datetime': dateti

All jobs done
