In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import re
import math
from sklearn.model_selection import train_test_split
from shapely.geometry import MultiPolygon, Point, Polygon
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import precision_score, accuracy_score
import pickle

In [2]:
train_df =  pd.read_parquet('train_df.parquet.gzip')
test_df =  pd.read_parquet('test_df.parquet.gzip')

In [3]:
class MachineLearning:
    def __init__(self, train_dataset, test_dataset, model_type):
            self.train_dataset = train_dataset.copy()
            self.test_dataset = test_dataset.copy()
            self.model_type = model_type
            self.scaler_trained = False
            self.model_selection()

    def cleaning(self, data):
        data['time_cleaned'] =   data['time_cleaned'].apply(lambda t: t.hour * 3600 + t.minute * 60 + t.second)
        columns_to_drop=['missionid','moment','dataframe','eventleveltype_cleaned',
                         'geometry','in_Belgium','AEDneeded_cleaned','day_cleaned']
        x_data = data.drop(columns=columns_to_drop)
        y_data = data['AEDneeded_cleaned']
        return x_data, y_data
    
    def scaling (self, dataset):
        columns_to_scale = ['time_cleaned', 'latitude_cleaned', 'longitude_cleaned']
        if self.scaler_trained == False:
            self.scaler = StandardScaler()
            data_scaled =self.scaler.fit_transform(dataset[columns_to_scale])
            self.scaler_trained= True
        else:
             data_scaled =self.scaler.transform(dataset[columns_to_scale])
        columns_to_scaled = ['time_cleaned_s', 'latitude_cleaned_s', 'longitude_cleaned_s']
        data_scaled = pd.DataFrame(data_scaled, columns = columns_to_scaled)
        data_scaled = pd.concat([data_scaled, dataset], axis = 1)
        data_scaled = data_scaled.drop(columns= columns_to_scale)
        data_arranged = self.rearrange(data_scaled)
        return data_arranged
    def preprocessing_choose_model(self):
        self.x_train, self.y_train = self.cleaning(self.train_dataset)
        self.x_test, self.y_test =self.cleaning(self.test_dataset)
        self.x_train_scaled = self.scaling(self.x_train)
        self.x_test_scaled = self.scaling(self.x_test)

    def model_selection(self):
        self.preprocessing_choose_model()
        if self.model_type == 'knn':
            self.make_knn()
            self.best_knn(self.precision_score, 0.9)

    def make_knn(self):
        n_neighbors = np.arange(2, 30, 3)
        weights = ['uniform', 'distance']
        self.precision_score = {}
        self.accuracy_score = {}
        for weight in weights:
            for neighbor in n_neighbors:
                self.knn = KNeighborsClassifier(n_neighbors=neighbor, weights=weight)
                self.knn.fit(self.x_train_scaled, self.y_train)
                self.y_pred = self.knn.predict(self.x_test_scaled)
                predict_labels = ['Requires AED', 'Maybe requires AED', 'Does not require AED']
                self.accuracy_score[(neighbor, weight)] = accuracy_score(self.y_test, self.y_pred)
                self.precision_score[(neighbor, weight)] = precision_score(self.y_test, self.y_pred, labels=predict_labels, average=None)

    def best_knn(self, precision_score, accuracy_cutoff):
        max_value = 0
        max_key = ''
        for key, item in precision_score.items():
            if self.accuracy_score[key]> accuracy_cutoff:
                if item[0] > max_value: 
                    max_value=item[0]
                    max_key = key
        print(max_key, max_value)
        self.optimal_knn= KNeighborsClassifier(n_neighbors=max_key[0], weights=max_key[1])
        self.optimal_knn.fit(self.x_train_scaled, self.y_train)
        self.optimal_y_pred = self.optimal_knn.predict(self.x_test_scaled)

    def predicting(self, df_topredict):
        scaled_df_topredict = self.scaling(df_topredict)
        predictions = self.optimal_knn.predict(scaled_df_topredict)
        predictions = pd.DataFrame(predictions, columns=['predictions'])
        return predictions
    
    def rearrange(self, df_toarrange):
        df_arranged =pd.DataFrame()
        df_arranged['lat'] = df_toarrange['latitude_cleaned_s']
        df_arranged['long'] = df_toarrange['longitude_cleaned_s']
        df_arranged['Monday'] = df_toarrange['Monday']
        df_arranged['Tuesday'] = df_toarrange['Tuesday']
        df_arranged['Wednesday'] = df_toarrange['Wednesday']
        df_arranged['Thursday'] = df_toarrange['Thursday']
        df_arranged['Friday'] = df_toarrange['Friday']
        df_arranged['Saturday'] = df_toarrange['Saturday']
        df_arranged['Sunday'] = df_toarrange['Sunday']
        df_arranged['Time'] = df_toarrange['time_cleaned_s']
        return df_arranged
                          


In [4]:
knn_model = MachineLearning(train_dataset=train_df,
                            test_dataset=test_df, model_type='knn')

(29, 'distance') 0.9529378324603078


In [5]:
class RandomGenerator:
    def __init__(self,municipalities_gdf, region):
        self.municipalities_gdf =municipalities_gdf
        self.region = region
        self.select_region()

    def select_region(self):
        if self.region == 'Brussels':
            self.gdf_Brussels =pd.DataFrame()
            brussels_communes= self.municipalities_gdf.loc[self.municipalities_gdf['arrond'] == '21', ['Communes']].values.tolist()
            for commune in range(len(brussels_communes)):
                self.region = brussels_communes[commune][0]
                self.generate_points()
                self.select_points_in_region()
                print(len(self.final_generated))
                self.gdf_Brussels = pd.concat([self.gdf_Brussels,self.final_generated])
                print(len(self.gdf_Brussels))
            self.final_generated =self.gdf_Brussels
        else:
            self.generate_points()
            self.select_points_in_region()
    
    def generate_points(self):
        self.areas = {'Anderlecht': 17.74,'Oudergem':9.03,'Sint-Agatha-Berchem':2.95, 'Bruxelles':32.61,'Etterbeek':3.15,'Evere': 5.02,'Forest': 6.25, 'Ganshoren': 2.46,
         'Ixelles': 6.34,'Jette': 5.04,'Koekelberg': 1.17,'Sint-Jans-Molenbeek': 5.89,'Saint-Gilles':2.52,'Sint-Joost-ten-Node': 1.14,'Schaerbeek': 8.14,
         'Uccle': 22.91,'Watermael-Boitsfort':12.93,'Sint-Lambrechts-Woluwe': 7.22,'Sint-Pieters-Woluwe': 8.85,'Antwerpen': 204.51,'Brugge': 138.40,
         'Gent': 156.18,'Hasselt': 102.24,'Leuven': 56.63,  'Mons': 146.53,'Liège': 69.39,'Charleroi': 102.08, 'Namur': 175.69, 'Arlon': 118.64}
        self.city_polygon = self.municipalities_gdf.loc[self.municipalities_gdf['Communes'] == self.region,'geometry']
        scaling_factor = self.areas[self.region]
        latitude_generated = np.random.uniform(self.city_polygon.bounds['miny'], self.city_polygon.bounds['maxy'], int(500*scaling_factor))
        longitude_generated= np.random.uniform(self.city_polygon.bounds['minx'], self.city_polygon.bounds['maxx'], int(500*scaling_factor))
        latitude_generated=pd.DataFrame(latitude_generated, columns=['latitude_cleaned'])
        longitude_generated=pd.DataFrame(longitude_generated, columns=['longitude_cleaned'])
        self.df_generated = pd.concat([latitude_generated, longitude_generated], axis =1)

    def select_points_in_region(self):
        self.gdf_generated= gpd.GeoDataFrame(self.df_generated, geometry=gpd.points_from_xy(self.df_generated['longitude_cleaned'], self.df_generated['latitude_cleaned']), crs="EPSG:4326")
        self.gdf_generated['in_city'] = self.gdf_generated['geometry'].apply(lambda x: self.city_polygon.contains(x))
        self.gdf_generated_city = self.gdf_generated[self.gdf_generated['in_city'] == True]
        self.final_generated = self.gdf_generated_city.drop(columns=['geometry', 'in_city']).reset_index()

In [6]:
file_path_municipalities= r'BELGIUM_-_Municipalities.geojson'
municipalities_gdf = gpd.read_file(file_path_municipalities)

In [7]:
dic_for_cities = {}
df_cities = {}
cities = ['Brussels', 'Antwerpen', 'Brugge', 'Gent', 'Hasselt', 'Leuven', 'Mons', 'Liège', 'Charleroi', 'Namur', 'Arlon']
for city in cities:
    df_with_indicators = pd.DataFrame()
    columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    df_cities[city] = RandomGenerator(municipalities_gdf, city).final_generated
    df_indicators = pd.DataFrame(data=0,index=range(len(df_cities[city])), columns=columns)
    df_cities[city] =df_cities[city].reset_index(drop=True)
    df_with_indicators = pd.concat([df_cities[city], df_indicators], axis = 1)

    dic_for_days ={}
    days=columns
    for day in days:
        df_for_this_day = df_with_indicators.copy()
        df_for_this_day[day] = 1
        dic_for_days[day] = df_for_this_day
        for other_day in days:
            if other_day != day:
                df_for_this_day[other_day] = 0

    dic_all_dfs = {}
    for day in dic_for_days.keys():
        for hour in range(24):
            df_for_this_hour = dic_for_days[day].copy()
            df_for_this_hour['time_cleaned'] = hour
            dic_all_dfs[(day, hour)]=df_for_this_hour

    dic_for_cities[city]= dic_all_dfs



4812
4812
1598
6410
796
7206
4753
11959
860
12819
1234
14053
1462
15515
665
16180
1155
17335
1192
18527
262
18789
1258
20047
663
20710
242
20952
2044
22996
6218
29214
3201
32415
1829
34244
2267
36511


In [8]:
cities = ['Brussels', 'Antwerpen', 'Brugge', 'Gent', 'Hasselt', 'Leuven', 'Mons', 'Liège', 'Charleroi', 'Namur', 'Arlon']
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

all_predictions ={}
for city in cities:
    predictions_per_city ={}
    for day in days:
        predictions = pd.DataFrame()
        predictions['lat'] = dic_for_cities[city][('Monday',0)]['latitude_cleaned']
        predictions['lon'] = dic_for_cities[city][('Monday',0)]['longitude_cleaned']
        for hour in range(24):
            to_predict = dic_for_cities[city][(day,hour)]
            column_name = f'hour_{hour}'
            predictions[column_name] = knn_model.predicting(to_predict)
        predictions_per_city[day]=predictions
    all_predictions[city]=predictions_per_city




In [9]:
for i in range(24):
    column_name= f'hour_{i}'
    print(all_predictions['Liège']['Monday'][column_name].value_counts())
    print(all_predictions['Charleroi']['Friday'][column_name].value_counts())



hour_0
Does not require AED    15604
Name: count, dtype: int64
hour_0
Does not require AED    29670
Requires AED              108
Maybe requires AED         32
Name: count, dtype: int64
hour_1
Does not require AED    15604
Name: count, dtype: int64
hour_1
Does not require AED    29670
Requires AED              108
Maybe requires AED         32
Name: count, dtype: int64
hour_2
Does not require AED    15604
Name: count, dtype: int64
hour_2
Does not require AED    29670
Requires AED              108
Maybe requires AED         32
Name: count, dtype: int64
hour_3
Does not require AED    15604
Name: count, dtype: int64
hour_3
Does not require AED    29668
Requires AED              109
Maybe requires AED         33
Name: count, dtype: int64
hour_4
Does not require AED    15604
Name: count, dtype: int64
hour_4
Does not require AED    29665
Requires AED              112
Maybe requires AED         33
Name: count, dtype: int64
hour_5
Does not require AED    15604
Name: count, dtype: int64
hour_5


In [10]:
with open('dic_all_predictions500.pkl', 'wb') as f:
    pickle.dump(all_predictions, f)

In [None]:
with open('optimal_knn_model500.pkl', 'wb') as f:
    pickle.dump(knn_model, f)

In [3]:
#with open('optimal_knn_model500.pkl', 'rb') as f:
    #loaded_knn_model = pickle.load(f)

AttributeError: Can't get attribute 'MachineLearning' on <module '__main__'>

In [None]:
#with open('dic_all_predictions.pkl', 'rb') as f:
    #loaded_all_predictions = pickle.load(f)

In [None]:
#for i in range(1):
    #column_name= f'hour_{i}'
    #print(loaded_all_predictions['Brussels']['Sunday'][column_name].value_counts())
    #print(loaded_all_predictions['Arlon']['Friday'][column_name].value_counts())