In [1]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

def preprocess_states_dataset(
    file_path,
    output_csv_path,
    scaler_path='./models/scaler2.pkl',
    kmeans_path='./models/kmeans_model.pkl',
):
    """Clean a '$'-delimited CSV, scale its coordinates and assign location clusters.

    Steps, in order:
      1. Read ``file_path`` with ``$`` as the delimiter.
      2. Strip whitespace from the column names, drop the ``Unnamed: 0`` and
         ``mun_code`` columns when present, and rename ``lat``/``lon``/``mun_name``
         to ``Latitude``/``Longitude``/``state_name``.
      3. Keep only rows with -180 <= Longitude <= 180 and -90 <= Latitude <= 90
         (rows with missing coordinates fail the comparison and are dropped).
      4. Add placeholder ``Time`` and ``Speed_limit`` columns, run the persisted
         scaler over the four feature columns, and round the scaled coordinates
         to 4 decimals.
      5. Predict ``Location_Cluster`` from the scaled, rounded coordinates with
         the persisted KMeans model, then drop the placeholder columns.
      6. Write the result to ``output_csv_path`` using ``;`` as separator.

    Args:
        file_path: Path of the input CSV ('$'-delimited).
        output_csv_path: Path where the processed CSV is written.
        scaler_path: Path of the joblib-persisted scaler. Defaults to the
            location the original notebook used.
        kmeans_path: Path of the joblib-persisted KMeans model. Defaults to the
            location the original notebook used.

    Returns:
        The processed ``pandas.DataFrame`` (also written to ``output_csv_path``).

    Raises:
        KeyError: If the input has no longitude or latitude column after
            renaming.
    """
    df = pd.read_csv(file_path, delimiter='$')

    # Normalise header names first: the drop/rename steps below match on exact
    # names, so padded headers such as 'lat ' would otherwise slip through.
    df.columns = df.columns.str.strip()

    # Remove columns that are not needed downstream (absent ones are ignored).
    df = df.drop(columns=['Unnamed: 0', 'mun_code'], errors='ignore')

    # Align column names with the ones used by the rest of this function.
    df = df.rename(columns={
        'lat': 'Latitude',
        'lon': 'Longitude',
        'mun_name': 'state_name',
    })

    # Fail early with an explicit message instead of an opaque indexing error
    # further down: both coordinates are required by the scaler and KMeans.
    missing = [col for col in ('Longitude', 'Latitude') if col not in df.columns]
    if missing:
        raise KeyError(
            f"Missing required coordinate column(s) {missing} in {file_path!r}"
        )

    # Keep only rows with valid geographic coordinates. `.copy()` detaches the
    # result from the original frame so the assignments below do not trigger
    # SettingWithCopyWarning.
    valid_coords = (
        df['Longitude'].between(-180, 180) & df['Latitude'].between(-90, 90)
    )
    df = df.loc[valid_coords].copy()

    # SECURITY: joblib.load unpickles the file and can execute arbitrary code.
    # Only load model files from a trusted source, ideally produced with the
    # same scikit-learn version as the one installed here.
    scaler = joblib.load(scaler_path)
    kmeans = joblib.load(kmeans_path)

    # The scaler is applied to four columns, so placeholder columns are added
    # for the two features this dataset does not have.
    # NOTE(review): this column order presumably matches the order the scaler
    # was fitted with -- confirm against the training code.
    feature_order = ['Longitude', 'Latitude', 'Time', 'Speed_limit']
    df['Speed_limit'] = 0.0  # placeholder value
    df['Time'] = 0.0         # placeholder value
    df[feature_order] = scaler.transform(df[feature_order])

    # The scaled coordinates are rounded to 4 decimals before clustering.
    df['Longitude'] = df['Longitude'].round(4)
    df['Latitude'] = df['Latitude'].round(4)

    # Assign each row to a cluster based on its scaled coordinates.
    df['Location_Cluster'] = kmeans.predict(df[['Longitude', 'Latitude']])

    # The placeholders were only needed to satisfy the scaler.
    df = df.drop(columns=['Speed_limit', 'Time'])

    # Persist the cleaned, clustered dataset.
    df.to_csv(output_csv_path, index=False, sep=';')
    print(f"Données nettoyées et clusterisées sauvegardées dans : {output_csv_path}")

    return df

# Run the preprocessing pipeline on the UK dataset and keep the resulting frame
processed_df = preprocess_states_dataset(
    file_path="./part1/stn_UK.csv",
    output_csv_path="result_roud4_scaler2.csv",
)

processed_df

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Données nettoyées et clusterisées sauvegardées dans : result_roud4_scaler2.csv


Unnamed: 0,osm_name,st_name,state_name,Longitude,Latitude,Location_Cluster
0,"""77 Steps""",77 steps,Rainhill,-0.9438,0.5808,4
1,Allendale Avenue,allendale avenue,Rainhill,-0.9450,0.5792,4
2,Alness Drive,alness drive,Rainhill,-0.9428,0.5751,4
3,Amanda Road,amanda road,Rainhill,-0.9543,0.5841,4
4,Anderson Close,anderson close,Rainhill,-0.9414,0.5726,4
...,...,...,...,...,...,...
918074,Woolwich Foot Tunnel,woolwich foot tunnel,Woolwich Riverside,1.0645,-0.7434,1
918075,Worms Head path,worms head path,Gower,-2.0611,-0.6961,0
918076,Worsdell Street,worsdell street,Sleekburn,-0.0555,1.7633,2
918077,Wye Bridge,wye bridge,Thornwell,-0.8746,-0.6599,0


In [2]:
# Keep only the rows assigned to cluster 1, then display the filtered DataFrame
filtered_df = processed_df.loc[processed_df['Location_Cluster'].eq(1)]
filtered_df

Unnamed: 0,osm_name,st_name,state_name,Longitude,Latitude,Location_Cluster
738,"""Leading to"" Logan House and 1-3 Turners Meadow",leading to logan house 1 3 turners meadow,Aston Clinton & Stoke Mandeville,0.4897,-0.5306,1
739,Albert Orchard,albert orchard,Aston Clinton & Stoke Mandeville,0.5157,-0.5360,1
740,Alder Road,alder road,Aston Clinton & Stoke Mandeville,0.4631,-0.5367,1
741,Almond Tree Drive,almond tree drive,Aston Clinton & Stoke Mandeville,0.4632,-0.5363,1
742,Anstey Brook,anstey brook,Aston Clinton & Stoke Mandeville,0.4856,-0.5395,1
...,...,...,...,...,...,...
918061,Western Undercliff,western undercliff,Cliffsend and Pegwell,2.0157,-0.8607,1
918062,Western Way,western way,Charles Dickens,0.2287,-1.2200,1
918063,Westminster Bridge,westminster bridge,St. James's,0.9332,-0.7404,1
918065,Westview,westview,River,1.4043,-0.8035,1
