In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'UK_Accident.csv', encoding='latin1')

In [3]:
df.drop(columns=['Location_Northing_OSGR','Location_Easting_OSGR', 
                 'Urban_or_Rural_Area','Police_Force', 'Local_Authority_(District)',
                'Number_of_Casualties','Local_Authority_(Highway)','1st_Road_Class', '2nd_Road_Number',
                'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities',
                'Special_Conditions_at_Site','Carriageway_Hazards','Did_Police_Officer_Attend_Scene_of_Accident',
                'LSOA_of_Accident_Location','Unnamed: 0','1st_Road_Number','Junction_Control',
                'Accident_Index', 'Accident_Severity', 'Number_of_Vehicles', '2nd_Road_Class',
                'Road_Type', 'Light_Conditions', 'Year'], inplace=True)

In [4]:
df

Unnamed: 0,Longitude,Latitude,Date,Day_of_Week,Time,Speed_limit,Weather_Conditions,Road_Surface_Conditions
0,-0.191170,51.489096,04/01/2005,3,17:42,30,Raining without high winds,Wet/Damp
1,-0.211708,51.520075,05/01/2005,4,17:36,30,Fine without high winds,Dry
2,-0.206458,51.525301,06/01/2005,5,00:15,30,Fine without high winds,Dry
3,-0.173862,51.482442,07/01/2005,6,10:35,30,Fine without high winds,Dry
4,-0.156618,51.495752,10/01/2005,2,21:13,30,Fine without high winds,Wet/Damp
...,...,...,...,...,...,...,...,...
1504145,-3.417278,55.264773,07/12/2014,1,22:20,70,Snowing without high winds,Snow
1504146,-3.230255,55.054855,11/12/2014,5,06:40,70,Fine without high winds,Snow
1504147,-3.230826,54.985668,09/12/2014,3,02:00,40,Fine without high winds,Frost/Ice
1504148,-3.191397,54.990446,17/12/2014,4,06:55,60,Raining without high winds,Wet/Damp


In [5]:
df = df.drop_duplicates()

df = df.dropna()

In [6]:
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

df['Month'] = df['Date'].dt.month  
df['Day'] = df['Date'].dt.day      
df['Weekday'] = df['Date'].dt.weekday  
df['Is_Weekend'] = df['Weekday'].isin([5, 6]).astype(int)  

In [7]:
df

Unnamed: 0,Longitude,Latitude,Date,Day_of_Week,Time,Speed_limit,Weather_Conditions,Road_Surface_Conditions,Month,Day,Weekday,Is_Weekend
0,-0.191170,51.489096,2005-01-04,3,17:42,30,Raining without high winds,Wet/Damp,1,4,1,0
1,-0.211708,51.520075,2005-01-05,4,17:36,30,Fine without high winds,Dry,1,5,2,0
2,-0.206458,51.525301,2005-01-06,5,00:15,30,Fine without high winds,Dry,1,6,3,0
3,-0.173862,51.482442,2005-01-07,6,10:35,30,Fine without high winds,Dry,1,7,4,0
4,-0.156618,51.495752,2005-01-10,2,21:13,30,Fine without high winds,Wet/Damp,1,10,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1504145,-3.417278,55.264773,2014-12-07,1,22:20,70,Snowing without high winds,Snow,12,7,6,1
1504146,-3.230255,55.054855,2014-12-11,5,06:40,70,Fine without high winds,Snow,12,11,3,0
1504147,-3.230826,54.985668,2014-12-09,3,02:00,40,Fine without high winds,Frost/Ice,12,9,1,0
1504148,-3.191397,54.990446,2014-12-17,4,06:55,60,Raining without high winds,Wet/Damp,12,17,2,0


In [8]:
df.drop(columns=['Date','Day_of_Week'], inplace=True)

In [9]:
df

Unnamed: 0,Longitude,Latitude,Time,Speed_limit,Weather_Conditions,Road_Surface_Conditions,Month,Day,Weekday,Is_Weekend
0,-0.191170,51.489096,17:42,30,Raining without high winds,Wet/Damp,1,4,1,0
1,-0.211708,51.520075,17:36,30,Fine without high winds,Dry,1,5,2,0
2,-0.206458,51.525301,00:15,30,Fine without high winds,Dry,1,6,3,0
3,-0.173862,51.482442,10:35,30,Fine without high winds,Dry,1,7,4,0
4,-0.156618,51.495752,21:13,30,Fine without high winds,Wet/Damp,1,10,0,0
...,...,...,...,...,...,...,...,...,...,...
1504145,-3.417278,55.264773,22:20,70,Snowing without high winds,Snow,12,7,6,1
1504146,-3.230255,55.054855,06:40,70,Fine without high winds,Snow,12,11,3,0
1504147,-3.230826,54.985668,02:00,40,Fine without high winds,Frost/Ice,12,9,1,0
1504148,-3.191397,54.990446,06:55,60,Raining without high winds,Wet/Damp,12,17,2,0


In [10]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
import numpy as np
import pickle

def time_to_minutes(time_str):
    if isinstance(time_str, str):
        try:
            hour, minute = map(int, time_str.split(':'))
            return hour * 60 + minute
        except ValueError:
            return None  
    else:
        return None  

df['Time'] = df['Time'].apply(time_to_minutes)


weather_map = {
    'Raining without high winds': 'Rain',
    'Fine without high winds': 'Clear',
    'Snowing without high winds': 'Snow',
    'Raining without high wind': 'Rain'
}

road_surface_map = {
    'Wet/Damp': 'Wet',
    'Dry': 'Dry',
    'Snow': 'Icy',
    'Frost/Ice': 'Icy'
}

df['Weather_Conditions'] = df['Weather_Conditions'].map(weather_map)
df['Road_Surface_Conditions'] = df['Road_Surface_Conditions'].map(road_surface_map)

#Normalisation for numerical features
scaler = StandardScaler()
df[['Longitude', 'Latitude', 'Time', 'Speed_limit']] = scaler.fit_transform(df[['Longitude', 'Latitude', 'Time', 'Speed_limit']])
# Sauvegarder le scaler dans un fichier .pkl
with open('./models/scaler2.pkl', 'wb') as file:
    pickle.dump(scaler, file)
#ecnoding for cateforical features+month, day and weekday for the association
df = pd.get_dummies(df, columns=['Weather_Conditions', 'Road_Surface_Conditions', 'Month', 'Day', 'Weekday'], drop_first=True)


#converting booleans from true and false to 0 and 1
df[df.select_dtypes(include=[bool]).columns] = df.select_dtypes(include=[bool]).astype(int)

df


Unnamed: 0,Longitude,Latitude,Time,Speed_limit,Is_Weekend,Weather_Conditions_Rain,Weather_Conditions_Snow,Road_Surface_Conditions_Icy,Road_Surface_Conditions_Wet,Month_2,...,Day_28,Day_29,Day_30,Day_31,Weekday_1,Weekday_2,Weekday_3,Weekday_4,Weekday_5,Weekday_6
0,0.883986,-0.748483,0.714189,-0.640585,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,0.869361,-0.727143,0.694736,-0.640585,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.873100,-0.723543,-2.680343,-0.640585,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.896311,-0.753066,-0.670209,-0.640585,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.908591,-0.743898,1.398283,-0.640585,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1504145,-1.413332,1.852394,1.615507,2.182195,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1504146,-1.280153,1.707792,-1.432115,2.182195,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1504147,-1.280559,1.660132,-2.339917,0.065110,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1504148,-1.252482,1.663424,-1.383483,1.476500,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [11]:
import joblib
#clustering longitude and latitude to help binarize the location for the association
kmeans = KMeans(n_clusters=5, random_state=42)
df['Longitude']= df['Longitude'].round(4)
df['Latitude']= df['Latitude'].round(4)
df['Location_Cluster'] = kmeans.fit_predict(df[['Longitude', 'Latitude']])
df_ = pd.get_dummies(df, columns=['Location_Cluster'], prefix='Cluster')


#creating a new feature to binarize time
df_['Time_Bin'] = pd.cut(df_['Time'], bins=[0, 6, 12, 18, 24], labels=["Night", "Morning", "Afternoon", "Evening"])
df_ = pd.get_dummies(df_, columns=['Time_Bin'], drop_first=True)

df_['Hour'] = pd.to_datetime(df_['Time']).dt.hour

#sin and cos are for the cyclic encoding, which helps understand that the hour 23 is close to 0 eventhou they're mathematically far from each other
#df['hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
#df['hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)


#making the speed binazized
bins = [-np.inf, -1, 0, 1, np.inf]  
labels = ['Very Low', 'Low', 'High', 'Very High']  
df_['Speed_Limit_Bin'] = pd.cut(df_['Speed_limit'], bins=bins, labels=labels)
df_ = pd.get_dummies(df_, columns=['Speed_Limit_Bin'], drop_first=True)


df_ = df_.drop(['Time', 'Hour', 'Longitude', 'Latitude', 'Speed_limit'], axis=1)

df_ = df_.dropna()

print(df_)



         Is_Weekend  Weather_Conditions_Rain  Weather_Conditions_Snow  \
0                 0                        1                        0   
1                 0                        0                        0   
2                 0                        0                        0   
3                 0                        0                        0   
4                 0                        0                        0   
...             ...                      ...                      ...   
1504145           1                        0                        1   
1504146           0                        0                        0   
1504147           0                        0                        0   
1504148           0                        1                        0   
1504149           0                        0                        0   

         Road_Surface_Conditions_Icy  Road_Surface_Conditions_Wet  Month_2  \
0                                  0         

In [12]:
unique_clusters = df['Location_Cluster'].nunique()
print(f"Nombre unique de clusters : {unique_clusters}")


Nombre unique de clusters : 5


In [13]:
df_

Unnamed: 0,Is_Weekend,Weather_Conditions_Rain,Weather_Conditions_Snow,Road_Surface_Conditions_Icy,Road_Surface_Conditions_Wet,Month_2,Month_3,Month_4,Month_5,Month_6,...,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Time_Bin_Morning,Time_Bin_Afternoon,Time_Bin_Evening,Speed_Limit_Bin_Low,Speed_Limit_Bin_High,Speed_Limit_Bin_Very High
0,0,1,0,0,1,0,0,0,0,0,...,True,False,False,False,False,False,False,True,False,False
1,0,0,0,0,0,0,0,0,0,0,...,True,False,False,False,False,False,False,True,False,False
2,0,0,0,0,0,0,0,0,0,0,...,True,False,False,False,False,False,False,True,False,False
3,0,0,0,0,0,0,0,0,0,0,...,True,False,False,False,False,False,False,True,False,False
4,0,0,0,0,1,0,0,0,0,0,...,True,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1504145,1,0,1,1,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,False,True
1504146,0,0,0,1,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,False,True
1504147,0,0,0,1,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,True,False
1504148,0,1,0,0,1,0,0,0,0,0,...,False,False,True,False,False,False,False,False,False,True


In [14]:
#converting booleans from true and false to 0 and 1
df_[df_.select_dtypes(include=[bool]).columns] = df_.select_dtypes(include=[bool]).astype(int)
#printing column names
print(df_.columns)


Index(['Is_Weekend', 'Weather_Conditions_Rain', 'Weather_Conditions_Snow',
       'Road_Surface_Conditions_Icy', 'Road_Surface_Conditions_Wet', 'Month_2',
       'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8',
       'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Day_2', 'Day_3',
       'Day_4', 'Day_5', 'Day_6', 'Day_7', 'Day_8', 'Day_9', 'Day_10',
       'Day_11', 'Day_12', 'Day_13', 'Day_14', 'Day_15', 'Day_16', 'Day_17',
       'Day_18', 'Day_19', 'Day_20', 'Day_21', 'Day_22', 'Day_23', 'Day_24',
       'Day_25', 'Day_26', 'Day_27', 'Day_28', 'Day_29', 'Day_30', 'Day_31',
       'Weekday_1', 'Weekday_2', 'Weekday_3', 'Weekday_4', 'Weekday_5',
       'Weekday_6', 'Cluster_0', 'Cluster_1', 'Cluster_2', 'Cluster_3',
       'Cluster_4', 'Time_Bin_Morning', 'Time_Bin_Afternoon',
       'Time_Bin_Evening', 'Speed_Limit_Bin_Low', 'Speed_Limit_Bin_High',
       'Speed_Limit_Bin_Very High'],
      dtype='object')


In [15]:
!pip install mlxtend



In [16]:
from mlxtend.frequent_patterns import apriori, association_rules

#apriori on the preprocessed encoded dataset
frequent_itemsets = apriori(df_, min_support=0.05, use_colnames=True)

#calculating the number of itemsets because of the num_itemsets error
num_itemsets = len(frequent_itemsets)

#generating the association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0, num_itemsets=num_itemsets)

print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])



                                          antecedents  \
0                                         (Weekday_5)   
1                                        (Is_Weekend)   
2                                         (Weekday_6)   
3                                        (Is_Weekend)   
4                                         (Cluster_4)   
..                                                ...   
61                                        (Cluster_1)   
62  (Speed_Limit_Bin_Low, Road_Surface_Conditions_...   
63     (Speed_Limit_Bin_Low, Weather_Conditions_Rain)   
64                      (Road_Surface_Conditions_Wet)   
65                          (Weather_Conditions_Rain)   

                                          consequents   support  confidence  \
0                                        (Is_Weekend)  0.133978    1.000000   
1                                         (Weekday_5)  0.133978    0.549726   
2                                        (Is_Weekend)  0.109740    1.000000   

In [17]:
from mlxtend.frequent_patterns import fpgrowth

#Apply FP-Growth on the preprocessed dataset
frequent_itemsets = fpgrowth(df_, min_support=0.05, use_colnames=True)
num_itemsets = len(frequent_itemsets)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0, num_itemsets=num_itemsets)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])



                      antecedents                    consequents   support  \
0           (Speed_Limit_Bin_Low)                    (Cluster_1)  0.274630   
1                     (Cluster_1)          (Speed_Limit_Bin_Low)  0.274630   
2                     (Weekday_1)          (Speed_Limit_Bin_Low)  0.097454   
3           (Speed_Limit_Bin_Low)                    (Weekday_1)  0.097454   
4                     (Weekday_1)                    (Cluster_1)  0.062194   
..                            ...                            ...       ...   
61  (Road_Surface_Conditions_Wet)                    (Cluster_4)  0.066106   
62                    (Cluster_4)                   (Is_Weekend)  0.053243   
63                   (Is_Weekend)                    (Cluster_4)  0.053243   
64  (Road_Surface_Conditions_Wet)                    (Cluster_2)  0.052726   
65                    (Cluster_2)  (Road_Surface_Conditions_Wet)  0.052726   

    confidence      lift  
0     0.428119  1.045080  
1     0.6

In [18]:
import pandas as pd

def get_location_info(state_name, file_path="result_roud4_scaler2.csv"):

    try:
        df = pd.read_csv(file_path, sep=';')
        
        required_columns = {'state_name', 'Location_Cluster', 'Longitude', 'Latitude'}
        if not required_columns.issubset(df.columns):
            missing_columns = required_columns - set(df.columns)
            return f"Les colonnes manquantes dans le fichier : {', '.join(missing_columns)}."

        result = df[df['state_name'] == state_name]
        
        if not result.empty:
            location_info = {
                'Location_Cluster': result.iloc[0]['Location_Cluster'],
                'longitude': result.iloc[0]['Longitude'],
                'latitude': result.iloc[0]['Latitude']
            }
            return location_info
        else:
            return f"Le state_name '{state_name}' n'a pas été trouvé dans le fichier."
    except FileNotFoundError:
        return f"Le fichier '{file_path}' est introuvable."
    except Exception as e:
        return f"Une erreur s'est produite : {e}"

state_name = "Ridge"
location_info = get_location_info(state_name)
print(f"Les informations pour '{state_name}' sont : {location_info}")

with open('get_location_info.pkl', 'wb') as get_location_info_f:
    pickle.dump(get_location_info, get_location_info_f)


Les informations pour 'Ridge' sont : {'Location_Cluster': 2, 'longitude': 0.5771, 'latitude': 0.6511}


In [19]:
import pickle
import requests

print (location_info['Location_Cluster'])
with open(r'./models/scaler2.pkl', 'rb') as file:
    scaler = pickle.load(file)

mean_time = 0  
mean_speed_limit = 0  
scaled_input = [[location_info['longitude'], location_info['latitude'], mean_time, mean_speed_limit]]

original_coords = scaler.inverse_transform(scaled_input)

original_lon = original_coords[0][0]
original_lat = original_coords[0][1]

print(f"Longitude originale: {original_lon}, Latitude originale: {original_lat}")

def get_weather_forecast(lat, lon, api_key):
    url = f"http://api.weatherapi.com/v1/forecast.json?key={api_key}&q={lat},{lon}&days=7"

    response = requests.get(url)
    
    if response.status_code == 200:
        weather_data = response.json()

        if 'forecast' in weather_data:
            forecast = []
            for day in weather_data['forecast']['forecastday']:
                date = day['date']
                description = day['day']['condition']['text']
                forecast.append({"Date": date, "Météo": description})
            return forecast
        else:
            return "Les données météorologiques ne sont pas disponibles."
    else:
        return f"Erreur HTTP: {response.status_code} - {response.text}"

api_key = "1604376839c243cfa8f223439240712"  

forecast = get_weather_forecast(original_lat, original_lon, api_key)
print(forecast)

with open('get_weather_forecast.pkl', 'wb') as get_weather_forecast_f:
    pickle.dump(get_weather_forecast, get_weather_forecast_f)
    

2
Longitude originale: -0.6221280110083248, Latitude originale: 53.52086160655638
[{'Date': '2024-12-12', 'Météo': 'Patchy rain nearby'}, {'Date': '2024-12-13', 'Météo': 'Overcast '}, {'Date': '2024-12-14', 'Météo': 'Partly Cloudy '}, {'Date': '2024-12-15', 'Météo': 'Overcast '}, {'Date': '2024-12-16', 'Météo': 'Overcast '}, {'Date': '2024-12-17', 'Météo': 'Overcast '}, {'Date': '2024-12-18', 'Météo': 'Patchy rain nearby'}]


In [20]:
def convert_to_binary(weather_description):
    rain_keywords = [
        'rain', 'drizzle', 'showers', 'thunderstorm', 'sleet',
        'light rain', 'moderate rain', 'heavy rain',
        'freezing rain', 'rain shower', 'patchy rain'
    ]
    
    snow_keywords = [
        'snow', 'blizzard', 'flurries', 'sleet',
        'light snow', 'moderate snow', 'heavy snow',
        'snow shower', 'patchy snow', 'freezing snow'
    ]
    
    Weather_Conditions_Rain = 1 if any(keyword in weather_description.lower() for keyword in rain_keywords) else 0
    Weather_Conditions_Snow = 1 if any(keyword in weather_description.lower() for keyword in snow_keywords) else 0
    
    return Weather_Conditions_Rain, Weather_Conditions_Snow
for day in forecast:
    Weather_Conditions_Rain, Weather_Conditions_Snow = convert_to_binary(day['Météo'])
    print(f"Date: {day['Date']} - Weather_Conditions_Rain: {Weather_Conditions_Rain}, Weather_Conditions_Snow: {Weather_Conditions_Snow}")

with open('convert_to_binary.pkl', 'wb') as convert_to_binary_f:
    pickle.dump(convert_to_binary, convert_to_binary_f)

Date: 2024-12-12 - Weather_Conditions_Rain: 1, Weather_Conditions_Snow: 0
Date: 2024-12-13 - Weather_Conditions_Rain: 0, Weather_Conditions_Snow: 0
Date: 2024-12-14 - Weather_Conditions_Rain: 0, Weather_Conditions_Snow: 0
Date: 2024-12-15 - Weather_Conditions_Rain: 0, Weather_Conditions_Snow: 0
Date: 2024-12-16 - Weather_Conditions_Rain: 0, Weather_Conditions_Snow: 0
Date: 2024-12-17 - Weather_Conditions_Rain: 0, Weather_Conditions_Snow: 0
Date: 2024-12-18 - Weather_Conditions_Rain: 1, Weather_Conditions_Snow: 0


In [21]:
import pandas as pd

 
def process_forecast(forecast):

    df = pd.DataFrame(forecast)

    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['Weekday'] = df['Date'].dt.weekday
    df['Is_Weekend'] = df['Weekday'].isin([5, 6]).astype(int)

    df[['Weather_Conditions_Rain', 'Weather_Conditions_Snow']] = df['Météo'].apply(
        lambda x: pd.Series(convert_to_binary(x))
    )

    month_dummies = pd.get_dummies(df['Month'], prefix='Month')
    day_dummies = pd.get_dummies(df['Day'], prefix='Day')
    weekday_dummies = pd.get_dummies(df['Weekday'], prefix='Weekday')

    for col in [f'Month_{i}' for i in range(2, 13)]:
        if col not in month_dummies:
            month_dummies[col] = 0

    for col in [f'Day_{i}' for i in range(2, 32)]:
        if col not in day_dummies:
            day_dummies[col] = 0

    for col in [f'Weekday_{i}' for i in range(1, 7)]:
        if col not in weekday_dummies:
            weekday_dummies[col] = 0

    df = pd.concat([df, month_dummies, weekday_dummies], axis=1)

    df.drop(columns=['Month', 'Day', 'Weekday'], inplace=True)

    ordered_columns = (
        [f'Month_{i}' for i in range(2, 13)] +
        [f'Weekday_{i}' for i in range(1, 7)] +
        ['Is_Weekend', 'Weather_Conditions_Rain', 'Weather_Conditions_Snow']
    )
    df = df[['Date', 'Météo'] + ordered_columns]

    binary_columns = ordered_columns  
    df[binary_columns] = df[binary_columns].astype(int)

    

    return df

 
result = process_forecast(forecast)

result['Location_Cluster'] = location_info['Location_Cluster']
result= pd.get_dummies(result, columns=['Location_Cluster'], prefix='Cluster')
for col in result.columns:
    if col.startswith('Cluster_'):
        result[col] = result[col].astype(int)
result_ = result.drop(columns=['Date', 'Météo'])

print(result_)


with open('process_forecast.pkl', 'wb') as process_forecast_f:
    pickle.dump(process_forecast, process_forecast_f)


   Month_2  Month_3  Month_4  Month_5  Month_6  Month_7  Month_8  Month_9  \
0        0        0        0        0        0        0        0        0   
1        0        0        0        0        0        0        0        0   
2        0        0        0        0        0        0        0        0   
3        0        0        0        0        0        0        0        0   
4        0        0        0        0        0        0        0        0   
5        0        0        0        0        0        0        0        0   
6        0        0        0        0        0        0        0        0   

   Month_10  Month_11  ...  Weekday_1  Weekday_2  Weekday_3  Weekday_4  \
0         0         0  ...          0          0          1          0   
1         0         0  ...          0          0          0          1   
2         0         0  ...          0          0          0          0   
3         0         0  ...          0          0          0          0   
4         0  

In [22]:
result = result.drop(columns=[ 'Météo'])


In [23]:

result_df = pd.DataFrame(result)

result_df

Unnamed: 0,Date,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,...,Weekday_1,Weekday_2,Weekday_3,Weekday_4,Weekday_5,Weekday_6,Is_Weekend,Weather_Conditions_Rain,Weather_Conditions_Snow,Cluster_2
0,2024-12-12,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
1,2024-12-13,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,2024-12-14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
3,2024-12-15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
4,2024-12-16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,2024-12-17,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
6,2024-12-18,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1


In [24]:
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd


frequent_itemsets = apriori(df_, min_support=0.05, use_colnames=True)
num_itemsets = len(frequent_itemsets)

apriori_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5, num_itemsets=num_itemsets)

with open('frequent_itemsets.pkl', 'wb') as f: 
    pickle.dump(frequent_itemsets, f) 
    
with open('apriori_rules.pkl', 'wb') as f: 
    pickle.dump(apriori_rules, f)

def calculate_probability(row, rules):
    confidences = []
    used_rules = []
    for _, rule in rules.iterrows():
        antecedents = list(rule['antecedents'])
        if all(antecedent in row and row[antecedent] == 1 for antecedent in antecedents):
            confidences.append(rule['confidence'])
            used_rules.append(rule['antecedents'])
    probability = sum(confidences) / len(confidences) if confidences else 0
    return probability


result_df['Probability'] = result_df.apply(lambda row: calculate_probability(row, apriori_rules), axis=1)

print(result_df[['Date', 'Probability']])

with open('calculate_probability.pkl', 'wb') as calculate_probability_f:
    pickle.dump(calculate_probability, calculate_probability_f)




        Date  Probability
0 2024-12-12     0.681008
1 2024-12-13     0.634494
2 2024-12-14     0.666059
3 2024-12-15     0.650485
4 2024-12-16     0.628099
5 2024-12-17     0.636745
6 2024-12-18     0.681245


In [25]:
print(result_df.columns)


Index(['Date', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6',
       'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12',
       'Weekday_1', 'Weekday_2', 'Weekday_3', 'Weekday_4', 'Weekday_5',
       'Weekday_6', 'Is_Weekend', 'Weather_Conditions_Rain',
       'Weather_Conditions_Snow', 'Cluster_2', 'Probability'],
      dtype='object')
