In [None]:
import os
import time
from io import StringIO

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, timedelta

#preprocessing
from sklearn.model_selection import TimeSeriesSplit

from sklearn.impute import SimpleImputer
#model
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.arima_model import ARIMA
from pmdarima import auto_arima
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from prophet import Prophet

#metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

from google.cloud import storage
import mlflow.sklearn
import mlflow



In [198]:
import os
import time
from io import StringIO

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, timedelta

#preprocessing
from sklearn.model_selection import TimeSeriesSplit

from sklearn.impute import SimpleImputer
#model
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.arima_model import ARIMA
from pmdarima import auto_arima
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from prophet import Prophet

#metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

from google.cloud import storage
import mlflow.sklearn
import mlflow

# Feature Engineering Pipeline

In [2]:
def read_data_from_gcs(bucket_name, folder, filename, delimiter=','):
    """Download a delimited text object from Google Cloud Storage as a DataFrame.

    The object ``{folder}/{filename}`` is fetched from ``bucket_name`` through
    the module-level ``storage_client``, which must exist before the call.

    Args:
        bucket_name: Name of the GCS bucket.
        folder: Object prefix inside the bucket (joined to ``filename`` with '/').
        filename: Object name; also shown in the progress message.
        delimiter: Field separator passed to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    started_at = time.time()
    bucket = storage_client.get_bucket(bucket_name)
    raw_text = bucket.blob(f'{folder}/{filename}').download_as_text()
    frame = pd.read_csv(StringIO(raw_text), delimiter=delimiter)
    # The reported time covers both the download and the CSV parse.
    elapsed_time = time.time() - started_at
    print(f"Read {filename} complete. Elapsed time: {elapsed_time:.2f} seconds")
    return frame

def remove_lawyers(df, lawyer_ids):
    """Return the rows of ``df`` whose ``lawyer_id`` is not listed in ``lawyer_ids``."""
    is_excluded = df['lawyer_id'].isin(lawyer_ids)
    return df[~is_excluded]

# Build Lag_1 .. Lag_<lag> columns from the 'count' column of a time series frame.
# NOTE: a later cell defines another function with this same name (lower-case
# 'lag_' columns, NaNs mean-filled); once that cell runs it replaces this one.
def create_lagged_features(data, lag):
    """Return a copy of ``data`` with ``lag`` shifted copies of ``'count'`` appended.

    Column ``Lag_k`` holds ``data['count']`` shifted down by ``k`` rows, so the
    first ``k`` entries of that column are NaN. ``data`` itself is not modified.
    """
    lag_columns = {
        f'Lag_{step}': data['count'].shift(step)
        for step in range(1, lag + 1)
    }
    return data.assign(**lag_columns)

def data_process_ml(df, train_index, test_index):
    """Build lag features and split them into train/test features and targets.

    Args:
        df: Frame with a ``'date'`` column (becomes the index) and a ``'count'`` column.
        train_index: Positional row indices of the training fold.
        test_index: Positional row indices of the test fold.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` where X is every column
        except ``'count'`` and y is the ``'count'`` column.
    """
    lag = 7  # number of lagged 'count' columns to add; adjust as needed
    features = create_lagged_features(df.set_index('date'), lag)

    target = features['count']
    predictors = features.drop(columns=['count'])

    return (
        predictors.iloc[train_index],
        target.iloc[train_index],
        predictors.iloc[test_index],
        target.iloc[test_index],
    )


def visualize(df, x, y):
    """Draw a line plot of column ``y`` against column ``x`` of ``df`` and show it.

    The title and axis labels are fixed ('Daily Chat Consultations', 'Date',
    'Count'); nothing is returned.
    """
    # Global seaborn look for this and subsequent figures.
    sns.set_style("whitegrid")
    sns.set_palette("tab10")

    plt.figure(figsize=(12, 6))
    axes = sns.lineplot(data=df, x=x, y=y)

    # Labelling
    axes.set_title('Daily Chat Consultations', fontsize=16)
    axes.set_xlabel('Date', fontsize=16)
    axes.set_ylabel('Count', fontsize=16)
    plt.xticks(rotation=45)  # slanted tick labels are easier to read

    # Light grid on top of the seaborn style
    axes.grid(True, alpha=0.5)

    plt.tight_layout()  # keep labels inside the figure
    plt.show()
    


def data_process_stats(df, train_index, test_index):
    """Split the ``'count'`` series into train and test parts by row position.

    Args:
        df: Frame containing a ``'count'`` column.
        train_index: Positional row indices of the training fold.
        test_index: Positional row indices of the test fold.

    Returns:
        Tuple ``(train_data, test_data)`` of ``pandas.Series``.
    """
    time_series = df['count']
    # The sibling helpers (data_process_ml, data_process_prophet) apply these
    # indices with .iloc. Plain ``series[indices]`` is label-based on an integer
    # index, so it fails or picks the wrong rows unless the index is exactly 0..n-1.
    return time_series.iloc[train_index], time_series.iloc[test_index]

def model_auto_arima(train_data, test_data):
    """Fit ``auto_arima`` on ``train_data`` and forecast ``len(test_data)`` steps.

    The search is seasonal with period ``m=7`` and prints its trace. Only the
    point forecast is returned; the confidence interval is computed but discarded.
    """
    fitted = auto_arima(train_data, seasonal=True, m=7, trace=True)
    horizon = len(test_data)
    forecast, _interval = fitted.predict(horizon, return_conf_int=True)
    return forecast

def data_process_prophet(df, train_index, test_index):
    """Rename ``date``/``count`` to ``ds``/``y`` and split the rows by position.

    Returns:
        Tuple ``(train_data, test_data)`` of DataFrames; ``df`` is left unchanged.
    """
    prophet_frame = df.rename(columns={'date': 'ds', 'count': 'y'})
    return prophet_frame.iloc[train_index], prophet_frame.iloc[test_index]

def model_prophet(train_data_pr, test_data_pr):
    """Fit a default ``Prophet`` model on the training frame and predict the test frame.

    Returns:
        The ``'yhat'`` column of the prediction frame.
    """
    forecaster = Prophet()
    forecaster.fit(train_data_pr)
    forecast = forecaster.predict(test_data_pr)
    return forecast['yhat']

In [7]:
# Export the service-account key path through GOOGLE_APPLICATION_CREDENTIALS
# before the storage client is created. The path is relative to the current
# working directory and is made absolute here.
relative_path = '../../deep-flash-sa.json'
file_path = os.path.abspath(relative_path)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = file_path
# Module-level client; read_data_from_gcs reads this global.
storage_client = storage.Client()

In [8]:
# Download the pipe-delimited consultations export from GCS.
# NOTE(review): df_consultations is not referenced again in the visible cells;
# a later cell loads the same object again through get_data_consultation().
df_consultations = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/consultations', 'consultations.csv', delimiter='|')

Read consultations.csv complete. Elapsed time: 3.66 seconds


In [24]:
def run():
    """Planned entry point for the feature-engineering pipeline (unfinished).

    At present this only loads the three raw inputs into local variables; the
    transform, merge and load stages are placeholders and nothing is returned.
    """

    # get consultation data
    data_consultation = get_data_consultation()
    # get availability data
    data_availability = get_data_availability()
    # get web visitor data
    data_web = get_data_web_visitor()
    # TODO: transform consultation

    # TODO: transform availability

    # TODO: transform web visitor

    # TODO: merge

    # TODO: load

In [9]:
def get_data_consultation():
    """Load the pipe-delimited consultations CSV from the 'perqara-dendrobium' bucket."""
    return read_data_from_gcs(
        'perqara-dendrobium',
        'raw/postgres/csv/consultations',
        'consultations.csv',
        delimiter='|',
    )

def get_data_availability():
    """Load the comma-delimited availability_instants CSV from the 'perqara-dendrobium' bucket.

    Returns:
        The DataFrame produced by ``read_data_from_gcs``.
    """
    # The original chained assignment also bound an unused local
    # (df_availability_instants); only the returned frame is needed.
    return read_data_from_gcs(
        'perqara-dendrobium',
        'raw/postgres/csv/availability_instants',
        'availability_instants.csv',
        delimiter=',',
    )
    
def get_data_web_visitor():
    """Read the local file 'website-visitor_20231114.csv' (relative to the working directory)."""
    return pd.read_csv('website-visitor_20231114.csv')

In [10]:
def remove_lawyer(df, lawyer_ids):
    """Drop every row whose ``lawyer_id`` appears in ``lawyer_ids``.

    Same behaviour as ``remove_lawyers`` defined earlier in this notebook.
    """
    keep_row = ~df['lawyer_id'].isin(lawyer_ids)
    return df[keep_row]

def filter_status(df, status=600):
    """Return the rows of ``df`` whose ``'status'`` equals ``status``.

    Args:
        df: Frame with a ``'status'`` column.
        status: Status code to keep. Defaults to 600, the value this notebook
            filters on.

    Returns:
        The filtered DataFrame (original index preserved).
    """
    has_status = df['status'] == status
    return df[has_status]

def cast_column(df, columns):
    """Return a copy of ``df`` with each column in ``columns`` parsed by ``pd.to_datetime``.

    The input frame is not modified.
    """
    converted = df.copy()
    for name in columns:
        parsed = pd.to_datetime(converted[name])
        converted[name] = parsed
    return converted

def count_daily_consultations(df):
    """Count rows per calendar day of ``'created_at'``.

    ``'created_at'`` must already be datetime-typed. Days inside the observed
    range that have no rows appear with a count of 0.

    Returns:
        DataFrame with columns ``'created_at'`` (the day) and ``0`` (the row count).
    """
    stamps = df[['created_at']].set_index('created_at')
    per_day = stamps.resample('D').size()
    return per_day.reset_index()

def rename_columns_consultation(df):
    """Return ``df`` with ``'created_at'`` renamed to ``'date'`` and column ``0`` to ``'count'``.

    Column ``0`` is the unnamed size column produced by
    ``count_daily_consultations``. A new frame is returned: the previous
    ``inplace=True`` rename also altered the caller's frame, so re-running a
    cell operated on already-renamed data.
    """
    return df.rename(columns={'created_at': 'date', 0: 'count'})

# Add lag_1 .. lag_<lag> columns built from 'count', then mean-fill missing values.
def create_lagged_features(data, lag):
    """Return a copy of ``data`` with lagged ``'count'`` columns and NaNs filled.

    Column ``lag_k`` is ``data['count']`` shifted down by ``k`` rows. Every NaN
    in a numeric column (the lag columns included) is then replaced by that
    column's mean. ``data`` itself is not modified.
    """
    shifted = {
        f'lag_{step}': data['count'].shift(step)
        for step in range(1, lag + 1)
    }
    with_lags = data.assign(**shifted)
    column_means = with_lags.mean(numeric_only=True)
    return with_lags.fillna(column_means)

def filter_columns(df, column):
    """Select ``column`` from ``df`` with plain ``df[...]`` indexing.

    In this notebook ``column`` is a list of names, which yields a DataFrame
    restricted to those columns.
    """
    return df[column]

def calculate_duration(df):
    """Return ``df`` with a ``'duration'`` column: hours between start and end.

    The span ``end_datetime - start_datetime`` is reduced to whole minutes and
    then expressed in hours, matching what the earlier
    ``.astype('timedelta64[m]') / 60`` produced on pandas 1.x. pandas 2.x
    rejects that cast (only s/ms/us/ns resolutions are supported), so the
    minutes are derived from ``total_seconds()`` instead.

    Args:
        df: Frame with datetime-typed ``'start_datetime'`` and ``'end_datetime'``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    elapsed = df['end_datetime'] - df['start_datetime']
    whole_minutes = elapsed.dt.total_seconds() // 60
    return df.assign(duration=whole_minutes / 60)

def extract_date(df, column):
    """Return ``df`` with a ``'date'`` column holding the calendar day of ``column``.

    The day is taken with ``.dt.date`` and converted back to datetime, so the
    time-of-day part is dropped.

    Args:
        df: Input frame.
        column: Name of a datetime-typed column.

    Returns:
        A new DataFrame; the input frame is not modified (the earlier version
        added the column to the caller's frame as a side effect).
    """
    calendar_day = pd.to_datetime(df[column].dt.date)
    return df.assign(date=calendar_day)

def filter_working_hours(df, start_hour=10, end_hour=18):
    """Keep rows that start at or after ``start_hour`` and end at or before ``end_hour``.

    Only the hour component is compared, so with the default ``end_hour=18`` a
    row ending at 18:59 is still kept.

    Args:
        df: Frame with datetime-typed ``'start_datetime'`` and ``'end_datetime'``.
        start_hour: Earliest allowed start hour (inclusive). Defaults to 10.
        end_hour: Latest allowed end hour (inclusive). Defaults to 18.

    Returns:
        The filtered DataFrame.
    """
    starts_in_window = df['start_datetime'].dt.hour >= start_hour
    ends_in_window = df['end_datetime'].dt.hour <= end_hour
    return df[starts_in_window & ends_in_window]

def filter_late_hours(df, after_hour=18):
    """Keep rows whose ``'end_datetime'`` hour is strictly greater than ``after_hour``.

    The earlier version also required ``hour <= 24``; ``.dt.hour`` never
    exceeds 23, so that bound was always true and has been dropped.

    Args:
        df: Frame with a datetime-typed ``'end_datetime'`` column.
        after_hour: Exclusive lower bound on the end hour. Defaults to 18.

    Returns:
        The filtered DataFrame.
    """
    ends_late = df['end_datetime'].dt.hour > after_hour
    return df[ends_late]

def get_daily_lawyer_count(df, time_column, count_column):
    """Count distinct ``count_column`` values per calendar day of ``time_column``.

    Returns:
        Series indexed by ``datetime.date`` containing the number of unique values.
    """
    day_key = df[time_column].dt.date
    per_day = df.groupby(day_key)[count_column]
    return per_day.nunique()

def fill_date_range(daily_count):
    """Reindex ``daily_count`` onto every day between its first and last index value.

    Missing days and pre-existing NaNs both become 0. The day index is then
    moved into a regular column by ``reset_index()``.
    """
    first_day = daily_count.index.min()
    last_day = daily_count.index.max()
    every_day = pd.date_range(start=first_day, end=last_day, freq='D')
    # fill_value covers days added by the reindex; fillna covers NaNs already present.
    completed = daily_count.reindex(every_day, fill_value=0).fillna(0)
    return completed.reset_index()

def rename_index(df):
    """Return ``df`` with the column named ``'index'`` renamed to ``'date'``."""
    return df.rename(columns={'index': 'date'})

def create_df_lawyer_count(working_hours_lawyers, late_hours_lawyers):
    """Combine the two daily-count inputs into one frame, one column each.

    When both inputs are Series they are aligned on the union of their indexes;
    a day present in only one of them gets NaN in the other column.
    """
    return pd.DataFrame({
        'working_hours_lawyers': working_hours_lawyers,
        'late_hours_lawyers': late_hours_lawyers,
    })

def rename_columns_website(df):
    """Return ``df`` with ``'event_date'`` -> ``'date'`` and ``'f0_'`` -> ``'web_visitor'``."""
    return df.rename(columns={'event_date': 'date', 'f0_': 'web_visitor'})

def convert_date_to_datetime(df):
    """Return ``df`` with ``'date'`` parsed from ``YYYYMMDD`` into datetime.

    A new frame is returned; the input is left untouched (the earlier version
    overwrote the caller's ``'date'`` column in place).
    """
    parsed = pd.to_datetime(df['date'], format='%Y%m%d')
    return df.assign(date=parsed)

def generate_date_range(start_date, end_date):
    """Return the ``pd.date_range`` from ``start_date`` to ``end_date`` at the default frequency."""
    days = pd.date_range(start=start_date, end=end_date)
    return days

def calculate_mean_count(df, count_column):
    """Return the mean of ``df[count_column]``."""
    values = df[count_column]
    return values.mean()

def create_new_data(date_range, mean_count):
    """Build a frame with a ``'date'`` column from ``date_range`` and ``'web_visitor'`` set to ``mean_count``."""
    return pd.DataFrame({'date': date_range, 'web_visitor': mean_count})

def concatenate_dataframes(df1, df2):
    """Stack ``df1`` on top of ``df2`` and sort the rows by ``'date'``.

    Original index labels are kept, so the result may contain duplicate labels.
    """
    stacked = pd.concat([df1, df2])
    return stacked.sort_values(by='date')


In [11]:
def transform_consultation(df):
    """Turn raw consultation rows into a daily count series with 7 lag features.

    Steps: drop the excluded lawyer ids, keep the rows accepted by
    ``filter_status`` (status 600), parse ``'created_at'``, count rows per day,
    rename the columns to ``date``/``count``, and append ``lag_1``..``lag_7``.
    """
    excluded_lawyers = [36, 38, 48, 120, 192, 195]

    consultations = remove_lawyers(df, excluded_lawyers)
    consultations = filter_status(consultations)
    consultations = cast_column(consultations, ['created_at'])

    daily = count_daily_consultations(consultations)
    daily = rename_columns_consultation(daily)
    return create_lagged_features(daily, 7)
    
def transform_availability(df):
    """Turn raw availability rows into daily distinct-lawyer counts.

    The result has one row per day with the columns ``date``,
    ``working_hours_lawyers`` and ``late_hours_lawyers``; days with no match
    hold 0.

    Both counts are grouped by the day of ``'start_datetime'``. The
    ``'duration'`` and ``'date'`` columns added along the way are not read by
    the later steps of this function.
    """
    excluded_lawyers = [36, 38, 48, 120, 192, 195]
    kept_columns = ['lawyer_id', 'start_datetime', 'end_datetime', 'created_at']
    datetime_columns = ['start_datetime', 'end_datetime', 'created_at']

    # Clean, narrow and type the raw rows.
    slots = remove_lawyers(df, excluded_lawyers)
    slots = filter_columns(slots, kept_columns)
    slots = cast_column(slots, datetime_columns)
    slots = calculate_duration(slots)
    slots = extract_date(slots, 'created_at')

    # Distinct lawyers per day for each time window.
    working_hours_lawyers = get_daily_lawyer_count(
        filter_working_hours(slots), 'start_datetime', 'lawyer_id')
    late_hours_lawyers = get_daily_lawyer_count(
        filter_late_hours(slots), 'start_datetime', 'lawyer_id')

    # One frame, every day present, day index exposed as 'date'.
    daily = create_df_lawyer_count(working_hours_lawyers, late_hours_lawyers)
    return rename_index(fill_date_range(daily))
    
def transform_web_visitor(df):
    """Normalise the web-visitor export and append mean-valued rows for a fixed date span.

    After renaming the columns and parsing ``'date'``, one row per day from
    2023-05-06 to 2023-06-25 is appended, each holding the mean of the observed
    ``web_visitor`` values. The result is sorted by date.
    NOTE(review): presumably those days are missing from the export - confirm.
    """
    visitors = convert_date_to_datetime(rename_columns_website(df))

    # Synthetic rows: every day in the span carries the overall mean.
    gap_days = generate_date_range('2023-05-06', '2023-06-25')
    average_visitors = calculate_mean_count(visitors, 'web_visitor')
    gap_rows = create_new_data(gap_days, average_visitors)

    return concatenate_dataframes(visitors, gap_rows)

In [12]:
def filter_date_range(df, start_date, end_date):
    """Keep the rows whose ``'date'`` lies in ``[start_date, end_date]`` (both ends inclusive)."""
    in_window = df['date'].between(start_date, end_date)
    return df[in_window]

def merge_dataframes(df1, df2, df3):
    """Outer-join the three frames on ``'date'`` and return the result sorted by date.

    Dates missing from any input are kept, with NaN in that input's columns.
    """
    combined = (
        df1
        .merge(df2, on='date', how='outer')
        .merge(df3, on='date', how='outer')
    )
    return combined.sort_values('date')

In [14]:
# Load the three raw inputs: consultations and availability from GCS,
# web visitors from a local CSV file.
data_consultation = get_data_consultation()
data_availability = get_data_availability()
data_web = get_data_web_visitor()

Read consultations.csv complete. Elapsed time: 3.48 seconds
Read availability_instants.csv complete. Elapsed time: 1.91 seconds


In [15]:
# Run each source through its transform; every result carries a 'date' column.
data_consultation_tr = transform_consultation(data_consultation)
data_availability_tr = transform_availability(data_availability)
data_web_tr = transform_web_visitor(data_web)

In [40]:
# Restrict all three frames to the same window, 2023-05-06 through 2023-11-13
# (both ends inclusive), before merging.
data_consultation_tr = filter_date_range(data_consultation_tr, '2023-05-06', '2023-11-13')
data_availability_tr = filter_date_range(data_availability_tr, '2023-05-06', '2023-11-13')
data_web_tr = filter_date_range(data_web_tr, '2023-05-06', '2023-11-13')

In [45]:
# Outer-join the three feature frames on 'date' and preview the first rows.
df_merged = merge_dataframes(data_consultation_tr, data_availability_tr, data_web_tr)
df_merged.head()

Unnamed: 0,date,count,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,working_hours_lawyers,late_hours_lawyers,web_visitor
0,2023-05-06,1,15.757282,15.55122,15.627451,15.704433,15.613861,15.452736,15.32,0.0,1.0,533.652482
1,2023-05-07,0,1.0,15.55122,15.627451,15.704433,15.613861,15.452736,15.32,0.0,0.0,533.652482
2,2023-05-08,0,0.0,1.0,15.627451,15.704433,15.613861,15.452736,15.32,1.0,1.0,533.652482
3,2023-05-09,3,0.0,0.0,1.0,15.704433,15.613861,15.452736,15.32,2.0,0.0,533.652482
4,2023-05-10,0,3.0,0.0,0.0,1.0,15.613861,15.452736,15.32,0.0,0.0,533.652482


In [59]:
# Persist the merged feature table (without the row index) for the training section.
df_merged.to_csv('feature_ml.csv', index=False)

# Training Pipeline

In [199]:
# Reload the feature table written by the feature-engineering section and use
# 'date' as the index. read_csv is called without parse_dates, so the index
# holds the date values as plain strings.
df = pd.read_csv('feature_ml.csv')
df = df.set_index('date')

In [200]:
# train ml model
## -train model linear regression
## -train model lightgbm
## -train model xgboost

# choose ml model


# save model 

In [201]:
def split_timesries_data(df, n_splits=5):
    """Return the ``TimeSeriesSplit(n_splits).split(df)`` iterator of (train, test) index pairs.

    The returned iterator can be consumed only once.
    """
    return TimeSeriesSplit(n_splits=n_splits).split(df)

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test=None):
    """Run a fit-and-predict callable and score its predictions.

    Args:
        model: Callable ``model(X_train, y_train, X_test)`` returning predictions
            (for example ``model_linear_regression``).
        X_train: Training features.
        y_train: Training target.
        X_test: Test features.
        y_test: True target values for ``X_test``. The earlier version read an
            undefined global named ``y_test``; it must now be passed explicitly.

    Returns:
        ``[mae, rmse, mape]`` as produced by ``produce_metrics``.

    Raises:
        ValueError: If ``y_test`` is not supplied.
    """
    # Fail before any training happens if there is nothing to score against.
    if y_test is None:
        raise ValueError("y_test is required to evaluate the model's predictions")
    y_pred = model(X_train, y_train, X_test)
    return produce_metrics(y_test, y_pred)

def model_linear_regression(X_train, y_train, X_test):
    """Fit a default ``LinearRegression`` on the training data and return predictions for ``X_test``."""
    regressor = LinearRegression().fit(X_train, y_train)
    return regressor.predict(X_test)
    
def model_lgb(X_train, y_train, X_test):
    """Fit an ``LGBMRegressor`` (``verbose=-1``) on the training data and return predictions for ``X_test``."""
    regressor = lgb.LGBMRegressor(verbose=-1).fit(X_train, y_train)
    return regressor.predict(X_test)
    
def model_xgboost(X_train, y_train, X_test):
    """Fit a default ``XGBRegressor`` on the training data and return predictions for ``X_test``."""
    regressor = xgb.XGBRegressor().fit(X_train, y_train)
    return regressor.predict(X_test)

def produce_metrics(y_test, y_pred):
    """Return ``[MAE, RMSE, MAPE]`` for the given true and predicted values.

    NOTE(review): scikit-learn's MAPE divides by ``max(|y_true|, eps)``, so
    zeros in ``y_test`` inflate it enormously; the recorded run reports MAPE
    values of about 1e14 to 1e15. Treat that figure with care.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    return [
        mean_absolute_error(y_test, y_pred),
        np.sqrt(squared_error),
        mean_absolute_percentage_error(y_test, y_pred),
    ]
    
def metrics_results(model_score):
    """Average a list of per-fold metric rows column by column.

    Args:
        model_score: Sequence of equally long rows, e.g. ``[[mae, rmse, mape], ...]``.

    Returns:
        List containing the mean of each column, or an empty list when
        ``model_score`` is empty (the earlier version raised IndexError there).
    """
    num_rows = len(model_score)
    if num_rows == 0:
        return []
    # zip(*rows) transposes the rows into columns.
    return [sum(column) / num_rows for column in zip(*model_score)]

def train_ml(df, model_name):
    """Cross-validate one model over time-series splits and return its mean metrics.

    Args:
        df: Feature frame containing the target column ``'count'``; all other
            columns are used as features.
        model_name: One of ``'Linear Regression'``, ``'LightGBM'``, ``'XGBoost'``.

    Returns:
        ``[mae, rmse, mape]`` averaged over the folds (also printed).
    """
    trainers = {
        'Linear Regression': model_linear_regression,
        'LightGBM': model_lgb,
        'XGBoost': model_xgboost
    }

    features = df.drop('count', axis=1)
    target = df['count']

    fold_scores = []
    for train_index, test_index in split_timesries_data(df):
        X_train, y_train = features.iloc[train_index], target.iloc[train_index]
        X_test, y_test = features.iloc[test_index], target.iloc[test_index]

        # Fit on the earlier rows, score on the rows that follow them.
        y_pred = trainers[model_name](X_train, y_train, X_test)
        fold_scores.append(produce_metrics(y_test, y_pred))

    average_score = metrics_results(fold_scores)
    print(f"Print {model_name} score: {average_score}")
    return average_score

def best_model(scores, metric, model_names=None):
    """Pick the model with the lowest value of ``metric``.

    Args:
        scores: One ``[mae, rmse, mape]`` row per model, in the same order as
            ``model_names``.
        metric: ``'MAE'``, ``'RMSE'`` or ``'MAPE'``.
        model_names: Optional labels matching ``scores`` position by position.
            Defaults to ``['Linear Regression', 'LightGBM', 'XGBoost']``, the
            order used by this notebook.

    Returns:
        ``(best_model_name, min_value)``, or ``None`` (after printing
        "Invalid metric") when ``metric`` is not recognised. On ties the first
        model wins.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    index = {'MAE': 0, 'RMSE': 1, 'MAPE': 2}

    if metric not in index:
        print("Invalid metric")
        return None

    if len(scores) == 0:
        raise ValueError("scores must contain at least one row of metrics")

    if model_names is None:
        model_names = ['Linear Regression', 'LightGBM', 'XGBoost']

    metric_index = index[metric]
    # min() returns the first minimal position, matching the old first-match lookup.
    best_model_index = min(range(len(scores)), key=lambda i: scores[i][metric_index])
    min_value = scores[best_model_index][metric_index]
    best_model_name = model_names[best_model_index]

    print(f"Best model for {metric}: {best_model_name} with {metric} of {min_value}")
    return best_model_name, min_value

In [None]:
def train_best_model(df, best_model_name):
    """Fit the chosen model on the full data set and log it to MLflow.

    Args:
        df: Feature frame containing the target column ``'count'``; all other
            columns are used as features.
        best_model_name: ``'Linear Regression'``, ``'LightGBM'`` or ``'XGBoost'``.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``best_model_name`` is not a known model. The earlier
            version failed with UnboundLocalError inside the MLflow run instead.
    """
    # Factories are lambdas so an estimator class is only looked up when chosen.
    factories = {
        "Linear Regression": lambda: LinearRegression(),
        "LightGBM": lambda: lgb.LGBMRegressor(verbose=-1),
        "XGBoost": lambda: xgb.XGBRegressor(),
    }
    if best_model_name not in factories:
        raise ValueError(
            f"Unknown model name {best_model_name!r}; expected one of {list(factories)}"
        )

    X = df.drop('count', axis=1)
    y = df['count']

    model = factories[best_model_name]()
    model.fit(X, y)

    with mlflow.start_run():
        mlflow.sklearn.log_model(model, "best_model")

    return model

In [202]:
# PIPELINE: cross-validate each candidate model, then pick the best by MAE.

# Linear regression
score_lr = train_ml(df, 'Linear Regression')
# LightGBM
score_lgbm = train_ml(df, 'LightGBM')
# XGBoost
score_xgb = train_ml(df, 'XGBoost')

# Choose the winner. The list order must match the model-name order that
# best_model uses (Linear Regression, LightGBM, XGBoost).
best_model_name, _ = best_model([score_lr, score_lgbm, score_xgb], 'MAE')

#train model
##hyperparameter tuning

#log model


Print Linear Regression score: [5.908098920368892, 7.608417910844674, 947189531089036.6]
Print LightGBM score: [6.7620595311001965, 8.702145804912673, 748121394042056.6]
Print XGBoost score: [7.214254293361591, 9.338427526216872, 684549064795751.0]
Best model for MAE: Linear Regression with MAE of 5.908098920368892


In [204]:
# Display the name of the model selected by best_model above.
best_model_name

'Linear Regression'