In [1]:
# Standard library
import os
import time
from datetime import datetime, timedelta
from io import StringIO

# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#preprocessing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer
#model
from statsmodels.tsa.api import ExponentialSmoothing
# NOTE(review): statsmodels deprecated the legacy `arima_model.ARIMA` in 0.12 and
# removed its implementation in 0.13 (replacement: `statsmodels.tsa.arima.model.ARIMA`).
# Confirm the pinned statsmodels version / downstream usage before relying on this import.
from statsmodels.tsa.arima_model import ARIMA
from pmdarima import auto_arima
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from prophet import Prophet

#metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

from google.cloud import storage

In [2]:
def read_data_from_gcs(bucket_name, folder, filename, delimiter=',', client=None):
    """Download a delimited text object from Google Cloud Storage into a DataFrame.

    The object is fetched from ``{folder}/{filename}`` inside ``bucket_name``,
    downloaded as text, and parsed with ``pandas.read_csv``. The elapsed
    wall-clock time is printed once the frame has been built.

    Parameters
    ----------
    bucket_name : str
        Name of the GCS bucket.
    folder : str
        Object prefix inside the bucket (joined to ``filename`` with ``/``).
    filename : str
        Object name under ``folder``.
    delimiter : str, default ','
        Field separator forwarded to ``pandas.read_csv``.
    client : optional
        Object exposing ``get_bucket(name).blob(path).download_as_text()``.
        When omitted, the notebook-level ``storage_client`` is used, so
        existing calls keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the object.
    """
    # Resolve the client at call time: the global is only looked up when no
    # explicit client is given, so the function no longer hard-depends on a
    # variable created in a later cell.
    gcs_client = storage_client if client is None else client

    # perf_counter is monotonic, unlike time.time(), so the reported duration
    # cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    blob = gcs_client.get_bucket(bucket_name).blob(f'{folder}/{filename}')
    csv_data = blob.download_as_text()
    df = pd.read_csv(StringIO(csv_data), delimiter=delimiter)
    elapsed_time = time.perf_counter() - start_time
    print(f"Read {filename} complete. Elapsed time: {elapsed_time:.2f} seconds")
    return df

def remove_lawyers(df, lawyer_ids, column):
    """Return the rows of ``df`` whose ``column`` value is not in ``lawyer_ids``.

    The input frame is left untouched and the surviving rows keep their
    original index labels.
    """
    keep_mask = ~df[column].isin(lawyer_ids)
    return df.loc[keep_mask]

In [6]:
# Set path
# Point Google Application Default Credentials at the service-account key and
# create the GCS client used by the rest of the notebook.
relative_path = '../../deep-flash-sa.json'
# abspath resolves against the kernel's current working directory, so the
# result depends on where the notebook was launched from.
file_path = os.path.abspath(relative_path)
if not os.path.isfile(file_path):
    # Fail here with the resolved location instead of a later auth error.
    raise FileNotFoundError(
        f"Service-account key not found at {file_path!r} "
        f"(resolved from {relative_path!r} relative to {os.getcwd()!r})"
    )
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = file_path
storage_client = storage.Client()

In [7]:
# Load the raw pipe-delimited CSV exports from the shared bucket.
gcs_bucket = 'perqara-dendrobium'
df_consultations = read_data_from_gcs(
    gcs_bucket,
    'raw/postgres/csv/consultations',
    'consultations.csv',
    delimiter='|',
)
df_lawyers = read_data_from_gcs(
    gcs_bucket,
    'raw/postgres/csv/lawyers',
    'lawyers.csv',
    delimiter='|',
)

Read consultations.csv complete. Elapsed time: 2.96 seconds
Read lawyers.csv complete. Elapsed time: 0.93 seconds


In [8]:
# IDs to exclude from the analysis.
# NOTE(review): presumably internal/test accounts — confirm with the data owner.
lawyer_user = [36, 38, 48, 120, 192, 195]
client_user = [
    25, 42, 43, 44, 45, 49, 54, 56, 58, 62, 63, 66,
    85, 146, 157, 295, 959, 1609, 1655, 2421, 2526, 3180, 4205,
]

# Drop consultations that involve any excluded lawyer or excluded client.
df_consultations = (
    df_consultations
    .pipe(remove_lawyers, lawyer_user, 'lawyer_id')
    .pipe(remove_lawyers, client_user, 'client_id')
)

# Drop the same excluded lawyers from the lawyers table (keyed by `id`).
df_lawyers = df_lawyers.pipe(remove_lawyers, lawyer_user, 'id')

In [10]:
# Number of consultations per lawyer, most active first.
# Naming the index and the value column explicitly keeps the result identical
# across pandas versions: pandas >= 2.0 changed what value_counts() names its
# output, which made the previous rename of 'index'/'lawyer_id' label the ID
# column as 'consultation_count' and leave the counts in a 'count' column.
lawyer_count = (
    df_consultations['lawyer_id']
    .value_counts()
    .rename_axis('lawyer_id')
    .reset_index(name='consultation_count')
)
lawyer_count

Unnamed: 0,lawyer_id,consultation_count
0,74,564
1,59,376
2,140,321
3,208,232
4,128,198
...,...,...
117,170,1
118,116,1
119,86,1
120,216,1


In [14]:
# List the column names of df_lawyers as a Series so they display one per row with their position.
pd.Series(df_lawyers.columns)

0                     id
1                user_id
2                  price
3              photo_url
4          affidavit_url
5                ktp_url
6                city_id
7                 gender
8               year_exp
9             avg_rating
10                  slug
11            created_at
12            updated_at
13            deleted_at
14             is_online
15            is_probono
16    agency_province_id
17        agency_city_id
18           agency_name
19           description
20               address
21                  sort
22        affidavit_date
23               is_busy
dtype: object

In [20]:
# Show only the identifying and experience/description columns of the lawyers table.
df_lawyers.loc[:, ['id', 'slug', 'year_exp', 'description']]

Unnamed: 0,id,slug,year_exp,description
0,14,Arya Senatama,6,Memiliki pengalaman dalam menangani perkara da...
1,16,"James W.H. Pangaribuan, S.H.",13,-
2,18,Nurul Firdausi,5,Berpengalaman dalam membantu menyelesaikan ber...
3,19,"Andra Reinhard Pasaribu, S.H., M.H.",11,Mr Andra Reinhard Pasaribu has completed his L...
4,20,"Alex Argo Hernowo, S.H.",14,-
...,...,...,...,...
231,263,"Imam Akbaru Al Husein, S.H., M.H.",15,"Imam Akbaru Al Husein, S.H., M.H. merupakan Ad..."
232,264,"Zagky Drajat, S.H.",3,
233,265,"Raju Diagunsyah, S.H., M.H.",7,
234,266,"Albertus Luter, S.H., M.H., CTL., CPCD.",7,"Albertus Luter, S.H., M.H., CTL., CPCD. juga m..."
