In [29]:
import os
import glob
import pandas as pd

# Use the absolute path to the science directory
directory = '/Users/luisfaria/Desktop/sEngineer/dash/science'

# Create an empty list to store DataFrames
dfs = []

# Read all Excel files and store them in the list
for filename in glob.glob(os.path.join(directory, '*.xlsx')):
    print(f"Reading file: {filename}")
    try:
        df = pd.read_excel(filename)
        # Add a column to identify the source file (optional)
        df['source_file'] = os.path.basename(filename)
        dfs.append(df)
        print(f"Shape of {os.path.basename(filename)}: {df.shape}")
    except Exception as e:
        print(f"Error reading {filename}: {str(e)}")

# Concatenate all DataFrames
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    print("\nShape of combined DataFrame:", combined_df.shape)
    print("\nFirst 5 rows of combined data:")
    print(combined_df.head())
else:
    print("No DataFrames to concatenate!")

Reading file: /Users/luisfaria/Desktop/sEngineer/dash/science/agd_2024.xlsx
Shape of agd_2024.xlsx: (426241, 26)
Reading file: /Users/luisfaria/Desktop/sEngineer/dash/science/agd_2023.xlsx
Shape of agd_2023.xlsx: (403038, 24)
Reading file: /Users/luisfaria/Desktop/sEngineer/dash/science/agd_2022.xlsx
Shape of agd_2022.xlsx: (478218, 25)

Shape of combined DataFrame: (1307497, 29)

First 5 rows of combined data:
   Unnamed: 0 ID agendamento ID cliente                Nome cliente  \
0         0.0        2023421     848102              Roberta Reimao   
1         1.0        2061210     753045     Kamilla Katzor de Mello   
2         2.0        2061211     753045     Kamilla Katzor de Mello   
3         3.0        2061212     753045     Kamilla Katzor de Mello   
4         4.0        2065159     854439  Rosiane Rodrigues da Silva   

              CPF                            Email  \
0             NaN         reimao.roberta@gmail.com   
1  359.614.958-41  kamilla.katzormello@hotmail.com

In [32]:
combined_df['source_file'].value_counts()

Index(['Unnamed: 0', 'ID agendamento', 'ID cliente', 'Nome cliente', 'CPF',
       'Email', 'Telefone', 'Endereço', 'Fonte de cadastro do cliente',
       'Unidade do agendamento', 'Procedimento', 'Prestador',
       'Grupo do procedimento', 'Data', 'Hora', 'Status', 'Duração', 'Máquina',
       'Data primeira atendente', 'Nome da primeira atendente',
       'Grupo da primeira atendente', 'Observação (mais recente)',
       'Última data de alteração do status',
       'Último usuário a alterar o status', 'Grupo Financeiro', 'source_file',
       'Mês', 'dia', 'mes'],
      dtype='object')

In [34]:
columns_to_keep = ['ID agendamento', 'ID cliente', 'Nome cliente', 'CPF',
       'Telefone', 'Unidade do agendamento', 'Data', 'Status']

combined_df = combined_df[columns_to_keep]

In [38]:
# Transforming 'Data' column to datetime with correct format
combined_df['Data'] = pd.to_datetime(combined_df['Data'], dayfirst=True, errors='coerce')

# Extract the year after successful conversion
combined_df['ano'] = combined_df['Data'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['Data'] = pd.to_datetime(combined_df['Data'], dayfirst=True, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['ano'] = combined_df['Data'].dt.year


In [43]:
mask_1 = combined_df['Status'] == "Atendido"
mask_2 = combined_df['Unidade do agendamento'] == "LAPA"

served_combined_df_lapa = combined_df.loc[mask_1 & mask_2]
served_combined_df_lapa

lapa_served_2022 = served_combined_df_lapa[served_combined_df_lapa['ano'] == 2022]

lapa_served_2023_and_2024 = served_combined_df_lapa[(served_combined_df_lapa['ano'] == 2023) | (served_combined_df_lapa['ano'] == 2024)]

# Lapa served in 2022 that were not served in 2023 and 2024

Unnamed: 0,ID agendamento,ID cliente,Nome cliente,CPF,Telefone,Unidade do agendamento,Data,Status,ano
144,2372414,899875,Jaqueline da Silva Costa Barbosa,326.007.488-06,11966647267,LAPA,2024-03-05,Atendido,2024.0
145,2372415,899875,Jaqueline da Silva Costa Barbosa,326.007.488-06,11966647267,LAPA,2024-03-12,Atendido,2024.0
210,2378398,889991,Stephanie Rodrigues,481.089.268-90,11940412536,LAPA,2024-03-02,Atendido,2024.0
211,2378399,889991,Stephanie Rodrigues,481.089.268-90,11940412536,LAPA,2024-03-02,Atendido,2024.0
216,2378405,889991,Stephanie Rodrigues,481.089.268-90,11940412536,LAPA,2024-03-23,Atendido,2024.0
...,...,...,...,...,...,...,...,...,...
829069,2354056,861463,Leila Evangelista da Silva,045.134.668-80,11953450872,LAPA,2023-11-06,Atendido,2023.0
829070,2354057,861463,Leila Evangelista da Silva,045.134.668-80,11953450872,LAPA,2023-11-06,Atendido,2023.0
829099,2354116,889149,Tayná Matos de Novaes,540.968.908-98,11971092709,LAPA,2023-11-06,Atendido,2023.0
829100,2354117,889149,Tayná Matos de Novaes,540.968.908-98,11971092709,LAPA,2023-11-06,Atendido,2023.0


In [45]:
# Unique clients served in Lapa in 2022
clients_2022 = set(lapa_served_2022['Nome cliente'])

# Unique clients served in Lapa in 2023 or 2024
clients_2023_2024 = set(lapa_served_2023_and_2024['Nome cliente'])

# Clients served in 2022 but not in 2023 or 2024
clients_only_2022 = clients_2022 - clients_2023_2024

# Filter the DataFrame to show these clients
lapa_served_only_2022 = lapa_served_2022[lapa_served_2022['Nome cliente'].isin(clients_only_2022)]

# Display the result
lapa_served_only_2022



Unnamed: 0,ID agendamento,ID cliente,Nome cliente,CPF,Telefone,Unidade do agendamento,Data,Status,ano
829298,2388680,846657,Julia Costa Paulo Afoz,47826391806,(11) 998227979,LAPA,2022-01-03,Atendido,2022.0
829409,2361465,804976,Paula Francisca Rodrigues Rezende,30848711831,(11) 989094303,LAPA,2022-01-03,Atendido,2022.0
829463,2361466,804976,Paula Francisca Rodrigues Rezende,30848711831,(11) 989094303,LAPA,2022-01-03,Atendido,2022.0
829464,2361743,843065,Maiara Carla Gomes Domingos,44045333819,(11) 995819468,LAPA,2022-01-03,Atendido,2022.0
829512,2361744,843065,Maiara Carla Gomes Domingos,44045333819,(11) 995819468,LAPA,2022-01-03,Atendido,2022.0
...,...,...,...,...,...,...,...,...,...
1306259,1968011,842440,Stephanie Marques,440.042.668-82,(11) 96709-9956,LAPA,2022-12-30,Atendido,2022.0
1306280,1968095,816735,Irlane Vieira Santos,018.590.935-31,11954249632,LAPA,2022-12-28,Atendido,2022.0
1306377,1968359,835439,Marcelo Cardoso da Silva,,11986931816,LAPA,2022-12-27,Atendido,2022.0
1307389,1974460,836358,Geovana Queiroz Oliveira,363.817.298-80,11970387018,LAPA,2022-12-06,Atendido,2022.0


In [52]:
lapa_served_only_2022_unique = lapa_served_only_2022.drop_duplicates("Nome cliente")

final_columns = ['Nome cliente', 'Telefone']
lapa_served_only_2022_unique = lapa_served_only_2022[final_columns]

In [54]:
# removing duplicates
lapa_served_only_2022_unique.to_excel("lapa_atendidos_apenas_2022.xlsx", index=False)