In [1]:
import pandas as pd
import glob
import os
import string


def read_excel(file_path: str) -> pd.DataFrame:
    """
    Reads an Excel file with the specified format.
    
    Args:
        file_path (str): Path to the Excel file.
    
    Returns:
        pd.DataFrame: The DataFrame loaded from the Excel file.
    """
    return pd.read_excel(file_path, sheet_name='Respuestas de formulario 1')


def process_data(df: pd.DataFrame) -> (pd.DataFrame, str):
    """
    Processes the DataFrame by converting "Marca temporal" to a date format and cleaning email addresses.
    
    Args:
        df (pd.DataFrame): The DataFrame to process.
    
    Returns:
        pd.DataFrame: The processed DataFrame.
        str: The formatted target date.
    """
    df['Marca temporal'] = pd.to_datetime(df['Marca temporal'])
    # Convert "Marca temporal" to a date in format
    df['Marca temporal'] = df['Marca temporal'].apply(lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0).strftime("%Y-%m-%d %H:%M:%S"))
    target_date = pd.to_datetime(df['Marca temporal'].iloc[0]).to_pydatetime()
    df['Dirección de correo electrónico'] = df['Dirección de correo electrónico'].apply(clean_email)
    return df, target_date

def match_and_update(asistencias: pd.DataFrame, df: pd.DataFrame, target_date: str) -> pd.DataFrame:
    """
    Matches emails and updates the `asistencias` DataFrame.
    
    Args:
        asistencias (pd.DataFrame): The main attendance DataFrame.
        df (pd.DataFrame): The processed DataFrame.
        target_date (str): The target date extracted from the processed DataFrame.
    
    Returns:
        pd.DataFrame: The updated `asistencias` DataFrame.
    """
    if target_date in asistencias.columns:
        # Match emails
        email_matches = asistencias.iloc[:, 3].apply(
            lambda email: 1 if email in df['Dirección de correo electrónico'].values else 0
        )
        
        # Assign results to the column in `asistencias`
        asistencias[target_date] = email_matches
        print(f"En la lista hay {len(df['Dirección de correo electrónico'].unique())} alumnos")
        print(f"Se generaron {email_matches.sum()} match entre alumnos")
    else:
        print(f"Column with date {target_date} not found in asistencias.")
    
    return asistencias


def clean_email(email: str) -> str:
    """
    Cleans an email address by removing non-printable characters and converting to lowercase.
    
    Args:
        email (str): The email to clean.
    
    Returns:
        str: The cleaned email address.
    """
    return email.strip().strip(string.whitespace).lower()

def convert_datetime_columns(df):
    # Iterate through column names and check for the format `YYYY-MM-DD 00:00:00`
    new_column_names = {}
    for col in df.columns:
        try:
            # Try converting the column name to a datetime object
            date = pd.to_datetime(col, format='%Y-%m-%d %H:%M:%S', errors='coerce')
            if pd.notna(date):  # If conversion is successful
                # Convert to the new format `YYYY-MM-DD`
                new_column_names[col] = date.strftime('%Y-%m-%d')
        except Exception:
            pass  # Skip columns that cannot be converted

    # Rename columns in the DataFrame
    return df.rename(columns=new_column_names, inplace=True)

def run_pipeline(folder_path: str, asistencias: pd.DataFrame):
    """
    Main function to process all attendance Excel files and update the `asistencias` DataFrame.
    
    Args:
        folder_path (str): Path to the folder containing attendance Excel files.
        asistencias (pd.DataFrame): The main attendance DataFrame to update.
    
    Returns:
        pd.DataFrame: The updated `asistencias` DataFrame.
    """

    asistencias.iloc[:, 3] = asistencias.iloc[:, 3].apply(clean_email)
    # Find all Excel files matching the pattern
    excel_files = glob.glob(os.path.join(folder_path, "Encuesta clase y asistencia *.xlsx"))
    
    for file_path in excel_files:
        print(f"Processing file: {file_path}")
        
        # Step 1: Read the Excel file with assistance data
        df = read_excel(file_path)
        
        # Step 2: Process assistance data
        df, target_date = process_data(df)
        
        # Step 3: Match and update with `asistencias', which is the main DataFrame with all attendance
        asistencias = match_and_update(asistencias, df, target_date)
        
        # Step 4: Delete NA's column to get only the days with classes
        asistencias.dropna(how='all', axis=1,inplace=True)
    
    return asistencias


# Example usage
if __name__ == "__main__":
    # read `asistencias', which is the main DataFrame with all attendance
    folder_path =  '/Users/iairlinker/Documents/repos/mitic-data-science-team-1-septiembre-2024/automatizaciones/data/'
    asistencias = pd.read_excel(folder_path + 'mitic-data-science-team-1-septiembre-2024.xlsx',
                            sheet_name='Asistencia',header=2)
    asistencias = run_pipeline(folder_path, asistencias)
    # Optionally save the updated `asistencias`
    asistencias.to_excel('/Users/iairlinker/Documents/repos/mitic-data-science-team-1-septiembre-2024/automatizaciones/data/updated_asistencias.xlsx', index=False)


AttributeError: 'int' object has no attribute 'strip'

In [2]:
asistencias = pd.read_excel(folder_path + 'mitic-data-science-team-1-septiembre-2024.xlsx',sheet_name='Asistencia',header=2)
asistencias.head()


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,% Asistencia,Estado Asistencia,2024-09-23 00:00:00,2024-09-24 00:00:00,2024-09-25 00:00:00,...,2025-01-31 00:00:00,2025-02-01 00:00:00,2025-02-02 00:00:00,2025-02-03 00:00:00,2025-02-04 00:00:00,2025-02-05 00:00:00,2025-02-06 00:00:00,2025-02-07 00:00:00,2025-02-08 00:00:00,2025-02-09 00:00:00
0,1,Alejandro,Romero Villalba,Alejandroromero68@gmail.com,Current Student,1.0,,,,,...,,,,,,,,,,
1,2,Alexis Ramon,Frutos Gallardo,Alexisfrutos2012@gmail.com,Current Student,1.0,,,,,...,,,,,,,,,,
2,3,Andrea Monserrat,Echague Morel,Andreamechaguem@gmail.com,Current Student,1.0,,,,,...,,,,,,,,,,
3,4,Annia Micaela,Benitez Hofbauer,Anniamicaela@gmail.com,Current Student,1.0,,,,,...,,,,,,,,,,
4,5,Antonio Jose,Miltos Velaztiqui,velaztiquimaxi@gmail.com,Current Student,0.875,,,,,...,,,,,,,,,,


In [10]:
for file_path in excel_files:
    print(f"Processing file: {file_path}")

    # Step 1: Read the Excel file with assistance data
    df = read_excel(file_path)

    # Step 2: Process assistance data
    df, target_date = process_data(df)

    # Step 3: Match and update with `asistencias', which is the main DataFrame with all attendance
    asistencias = match_and_update(asistencias, df, target_date)

    # Step 4: Delete NA's column to get only the days with classes
    asistencias.dropna(how='all', axis=1,inplace=True)

Processing file: /Users/iairlinker/Documents/repos/mitic-data-science-team-1-septiembre-2024/automatizaciones/data/Encuesta clase y asistencia 05-11-2024.xlsx
En la lista hay 39 alumnos
Se generaron 4 match entre alumnos
Processing file: /Users/iairlinker/Documents/repos/mitic-data-science-team-1-septiembre-2024/automatizaciones/data/Encuesta clase y asistencia 19-11-2024.xlsx
Column with date 2024-11-19 00:00:00 not found in asistencias.
Processing file: /Users/iairlinker/Documents/repos/mitic-data-science-team-1-septiembre-2024/automatizaciones/data/Encuesta clase y asistencia 31-10-2024.xlsx
Column with date 2024-10-31 00:00:00 not found in asistencias.
Processing file: /Users/iairlinker/Documents/repos/mitic-data-science-team-1-septiembre-2024/automatizaciones/data/Encuesta clase y asistencia 24-10-2024.xlsx
En la lista hay 39 alumnos
Se generaron 5 match entre alumnos
Processing file: /Users/iairlinker/Documents/repos/mitic-data-science-team-1-septiembre-2024/automatizaciones/data