In [None]:
import os
import shutil
from datetime import datetime
from xml.sax.saxutils import escape

import pandas as pd
from reportlab.lib.enums import TA_LEFT
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import inch
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer

In [None]:
# Load the exported application spreadsheet into a DataFrame.
# The location may be an absolute or a relative path. To use the relative path below:
#   1. Next to this notebook (ipynb) file, create a folder "data".
#   2. Inside "data", create a folder "incoming_first_year_application" (*).
#   3. Extract the downloaded zip file into (*).
df = pd.read_csv("./data/incoming_first_year_application/incoming_first_year_application.csv")
print(df.shape[0])
df.head()

In [None]:
# Keep one application per Rhodes ID: order the rows by ID with the most recently
# completed submission first, then retain only the first row seen for each ID.
print("Entries before drop:", df.shape[0])
df_sorted = df.assign(date=pd.to_datetime(df["Completed"])).sort_values(
    ["Rhodes ID", "date"],
    ascending=[True, False],
)
df = df_sorted.drop_duplicates("Rhodes ID", keep="first")
print("Entries after drop:", df.shape[0])

In [None]:
# Collect the Rhodes IDs of students who already have a folder in
# "./data/Incoming Students Apps/", so those folders are not recreated and re-uploaded.
# DOWNLOAD THIS FOLDER DIRECTLY FROM BOX. If there are no student folders yet,
# create an empty folder at "./data/Incoming Students Apps/" instead.
existed = set()
path = "./data/Incoming Students Apps/"
for entry in os.listdir(path):
    if not os.path.isdir(os.path.join(path, entry)):
        # Stray files (e.g. ".DS_Store") are not student folders: report them and skip,
        # so their names never end up in the set of existing IDs.
        print(f"Not dir: {entry}")
        continue

    # The last whitespace-separated token of the folder name is taken as the ID.
    existed.add(entry.split()[-1])

len(existed), existed

In [None]:
# Show the Rhodes IDs of applicants whose folder is already present in "Incoming Students Apps".
existed_df = df.loc[df["Rhodes ID"].isin(existed)]
existed_df.loc[:, "Rhodes ID"]

In [None]:
# Keep only the applicants who do not yet have a folder in "Incoming Students Apps".
filtered_df = df.loc[~df["Rhodes ID"].isin(existed)]
filtered_df.shape[0]

In [None]:
# 2 helper functions
def get_resume(row):
    """
    Locate the resume file uploaded with one application.

    Resumes are looked up in
    "./data/incoming_first_year_application/submission-<serial>/", where <serial> is the
    row's "Serial number" value.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a "Serial number" entry.

    Returns:
        A tuple (path, extension) for the chosen file. The extension keeps its leading
        dot, exactly as os.path.splitext returns it (e.g. ".pdf"). Returns None when
        the submission folder contains no visible entries.

    Raises:
        FileNotFoundError: If the submission folder does not exist (raised by
            os.listdir), or if it has entries but none of them is a regular file.
    """
    # Use a local for the serial number instead of nesting double quotes inside the
    # f-string, which is a syntax error before Python 3.12.
    serial = row["Serial number"]
    fold = f"./data/incoming_first_year_application/submission-{serial}/"

    # os.listdir returns entries in arbitrary order: sort them so the same file is picked
    # on every run, and ignore hidden files (e.g. ".DS_Store"), which are not resumes.
    entries = sorted(entry for entry in os.listdir(fold) if not entry.startswith("."))
    if not entries:
        return None

    # Take the first regular file; sub-directories are skipped rather than aborting.
    for entry in entries:
        file = os.path.join(fold, entry)
        if os.path.isfile(file):
            return file, os.path.splitext(file)[1]

    raise FileNotFoundError(f"No regular file among {entries} in {fold}")


def create_pdf(selected_row, column_headers, output_filepath):
    """
    Render one row as a question/answer PDF.

    Each header in column_headers becomes a "Q: ..." paragraph followed by an
    "A: ..." paragraph holding the row's value for that header.

    Args:
        selected_row: Mapping-like row (e.g. a pandas Series) indexed by column header.
        column_headers: Iterable of headers to render, in output order; each one is
            looked up in selected_row.
        output_filepath: Path the PDF is written to.

    Returns:
        None. The PDF is built at output_filepath and a confirmation line is printed.
    """

    def to_markup(value):
        # Paragraph interprets its text as XML-like markup, so a raw "<" or "&" in a
        # header or answer can break rendering. Escape the text, and turn line breaks
        # into <br/> so multi-line answers are not collapsed into one line.
        return "<br/>".join(escape(str(value)).splitlines())

    doc = SimpleDocTemplate(output_filepath, pagesize=letter)
    styles = getSampleStyleSheet()
    story = []

    # Add document title
    # NOTE(review): the title reads "Resume" although this renders the application
    # answers - confirm whether a different title is intended.
    title_style = styles["h1"]
    title_style.alignment = TA_LEFT
    story.append(Paragraph("Resume", title_style))
    story.append(Spacer(1, 0.2 * inch))

    # Define custom styles for questions and answers
    question_style = ParagraphStyle(
        name="QuestionStyle",
        parent=styles["h3"],  # Use a heading style for questions
        fontSize=12,
        leading=14,
        spaceAfter=6,
        textColor="#333333",
    )
    answer_style = ParagraphStyle(
        name="AnswerStyle",
        parent=styles["Normal"],  # Use normal style for answers
        fontSize=10,
        leading=12,
        spaceAfter=12,
        textColor="#555555",
    )

    # Iterate through the column headers and the selected row's values
    for header in column_headers:
        question = f"Q: {to_markup(header)}"
        answer = f"A: {to_markup(selected_row[header])}"

        # Add question and answer to the story
        story.append(Paragraph(question, question_style))
        story.append(Paragraph(answer, answer_style))
        story.append(Spacer(1, 0.1 * inch))  # Small space after each Q&A pair

    # Build the PDF
    doc.build(story)
    print(f"PDF '{output_filepath}' generated successfully!")

In [None]:
# Submission bookkeeping columns (plus the "date" column) that should not be printed
# in the final PDF file.
cols = [
    "Serial number", "Changed", "Is draft",
    "Submitted by: ID", "Submitted by: Title", "Submitted by: URL",
    "date",
]
dropped_df = filtered_df.drop(cols, axis="columns")

In [None]:
# Main loop
# For each remaining applicant, create the student's folder under
# "./data/New Incoming Students Apps/", copy the resume into it (when one was found),
# and render the application PDF into it.
pdf_headers = dropped_df.columns.tolist()  # loop-invariant, so computed once
for index, row in filtered_df.iterrows():
    # Single quotes inside the f-strings keep this valid on Python versions before 3.12.
    name = f"{row['First Name']} {row['Last Name']}"
    folder_name = f"{name} - {row['Rhodes ID']}".strip()

    # Render all new student folders into "./data/New Incoming Students Apps/"
    new_fold = f"./data/New Incoming Students Apps/{folder_name}"

    # get_resume returns None when the submission folder is empty; unpacking that result
    # directly would raise a TypeError and abort the whole run.
    found = get_resume(row)
    os.makedirs(new_fold, exist_ok=True)
    if found is None:
        print(f"No resume found for '{folder_name}'; only the application PDF is created.")
    else:
        resume, ext = found
        # ext comes from os.path.splitext and already starts with ".", so adding another
        # dot would produce names like "Name_Resume..pdf".
        shutil.copy(resume, f"{new_fold}/{name}_Resume{ext}")

    create_pdf(row, pdf_headers, f"{new_fold}/{name}_App.pdf")

In [None]:
# Now, all new student folders are in "./data/New Incoming Students Apps"