In [2]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Viridis256
from bokeh.io import save

import pandas as pd

# Cursos activos en Canvas

TODO: Agregar el filtro por periodo 

In [10]:
filename = "../src/CSVs/context_modules.csv"
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(f"{filename}")

# Drop duplicates based on two columns 'value.name' and 'value.workflow_state' to get total courses
total_courses = df.drop_duplicates(subset=['value.name', 'value.workflow_state', 'value.id'])

# Filter the DataFrame to get only active courses
active_courses = total_courses[total_courses['value.workflow_state'] == 'active']
deleted_courses = total_courses[total_courses['value.workflow_state'] == 'deleted']

# Calculate the active courses rate
active_courses_rate = len(active_courses) / len(total_courses)
deleted_courses_rate = len(deleted_courses) / len(total_courses)


# Print the results
print(f"Total Courses: {len(total_courses)}")
print(f"Active Courses: {len(active_courses)}")
print(f"Active Courses Rate: {active_courses_rate:.2%}")
print(f"Deleted Courses: {len(active_courses)}")
print(f"Deleted Courses Rate: {deleted_courses_rate:.2%}")

blue_color = "#2D92B2"
# Create a new dictionary to store the data for the bar plot
data = {
    "Workflow State": ["Active", "Deleted"],
    "Courses Rate": [active_courses_rate, deleted_courses_rate],
    "Color": ["green", blue_color]
}

# Create a new figure object
p = figure(
    x_range=["Active", "Deleted"],
    height=400,
    title="Active vs Deleted Courses Rate Aggregated data till 2023)",
    toolbar_location="right",
    tools="pan,box_zoom,wheel_zoom,reset,save"
)

# Add HoverTool with tooltips
hover = HoverTool()
hover.tooltips = [
    ("Rate", "@top{0.00%}"),
    ("Color", "$color[swatch]:fill_color")
]
p.add_tools(hover)

# Create the bar plot
p.vbar(
    x="Workflow State",
    top="Courses Rate",
    width=0.5,
    source=data,
    color="Color",
)

# Add labels and title to the axes
p.xaxis.axis_label = "Workflow State"
p.yaxis.axis_label = "Courses Rate"
p.y_range.start = 0
p.y_range.end = 1

save(p, "active_vs_deleted_courses_rate.html")

# Display the plot
show(p)



Total Courses: 16802
Active Courses: 11071
Active Courses Rate: 65.89%
Deleted Courses: 11071
Deleted Courses Rate: 18.38%


  save(p, "active_vs_deleted_courses_rate.html")
  save(p, "active_vs_deleted_courses_rate.html")


# Módulos

In [7]:
filename = "../src/CSVs/context_modules.csv"
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(f"{filename}")

# Drop duplicates based on two columns 'value.name' and 'value.workflow_state' to get total modules
total_modules = df.drop_duplicates(subset=['value.name', 'value.workflow_state'])

# Filter the DataFrame to get only active courses
active_modules = total_modules[total_modules['value.workflow_state'] == 'active']
deleted_modules = total_modules[total_modules['value.workflow_state'] == 'deleted']
unpublished_modules = total_modules[total_modules['value.workflow_state'] == 'unpublished']

# Calculate the active courses rate
active_modules_rate = len(active_modules) / len(total_modules)
deleted_modules_rate = len(deleted_modules) / len(total_modules)
unpublished_modules_rate = len(unpublished_modules) / len(total_modules)

# Print the results
print(f"Total Modules: {len(total_modules)}")
print(f"Active Modules: {len(active_modules)}")
print(f"Active Modules Rate: {active_modules_rate:.2%}")
print(f"Deleted Modules: {len(deleted_modules)}")
print(f"Deleted Modules Rate: {deleted_modules_rate:.2%}")
print(f"Unpublished Modules Rate: {unpublished_modules_rate:.2%}")


blue_color = "#2D92B2"
yellow_color = "#FFBF00"
# Create a new dictionary to store the data for the bar plot
data = {
    "Workflow State": ["Active", "Deleted", "Unpublished"],
    "Modules Rate": [active_modules_rate, deleted_modules_rate, unpublished_modules_rate],
    "Color": ["green", blue_color, yellow_color]
}

# Create a new figure object
p = figure(
    x_range=["Active", "Deleted", "Unpublished"],
    height=400,
    title="Active, Deleted and Deleted modules rate (Aggregated data till 2023)",
    toolbar_location="right",
    tools="pan,box_zoom,wheel_zoom,reset,save"
)

# Add HoverTool with tooltips
hover = HoverTool()
hover.tooltips = [
    ("Rate", "@top{0.00%}"),
    ("Color", "$color[swatch]:fill_color")
]
p.add_tools(hover)

# Create the bar plot
p.vbar(
    x="Workflow State",
    top="Modules Rate",
    width=0.5,
    source=data,
    color="Color",
)

# Add labels and title to the axes
p.xaxis.axis_label = "Workflow State"
p.yaxis.axis_label = "Modules Rate"
p.y_range.start = 0
p.y_range.end = 1

save(p, "modules_rates.html")

# Display the plot
show(p)


Total Modules: 16802
Active Modules: 11071
Active Modules Rate: 65.89%
Deleted Modules: 3088
Deleted Modules Rate: 18.38%
Unpublished Modules Rate: 15.73%


  save(p, "modules_rates.html")
  save(p, "modules_rates.html")


In [66]:
unique_ids_count = df['key.id'].nunique()
print(unique_ids_count) 
unique_ids_count == len(df['key.id'])

58039


True

1. que quiero mostrar?
2. definir el KPI = formula del KPI
3. de dónde salen los datos para la fórmula 
4. cuales son los marcos de tiempo

In [22]:
path = "/Users/larissatrasvina/thesis-canvas/src/courses/courses.csv"

## KPIs
### 2. Course Availability and Activity
KPI: Ratio of active to inactive courses per semester.
- Tables Used: courses
- Relevant Fields:
    - workflow_state: Indicates the status of the course (e.g., 'active', 'completed', 'deleted').
    - created_at: Date the course was created.
    - end_at: Date the course ended (used to determine if a course is inactive if it has ended).



In [23]:
from typing import Tuple

courses_path = "/Users/larissatrasvina/thesis-canvas/src/courses/courses.csv"

def create_course_availability_and_activity(courses_path: str) -> Tuple:
    courses_df = pd.read_csv(courses_path)

    courses_df['start_at'] =  pd.to_datetime(courses_df['value.created_at'])
    courses_df['end_at'] = pd.to_datetime(courses_df['value.updated_at'])

    # Define the semester start and end dates (naive timestamps)
    semester_start_date = pd.to_datetime('2024-01-01', utc=True)
    semester_end_date = pd.to_datetime('2024-05-31', utc=True)

    # Filter for Active Courses within the semester
    active_courses = courses_df[
        (courses_df['value.workflow_state'] == 'available') &
        (courses_df['start_at'] <= semester_end_date) &
        ((courses_df['end_at'] >= semester_start_date) | (courses_df['end_at'].isna()))
    ]

    # Filter for Inactive Courses within the semester
    inactive_courses = courses_df[
        (courses_df['value.workflow_state'] != 'available') |
        (courses_df['end_at'].notna() & (courses_df['end_at'] < semester_start_date))
    ]

    # Calculate the counts
    active_count = active_courses.shape[0]
    inactive_count = inactive_courses.shape[0]

    # Calculate the Ratio of Active to Inactive Courses
    if inactive_count > 0:
        ratio_active_to_inactive = active_count / inactive_count
    else:
        ratio_active_to_inactive = float('inf')  # To handle division by zero if no inactive courses

    return active_count, inactive_count, ratio_active_to_inactive

active_count, inactive_count, ratio_active_to_inactive = create_course_availability_and_activity(courses_path)


### KPI: Percentage of students retained from the beginning to the end of the course.
- Tables Used: enrollments
- enrollments table:
    - course_id: Identifies the course.
    - user_id: Identifies the student.
    - type: Identifies the type of enrollment (e.g., 'StudentEnrollment').
    - state: Indicates the current state of the enrollment (e.g., 'active', 'completed', 'dropped').

In [26]:
enrollments_path = "/Users/larissatrasvina/thesis-canvas/src/enrollments/enrollments.csv"

enrollments_df = pd.read_csv(enrollments_path)
enrollments_df.head()

cols = ["key.id",
        "value.storage_quota",
        "value.integration_id",
        "value.lti_context_id",
        "value.sis_batch_id",
        "value.created_at",value.updated_at,value.workflow_state,value.account_id,value.grading_standard_id,value.start_at,value.sis_source_id,value.group_weighting_scheme,value.conclude_at,value.is_public,value.allow_student_wiki_edits,value.syllabus_body,value.default_wiki_editing_roles,value.wiki_id,value.allow_student_organized_groups,value.course_code,value.default_view,value.abstract_course_id,value.enrollment_term_id,value.open_enrollment,value.tab_configuration,value.turnitin_comments,value.self_enrollment,value.license,value.indexed,value.restrict_enrollments_to_course_dates,value.template_course_id,value.replacement_course_id,value.public_description,value.self_enrollment_code,value.self_enrollment_limit,value.turnitin_id,value.show_announcements_on_home_page,value.home_page_announcement_limit,value.latest_outcome_import_id,value.grade_passback_setting,value.template,value.homeroom_course,value.sync_enrollments_from_homeroom,value.homeroom_course_id,value.locale,value.name,value.time_zone,value.uuid,value.settings.allow_student_discussion_editing,value.settings.allow_student_discussion_topics,value.settings.course_format,value.settings.filter_speed_grader_by_student_group,value.settings.hide_distribution_graphs,value.settings.hide_final_grade,value.settings.is_public_to_auth_users,value.settings.lock_all_announcements,value.settings.public_syllabus,value.settings.public_syllabus_to_auth,value.settings.restrict_student_future_view,value.settings.restrict_student_past_view,value.settings.syllabus_updated_at,value.settings.usage_rights_required,value.settings.allow_student_forum_attachments,meta.ts]
# Convert dates to datetime
# enrollments_df['enrollment_date'] = pd.to_datetime(enrollments_df['enrollment_date'])
# enrollments_df['end_date'] = pd.to_datetime(enrollments_df['end_date'])

# # Define course start and end dates
course_start_date = pd.to_datetime('2024-01-01')
course_end_date = pd.to_datetime('2024-05-31')

# # Count students enrolled at the start of the course
# initial_enrollment = enrollments_df[
#     (enrollments_df['enrollment_date'] <= course_start_date) & 
#     (enrollments_df['state'] == 'active')
# ].shape[0]

# # Count students still enrolled at the end of the course
# final_enrollment = enrollments_df[
#     (enrollments_df['end_date'] >= course_end_date) & 
#     (enrollments_df['state'] == 'active')
# ].shape[0]

# # Calculate the Student Retention Rate
# if initial_enrollment > 0:
#     retention_rate = (final_enrollment / initial_enrollment) * 100
# else:
#     retention_rate = 0

Unnamed: 0,key.id,value.storage_quota,value.integration_id,value.lti_context_id,value.sis_batch_id,value.created_at,value.updated_at,value.workflow_state,value.account_id,value.grading_standard_id,...,value.settings.is_public_to_auth_users,value.settings.lock_all_announcements,value.settings.public_syllabus,value.settings.public_syllabus_to_auth,value.settings.restrict_student_future_view,value.settings.restrict_student_past_view,value.settings.syllabus_updated_at,value.settings.usage_rights_required,value.settings.allow_student_forum_attachments,meta.ts
0,8216,,,d43cae8ce1d59cf27a6b10b2ed9d1c4266adf457,14449.0,2022-12-15T23:04:38.472Z,2024-05-27T00:08:52.695Z,available,322,,...,False,False,False,False,False,False,"""2023-01-13 17:53:09 UTC""",False,False,2024-06-01T00:32:24.225Z
1,8270,,,0d9c9507fe6a615ca737f612da611b169d44803e,14449.0,2022-12-15T23:04:39.455Z,2024-05-27T00:08:52.704Z,available,441,,...,,,,,,,"""2023-01-07 03:49:32 UTC""",,,2024-06-01T00:32:24.225Z
2,8260,,,7c1ceea026c98d274da42ba60add1b2d7d52ec4b,14449.0,2022-12-15T23:04:39.257Z,2024-05-27T00:08:52.695Z,available,325,,...,False,False,,,True,True,"""2023-01-03 20:35:02 UTC""",False,False,2024-06-01T00:32:24.225Z
3,8241,,,67e805caaa74f211d6197864ae548729a2674bef,14449.0,2022-12-15T23:04:38.873Z,2024-05-27T00:08:52.695Z,available,363,,...,False,False,,,True,True,,False,False,2024-06-01T00:32:24.225Z
4,8167,,,3793acd52bbf07984a38beaa23f16f99173672b4,14449.0,2022-12-15T23:04:36.356Z,2024-05-27T00:08:52.695Z,available,131,,...,False,False,False,False,False,False,"""2023-01-05 19:59:41 UTC""",False,True,2024-06-01T00:32:24.225Z
