In [1]:
import pandas as pd

def get_semester_dates(year, semester):
    """
    Returns the start and end date of the semester based on the given year and semester type, respecting UTC time.
    Semesters:
    - Spring: January to June
    - Summer: June to July
    - Winter: August to December
    """
    if semester == 'Spring':
        return pd.Timestamp(f'{year}-01-01', tz='UTC'), pd.Timestamp(f'{year}-06-30', tz='UTC')
    elif semester == 'Summer':
        return pd.Timestamp(f'{year}-06-01', tz='UTC'), pd.Timestamp(f'{year}-07-31', tz='UTC')
    elif semester == 'Winter':
        return pd.Timestamp(f'{year}-08-01', tz='UTC'), pd.Timestamp(f'{year}-12-31', tz='UTC')
    else:
        raise ValueError("Invalid semester. Choose between 'Spring', 'Summer', or 'Winter'.")


In [7]:
def create_course_availability_and_activity_for_semester(courses, year, semester):
    # Get the start and end date for the selected semester with UTC time
    start_semester, end_semester = get_semester_dates(year, semester)

    # Convert course dates to UTC
    courses['start_at'] = pd.to_datetime(courses['start_at'], utc=True)
    courses['end_at'] = pd.to_datetime(courses['end_at'], utc=True)

    # Filter active courses within the semester
    active_courses = courses[
        (courses['workflow_state'] == 'active') &
        (courses['start_at'] <= end_semester) &
        ((courses['end_at'] >= start_semester) | courses['end_at'].isna())
    ]

    # Filter inactive courses within the semester
    inactive_courses = courses[
        (courses['workflow_state'] != 'active') &
        (courses['start_at'] <= end_semester) &
        (courses['end_at'] < start_semester) & 
        courses['end_at'].notna()  # Ensure the course has ended
    ]

    # Calculate counts
    active_count = active_courses.shape[0]
    inactive_count = inactive_courses.shape[0]

    # Calculate ratio of active to inactive courses
    if inactive_count > 0:
        ratio_active_to_inactive = active_count / inactive_count
    else:
        ratio_active_to_inactive = float('inf')

    return active_count, inactive_count, ratio_active_to_inactive


In [51]:
from utils.constants import SCORES_PATH, ENROLLMENTS_PATH
scores_df = pd.read_csv(SCORES_PATH)
enrollments_df = pd.read_csv(ENROLLMENTS_PATH)

def calculate_average_score(scores, year, semester):
    start_semester, end_semester = get_semester_dates(year, semester)

    scores_df['value.updated_at'] = pd.to_datetime(scores_df['value.updated_at'])

    filtered_scores = scores_df[
        (scores_df['value.updated_at'] >= start_semester) & 
        (scores_df['value.updated_at'] <= end_semester) & 
        (scores_df['value.workflow_state'] == 'active')  # Consider only active scores
    ]

    if filtered_scores.empty:
        return 0.0

    average_score = filtered_scores['value.current_score'].mean()
    return average_score

# Example usage:
# Assuming 'scores_df' is the DataFrame from the CSV provided, and filtering for Spring 2023:
average_score_spring_2023 = calculate_average_score(scores_df, 2023, 'Summer')
print(average_score_spring_2023)
# print(f"Average score for Spring 2023: {average_score_spring_2023:.2f}")

65.97918946301924


In [47]:
average_score_spring_2023 = calculate_average_score(scores_df, 2022, 'Winter')
print(f"Average score for Spring 2023: {average_score_spring_2023:.2f}")

Average score for Spring 2023: 67.86


In [75]:
def calculate_score_distribution(scores_df, year, semester):

    scores_df['value.created_at'] = pd.to_datetime(scores_df['value.created_at'], utc=True)
    scores_df['value.updated_at'] = pd.to_datetime(scores_df['value.updated_at'], utc=True)
    scores_df = scores_df.dropna(subset=['value.created_at'])

    start_date, end_date = get_semester_dates(year, semester)

    filtered_scores = scores_df[
        (scores_df['value.created_at'] <= end_date) &   
        ((scores_df['value.updated_at'] >= start_date) |
         (scores_df['value.updated_at'].isna()))  
    ]

    scores = filtered_scores['value.final_score'].dropna()

    return scores



In [71]:
scores = calculate_score_distribution(scores_df, 2024, 'Spring')
print("scores are: ", scores)
scores.shape

scores are:  27         90.0
30          0.0
43         95.0
148        50.5
161        88.0
           ... 
1071544     0.0
1071545     0.0
1071546     0.0
1071549     0.0
1071550     0.0
Name: value.final_score, Length: 97228, dtype: float64


(97228,)

In [78]:
def calculate_average_feedback_time(submissions_df, year, semester):
    submissions_df['value.created_at'] = pd.to_datetime(submissions_df['value.created_at'], utc=True)
    submissions_df['value.updated_at'] = pd.to_datetime(submissions_df['value.updated_at'], utc=True)

    start_date, end_date = get_semester_dates(year, semester)

    filtered_submissions = submissions_df[
        (submissions_df['value.created_at'] >= start_date) & 
        (submissions_df['value.created_at'] <= end_date)
    ]

    filtered_submissions['feedback_time'] = (filtered_submissions['value.updated_at'] - filtered_submissions['value.created_at']).dt.days

    return filtered_submissions[['feedback_time']].dropna()


In [7]:
from utils.constants import SUBMISSIONS_PATH
submissions_df = pd.read_csv(SUBMISSIONS_PATH)
year = 2024
semester = "Spring"
# avg_feedback_time = calculate_average_feedback_time(submissions_df, year, semester)
# print(avg_feedback_time)

  submissions_df = pd.read_csv(SUBMISSIONS_PATH)


In [8]:
import pandas as pd
import numpy as np

def calculate_average_feedback_time(submissions_df, year, semester):
    # Convert dates to datetime
    submissions_df['value.created_at'] = pd.to_datetime(submissions_df['value.created_at'], utc=True)
    submissions_df['value.updated_at'] = pd.to_datetime(submissions_df['value.updated_at'], utc=True)

    # Get semester start and end dates
    start_date, end_date = get_semester_dates(year, semester)

    # Filter submissions by semester
    filtered_submissions = submissions_df[
        (submissions_df['value.created_at'] >= start_date) & 
        (submissions_df['value.created_at'] <= end_date)
    ]

    # Calculate feedback time in hours
    filtered_submissions['feedback_time'] = (
        filtered_submissions['value.updated_at'] - filtered_submissions['value.created_at']
    ).dt.total_seconds() / 3600

    # Remove rows with NaN feedback times
    filtered_submissions = filtered_submissions.dropna(subset=['feedback_time'])

    # Calculate average feedback time per course
    avg_feedback_time_per_course = filtered_submissions.groupby('value.course_id')['feedback_time'].mean().reset_index()

    # Rename columns for clarity
    avg_feedback_time_per_course.columns = ['value.course_id', 'average_feedback_time']

    return avg_feedback_time_per_course


In [9]:
print(calculate_average_feedback_time(submissions_df, year, semester))

     value.course_id  average_feedback_time
0               3203               0.002404
1               3577            1581.234593
2               3904               0.000000
3               6309               0.012713
4               6349             517.929287
..               ...                    ...
439            12279             561.380041
440            12283             506.386754
441            12284            1613.543194
442            12292            1385.362135
443            12295            2067.664797

[444 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_submissions['feedback_time'] = (


In [10]:
from bokeh.io import curdoc
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, HoverTool, ColorBar, LinearColorMapper
from bokeh.plotting import figure
from bokeh.transform import linear_cmap
from bokeh.palettes import Viridis256
import pandas as pd

from utils import constants  # Assuming this imports constants used elsewhere
from kpi_calculator import calculate_average_feedback_time

# Read CSV data
submissions_df = pd.read_csv(constants.SUBMISSIONS_PATH)

# Calculate average feedback time per course (assuming the function returns a DataFrame)
avg_feedback_time_per_course = calculate_average_feedback_time(submissions_df, year=2024, semester='Spring')

# Prepare data for plotting (using `.loc` for modification)
avg_feedback_time_per_course.loc[:, 'size'] = (
    avg_feedback_time_per_course['average_feedback_time'] /
    avg_feedback_time_per_course['average_feedback_time'].max() * 40
)

# Create ColumnDataSource with the modified DataFrame
feedback_time_bubble_source = ColumnDataSource(avg_feedback_time_per_course)
print(feedback_time_bubble_source)

  submissions_df = pd.read_csv(constants.SUBMISSIONS_PATH)


ColumnDataSource(id='p1001', ...)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_submissions['feedback_time'] = (


In [11]:
print(avg_feedback_time_per_course)

     value.course_id  average_feedback_time       size
0               3203               0.002404   0.000040
1               3577            1581.234593  26.393248
2               3904               0.000000   0.000000
3               6309               0.012713   0.000212
4               6349             517.929287   8.645040
..               ...                    ...        ...
439            12279             561.380041   9.370300
440            12283             506.386754   8.452377
441            12284            1613.543194  26.932529
442            12292            1385.362135  23.123834
443            12295            2067.664797  34.512520

[444 rows x 3 columns]


In [15]:
def calculate_feedback_time_vs_assignment_count(submissions_df, year, semester):
    # Convert dates to datetime
    submissions_df['value.created_at'] = pd.to_datetime(submissions_df['value.created_at'], utc=True)
    submissions_df['value.updated_at'] = pd.to_datetime(submissions_df['value.updated_at'], utc=True)

    # Get semester start and end dates
    start_date, end_date = get_semester_dates(year, semester)

    # Filter submissions by semester
    filtered_submissions = submissions_df[
        (submissions_df['value.created_at'] >= start_date) & 
        (submissions_df['value.created_at'] <= end_date)
    ]

    # Calculate feedback time in hours
    filtered_submissions['feedback_time'] = (
        filtered_submissions['value.updated_at'] - filtered_submissions['value.created_at']
    ).dt.total_seconds() / 3600

    # Remove rows with NaN feedback times
    filtered_submissions = filtered_submissions.dropna(subset=['feedback_time'])

    # Calculate average feedback time per course
    avg_feedback_time_per_course = filtered_submissions.groupby('value.course_id')['feedback_time'].mean().reset_index()
    avg_feedback_time_per_course.columns = ['value.course_id', 'average_feedback_time']

    # Calculate the number of assignments per course
    assignment_count_per_course = filtered_submissions.groupby('value.course_id').size().reset_index(name='assignment_count')

    # Merge the two results to have both average feedback time and assignment count per course
    feedback_time_vs_assignment_count = avg_feedback_time_per_course.merge(
        assignment_count_per_course, on='value.course_id'
    )

    return feedback_time_vs_assignment_count



In [16]:
print(calculate_feedback_time_vs_assignment_count(submissions_df, year, semester))

     value.course_id  average_feedback_time  assignment_count
0               3203               0.002404                15
1               3577            1581.234593                 1
2               3904               0.000000                 2
3               6309               0.012713                 3
4               6349             517.929287               120
..               ...                    ...               ...
439            12279             561.380041               224
440            12283             506.386754               629
441            12284            1613.543194               216
442            12292            1385.362135               329
443            12295            2067.664797               110

[444 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_submissions['feedback_time'] = (
