In [4]:
import pandas as pd

def get_semester_dates(year, semester):
    """
    Returns the start and end date of the semester based on the given year and semester type, respecting UTC time.
    Semesters:
    - Spring: January to June
    - Summer: June to July
    - Winter: August to December
    """
    if semester == 'Spring':
        return pd.Timestamp(f'{year}-01-01', tz='UTC'), pd.Timestamp(f'{year}-06-30', tz='UTC')
    elif semester == 'Summer':
        return pd.Timestamp(f'{year}-06-01', tz='UTC'), pd.Timestamp(f'{year}-07-31', tz='UTC')
    elif semester == 'Winter':
        return pd.Timestamp(f'{year}-08-01', tz='UTC'), pd.Timestamp(f'{year}-12-31', tz='UTC')
    else:
        raise ValueError("Invalid semester. Choose between 'Spring', 'Summer', or 'Winter'.")


In [7]:
def create_course_availability_and_activity_for_semester(courses, year, semester):
    # Get the start and end date for the selected semester with UTC time
    start_semester, end_semester = get_semester_dates(year, semester)

    # Convert course dates to UTC
    courses['start_at'] = pd.to_datetime(courses['start_at'], utc=True)
    courses['end_at'] = pd.to_datetime(courses['end_at'], utc=True)

    # Filter active courses within the semester
    active_courses = courses[
        (courses['workflow_state'] == 'active') &
        (courses['start_at'] <= end_semester) &
        ((courses['end_at'] >= start_semester) | courses['end_at'].isna())
    ]

    # Filter inactive courses within the semester
    inactive_courses = courses[
        (courses['workflow_state'] != 'active') &
        (courses['start_at'] <= end_semester) &
        (courses['end_at'] < start_semester) & 
        courses['end_at'].notna()  # Ensure the course has ended
    ]

    # Calculate counts
    active_count = active_courses.shape[0]
    inactive_count = inactive_courses.shape[0]

    # Calculate ratio of active to inactive courses
    if inactive_count > 0:
        ratio_active_to_inactive = active_count / inactive_count
    else:
        ratio_active_to_inactive = float('inf')

    return active_count, inactive_count, ratio_active_to_inactive


In [51]:
from utils.constants import SCORES_PATH, ENROLLMENTS_PATH
scores_df = pd.read_csv(SCORES_PATH)
enrollments_df = pd.read_csv(ENROLLMENTS_PATH)

def calculate_average_score(scores, year, semester):
    start_semester, end_semester = get_semester_dates(year, semester)

    scores_df['value.updated_at'] = pd.to_datetime(scores_df['value.updated_at'])

    filtered_scores = scores_df[
        (scores_df['value.updated_at'] >= start_semester) & 
        (scores_df['value.updated_at'] <= end_semester) & 
        (scores_df['value.workflow_state'] == 'active')  # Consider only active scores
    ]

    if filtered_scores.empty:
        return 0.0

    average_score = filtered_scores['value.current_score'].mean()
    return average_score

# Example usage:
# Assuming 'scores_df' is the DataFrame from the CSV provided, and filtering for Spring 2023:
average_score_spring_2023 = calculate_average_score(scores_df, 2023, 'Summer')
print(average_score_spring_2023)
# print(f"Average score for Spring 2023: {average_score_spring_2023:.2f}")

65.97918946301924


In [47]:
average_score_spring_2023 = calculate_average_score(scores_df, 2022, 'Winter')
print(f"Average score for Spring 2023: {average_score_spring_2023:.2f}")

Average score for Spring 2023: 67.86


In [70]:
def calculate_score_distribution(scores_df, year, semester):

    scores_df['value.created_at'] = pd.to_datetime(scores_df['value.created_at'], errors='coerce')
    scores_df['value.updated_at'] = pd.to_datetime(scores_df['value.updated_at'], errors='coerce')
    scores_df = scores_df.dropna(subset=['value.created_at'])

    start_date, end_date = get_semester_dates(year, semester)

    filtered_scores = scores_df[
        (scores_df['value.created_at'] <= end_date) &   
        ((scores_df['value.updated_at'] >= start_date) |
         (scores_df['value.updated_at'].isna()))  
    ]

    scores = filtered_scores['value.final_score'].dropna()

    return scores



In [71]:
scores = calculate_score_distribution(scores_df, 2024, 'Spring')
print("scores are: ", scores)
scores.shape

scores are:  27         90.0
30          0.0
43         95.0
148        50.5
161        88.0
           ... 
1071544     0.0
1071545     0.0
1071546     0.0
1071549     0.0
1071550     0.0
Name: value.final_score, Length: 97228, dtype: float64


(97228,)