In [None]:
import pandas as pd
import os
import numpy
import seaborn as sns
import matplotlib.pyplot as plt

# pd.set_option('display.max_rows', None)  # To display all rows
# pd.set_option('display.max_columns', None)  # To display all columns

In [None]:
# Resolve the data folder relative to the notebook's working directory.
# Named `base_dir` (not `dir`) so the built-in dir() is not shadowed for the rest of the session.
base_dir = os.getcwd()
data_path = os.path.join(base_dir, 'full_03_04')

# Load the page-view log, using student_id as the row index.
page_views = pd.read_csv(os.path.join(data_path, 'page_views.csv'), index_col='student_id')

In [None]:
# Preview the first 25 rows of the raw page-view data
page_views.head(n=25)

We want to take a look at the `engaged` and `was_complete` columns.

In [None]:
# Dimensions of page_views as a (rows, columns) tuple
page_views.shape

In [None]:
# Keep only the page views whose `book` is one of the titles under study
books_of_interest = [
    'College / Statistics and Data Science (ABC)',
    'High School / Advanced Statistics and Data Science I (ABC)',
]
is_book_of_interest = page_views['book'].isin(books_of_interest)
filtered_page_views = page_views[is_book_of_interest]

In [None]:
# Keep only the columns needed for the completion analysis and drop rows that
# have no chapter or section number.
#
# The steps are chained and finished with .copy() so the result is an independent
# frame: an in-place dropna on a slice triggers SettingWithCopyWarning, and later
# column assignments on that slice would do the same.
relevant_columns = ['book', 'chapter_number', 'section_number', 'was_complete']
filtered_page_views = (
    filtered_page_views
    .loc[:, relevant_columns]
    .dropna(subset=['chapter_number', 'section_number'])
    .copy()
)

# Preview goes last: only the final expression of a cell is rendered.
filtered_page_views.head(25)

In [None]:
# Number of distinct student_id index values left after filtering.
# This is the denominator used when the completion counts are converted to proportions.
num_unique_students = filtered_page_views.index.nunique()
num_unique_students

### Create engagement dataframe

In [None]:
# Normalise dtypes on a fresh frame (no assignment into a possible slice).
#
# A missing `was_complete` is treated as "not completed": a plain astype(bool)
# turns NaN into True, which would count those page views as completions and
# inflate every proportion below. Non-missing values convert exactly as before.
was_complete_raw = filtered_page_views['was_complete']
filtered_page_views = filtered_page_views.assign(
    was_complete=was_complete_raw.notna() & was_complete_raw.astype(bool),
    chapter_number=filtered_page_views['chapter_number'].astype(int),
    section_number=filtered_page_views['section_number'].astype(int),
)

# Rows where the page was completed
completed_views = filtered_page_views[filtered_page_views['was_complete']]

# For every (chapter, section) pair, count the distinct students with at least
# one completed view. student_id lives in the index, hence the reset_index().
chapter_section_counts = (
    completed_views
    .reset_index()
    .groupby(['chapter_number', 'section_number'])
    .agg({'student_id': 'nunique'})
    .rename(columns={'student_id': 'completion proportion'})
)

# Convert the counts to a proportion of all students in the filtered data
chapter_section_counts['completion proportion'] = (
    chapter_section_counts['completion proportion'] / num_unique_students
).round(3)

# Show the full table (this changes the display option for the rest of the session)
pd.set_option('display.max_rows', None)  # To display all rows
chapter_section_counts

# export as csv
# chapter_section_counts.to_csv('chapter_section_engagement_metrics.csv')

In [None]:
# Graph 1: mean completion proportion per CHAPTER

# Average the per-section proportions within each chapter
avg_completion_proportion = chapter_section_counts.groupby(level='chapter_number')['completion proportion'].mean().reset_index()

# The theme must be set BEFORE the figure is created; setting it after drawing
# only affects later figures, so this plot would never get the whitegrid style.
sns.set_theme(style="whitegrid")

# Plot using Seaborn
plt.figure(figsize=(10, 6))
sns.lineplot(data=avg_completion_proportion, x='chapter_number', y='completion proportion', marker='o')
plt.xlabel('Chapter Number')
plt.ylabel('Average Completion Proportion')
plt.title('Average Completion Proportion by Chapter')
plt.show()

In [None]:
# Chapter-over-chapter change in the mean completion proportion, expressed in percent
pct_difference = (
    avg_completion_proportion['completion proportion']
    .pct_change()
    .mul(100)
)

# Average of those chapter-to-chapter changes
pct_difference.mean()

In [None]:
# Show the individual chapter-over-chapter percentage changes
pct_difference

In [None]:
# Flatten the (chapter, section) index into ordinary columns for plotting
chapter_section_counts_reset = chapter_section_counts.reset_index()

# One line per chapter: section number on x, completion proportion on y.
# sort=False keeps the chapters in their order of first appearance.
plt.figure(figsize=(10, 6))
for chapter, chapter_data in chapter_section_counts_reset.groupby('chapter_number', sort=False):
    plt.plot(
        chapter_data['section_number'],
        chapter_data['completion proportion'],
        marker='o',
        linestyle='-',
        label=f'Chapter {chapter}',
    )

plt.title('Completion Proportion by Chapter Section')
plt.xlabel('Section Number')
plt.ylabel('Completion Proportion')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Graph 2: distribution of per-section completion proportion within each chapter (boxplot)
# (seaborn and matplotlib are already imported in the first cell.)

# Use the plain white theme for this figure
sns.set_theme(style="white")

# chapter_number is an index level of chapter_section_counts, not a column.
# Resetting the index makes it a real column, so the x= lookup does not depend
# on how a given seaborn version resolves index levels.
boxplot_data = chapter_section_counts.reset_index()

# Plot using Seaborn
plt.figure(figsize=(10, 6))
sns.boxplot(data=boxplot_data, x='chapter_number', y='completion proportion', palette='rocket_r')
plt.xlabel('Chapter Number')
plt.ylabel('Completion Proportion')
plt.title('Distribution of Completion Proportion by Chapter')
plt.show()

In [None]:
# Number of sections per chapter.
# chapter_section_counts has one row per (chapter_number, section_number) pair and
# both are index levels, so selecting ['section_number'] as a column raises KeyError.
# Counting the rows per chapter level gives the number of sections directly.
sections_per_chapter = chapter_section_counts.groupby(level='chapter_number').size()

# Plotting
plt.figure(figsize=(10, 6))
sections_per_chapter.plot(kind='bar', color='skyblue')
plt.title('Number of Sections per Chapter')
plt.xlabel('Chapter Number')
plt.ylabel('Number of Sections')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Plot engagement by chapter
#
# NOTE(review): no visible cell adds an 'engagement time (minutes)' column to
# chapter_section_counts, so on a fresh kernel this cell fails with a KeyError.
# Confirm where that column is meant to be computed and add it upstream.

sns.set_theme(style='white')

# Mean engagement time per chapter (not used by the plot below)
avg_engagement_time = chapter_section_counts.groupby(level='chapter_number')['engagement time (minutes)'].mean().reset_index()

# chapter_number is an index level, so flatten it into a column before handing
# the frame to seaborn; x= then refers to a real column.
engagement_data = chapter_section_counts.reset_index()

plt.figure(figsize=(10,6))
sns.boxplot(data=engagement_data, x='chapter_number', y='engagement time (minutes)', palette='rocket_r')
plt.xlabel('Chapter Number')
plt.ylabel('Engagement Time (minutes)')
plt.title('Distribution of Engagement by Chapter')
plt.show()

### Drill-down analytics

In [None]:
# Drill into chapter 11: flatten the index so chapter_number is a regular column,
# then keep only that chapter's rows.
df = chapter_section_counts.reset_index()
ch11 = df.loc[df['chapter_number'].eq(11)]
ch11

In [None]:
# Chapter 11: engagement time across its sections.
# NOTE(review): relies on an 'engagement time (minutes)' column that no visible
# cell creates; confirm it is added to chapter_section_counts upstream.
plt.figure(figsize=(8,8))
sns.lineplot(data=ch11, x='section_number', y='engagement time (minutes)', linewidth = 3.5, color='darkblue')
plt.xlabel('Section Number')
# The y series is engagement time, not a student count, so label it as such.
plt.ylabel('Engagement Time (minutes)')
plt.title('Engagement Time by Chapter Section')
plt.show()

In [None]:
# Let's investigate chapter 4

df = chapter_section_counts.reset_index()
ch4 = df[df['chapter_number'] == 4]

# Only the last expression of a cell is rendered, so the minimum is printed
# explicitly instead of being silently discarded.
print('Lowest completion proportion in chapter 4:', ch4['completion proportion'].min())
ch4

In [None]:
# Chapter 4: completion proportion across its sections.
# The plotted value is a proportion of students (0-1), not a head count, so the
# axis label and title say so.
plt.figure(figsize=(8,8))
sns.lineplot(data=ch4, x='section_number', y='completion proportion', linewidth = 3.5, color='darkblue')
plt.xlabel('Section Number')
plt.ylabel('Completion Proportion')
plt.title('Chapter 4: Completion Proportion by Section')
plt.show()