In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

In [None]:
# Program overview page that lists every textbook section
base_url = 'https://coursekata.org'
url = f'{base_url}/preview/default/program'

# Fetch the overview page. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the real listing.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML doc
soup = BeautifulSoup(response.text, 'html.parser')

# Collect every <a> element on the page that carries an href attribute
course_content_links = soup.find_all('a', href=True)

# Keep only the anchors whose href points at a textbook section
relative_links = [link for link in course_content_links if '/preview/book' in link['href']]

# Resolve each (possibly relative) href against the site root
full_links = [urljoin(base_url, link['href']) for link in relative_links]

for link in full_links:
    print(link)

## Scrape individual section page data

In [None]:
# let's create a function to help us scrape from each individual page

def scrape_textbook_page(url, timeout=30):
    """Download one textbook page and return the text of its paragraphs.

    Parameters
    ----------
    url : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout the request can block indefinitely.

    Returns
    -------
    str
        The text of every ``<p>`` element found on the page, in document
        order, joined with newlines. An empty string if there are none.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status, so an error page is
        never returned as if it were textbook content.
    requests.RequestException
        On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    return '\n'.join(p.get_text() for p in soup.find_all('p'))

In [None]:
# Scrape data from each link and store it in a DataFrame.
# NOTE: this issues one HTTP request per link, so the cell is slow. It has to
# run because the cells below read `df`; with it commented out they fail with
# a NameError on a fresh kernel.
data = [{'URL': link, 'Text': scrape_textbook_page(link)} for link in full_links]

# Naming the columns keeps 'URL' and 'Text' present even if no links were found
df = pd.DataFrame(data, columns=['URL', 'Text'])
df.head()

## Let's add more columns to the data

In [None]:
# Number of whitespace-separated tokens on each page
def _count_words(value):
    """Return how many whitespace-separated tokens ``str(value)`` contains."""
    return len(str(value).split())

df['word_count'] = df['Text'].map(_count_words)

In [None]:
# Export data to a CSV
# df.to_csv('text_data.csv')

In [None]:
# Lift the column-width limit so long text values are not truncated
pd.set_option('display.max_colwidth', None)

# Look up one section page by its URL and print its scraped text
target_url = 'https://coursekata.org/preview/book/fd645e20-5a0d-482e-ad16-ee689acb7431/lesson/3/6'
is_target_page = df['URL'] == target_url
print(df.loc[is_target_page, 'Text'])

## Import Data

In [None]:
# Load the saved page data and cast the Text column to str in one step,
# then preview the first row
text_df = pd.read_csv('text_data.csv').astype({'Text': str})
text_df.head(1)

In [None]:
# Scatter of page length (word_count, x) against completion proportion (y)
fig, ax = plt.subplots(figsize=(16, 10))
sns.scatterplot(x='word_count', y='completion proportion', data=text_df, ax=ax)
plt.show()

In [None]:
# Let's look at complexity
import textstat

In [None]:
# Gunning fog score of each page's text, stored as a new column and displayed
text_df['text_fog_score'] = text_df['Text'].map(textstat.gunning_fog)
text_df['text_fog_score']

# Flesch Reading Ease score

In [None]:
# Flesch reading-ease score of each page's text, stored as a new column
text_df['text_readability_score'] = text_df['Text'].map(textstat.flesch_reading_ease)

In [None]:
# Completion proportion against readability score: points coloured by
# chapter, with a single regression line over all points drawn on the same axes
fig, ax = plt.subplots(figsize=(8, 7))
sns.scatterplot(x='text_readability_score', y='completion proportion',
                hue='chapter_number', data=text_df, ax=ax)
sns.regplot(x='text_readability_score', y='completion proportion',
            data=text_df, scatter=False, ax=ax)

In [None]:
# Completion proportion against fog score: points coloured by chapter, with
# a single regression line over all points drawn on the same axes
fig, ax = plt.subplots(figsize=(8, 7))
sns.scatterplot(x='text_fog_score', y='completion proportion',
                hue='chapter_number', data=text_df, ax=ax)
sns.regplot(x='text_fog_score', y='completion proportion',
            data=text_df, scatter=False, ax=ax)

In [None]:
# Pairwise relationship grid over text_df.
# seaborn is already imported as `sns` in the notebook's first cell, so it is
# not re-imported here.
sns.pairplot(text_df)

In [None]:
# Simple linear regression: completion proportion ~ readability score

import statsmodels.api as sm

# Response variable
y = text_df['completion proportion']
# Predictor, with a constant column added so the model fits an intercept
X = sm.add_constant(text_df['text_readability_score'])

# Ordinary least squares fit
model = sm.OLS(endog=y, exog=X).fit()

# Regression summary table
print(model.summary())


# Gunning fog index

In [None]:
# Show the column labels currently present on text_df
text_df.columns

In [None]:
# Completion proportion against fog score: chapter-coloured points plus one
# overall regression line on shared axes
fig, ax = plt.subplots(figsize=(8, 7))
sns.scatterplot(x='text_fog_score', y='completion proportion',
                hue='chapter_number', data=text_df, ax=ax)
sns.regplot(x='text_fog_score', y='completion proportion',
            data=text_df, scatter=False, ax=ax)

In [None]:
# Simple linear regression: completion proportion ~ fog score

import statsmodels.api as sm

# Response variable
y = text_df['completion proportion']
# Predictor, with a constant column added so the model fits an intercept
X = sm.add_constant(text_df['text_fog_score'])

# Ordinary least squares fit
model = sm.OLS(endog=y, exog=X).fit()

# Regression summary table
print(model.summary())


In [None]:
# Write text_df to scraped_text.csv in the working directory.
# NOTE(review): no index=False is passed, so the DataFrame index is written
# as an extra leading column — confirm that is intended.
text_df.to_csv('scraped_text.csv')

In [None]:
# Group the pages by chapter once and derive both per-chapter summaries from it
chapter_groups = text_df.groupby('chapter_number')

# Count of section_number entries in each chapter
sections_per_chapter = chapter_groups['section_number'].count().reset_index()

# Mean completion proportion in each chapter
completion_rate = chapter_groups['completion proportion'].mean().reset_index()

# One row per chapter holding both summaries
merged_df = pd.merge(
    sections_per_chapter,
    completion_rate,
    on='chapter_number',
    suffixes=('_sections', '_completion_rate'),
)

# Scatter of chapter size against mean completion
ax = sns.scatterplot(data=merged_df, x='section_number', y='completion proportion')
ax.set_xlabel('Number of Sections per Chapter')
ax.set_ylabel('Completion Proportion')
ax.set_title('Completion Proportion vs Number of Sections per Chapter')
plt.show()