In [1]:
%load_ext autoreload
%autoreload 2

In [9]:
import pandas as pd
import numpy as np
import nltk
import gensim

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
from bs4 import BeautifulSoup
import requests

url_base = 'https://www.webster.edu'
html = requests.get(f'{url_base}/catalog/current/undergraduate-catalog/courses/index.html').text
soup = BeautifulSoup(html, 'lxml')
links_class = 'container content2Cols'
links = soup.find_all('div', {'class': links_class})

courses_links = []
for link in links:
    courses_links.extend([item.get('href') for item in link.find_all('a')])

for i, link in enumerate(courses_links):
    courses_links[i] = f'{url_base}{link}'

final = []
for link in courses_links:
    html = requests.get(link).text
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find_all('div', {'class': 'bodyCopy'})[0]
    titles = body.find_all('h3')
    paragraphs = soup.find_all('p')[3:]
    print(titles[0])

    for i, title in enumerate(titles):
        actual_title = title.get_text()
        actual_paragraph = paragraphs[i].get_text()
        final.append((actual_title, actual_paragraph))

In [None]:
df = pd.DataFrame(final, columns=['course_id', 'description'])

In [None]:
df

In [None]:
df.to_parquet('../data/courses.parquet')

### Tokenizing course descriptions

In [5]:
def tokenize_description(tokenizer: nltk.TreebankWordTokenizer, desc: str) -> str:
    return tokenizer.tokenize(desc)

tokenizer = nltk.TreebankWordTokenizer()
tokens = [tokenize_description(tokenizer, df.iloc[i]['description']) for i in range(df.shape[0])]
df['token_desc'] = tokens

### Removing stopwords from course descriptions

In [6]:
def remove_stopwords(tokens: list) -> list:
    return [token for token in tokens if token not in stopwords.words('english')]

df['token_desc'] = df['token_desc'].apply(remove_stopwords)

### Lemmatize tokens in description

In [11]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens: list) -> list:
    return [lemmatizer.lemmatize(token) for token in tokens]

df['token_desc'] = df['token_desc'].apply(lemmatize_tokens)

In [12]:
df

Unnamed: 0,course_id,description,token_desc
0,ACCT 2010 Financial Accounting (3),,[]
1,ACCT 2025 Managerial Accounting (3),Introduces accounting with an emphasis on syst...,"[Introduces, accounting, emphasis, systemic, t..."
2,ACCT 3025 Advanced Managerial and Cost Accoun...,Managerial Accounting emphasizes the use of ac...,"[Managerial, Accounting, emphasizes, use, acco..."
3,ACCT 3030 Intermediate Accounting I (3),Expands on topics developed in Managerial Acco...,"[Expands, topic, developed, Managerial, Accoun..."
4,ACCT 3040 Intermediate Accounting II (3),Covers theoretical foundations of accounting w...,"[Covers, theoretical, foundation, accounting, ..."
...,...,...,...
1900,WRIT 3400 Editing and Style (3),"Students will learn to find, evaluate and pres...","[Students, learn, find, ,, evaluate, present, ..."
1901,WRIT 4000 Topics in Professional Writing (3),This course directs students' attention to the...,"[This, course, directs, student, ', attention,..."
1902,WSBT 2000 Career Exploration (1),,[]
1903,WSBT 2300 Personal Branding (1),This course is the first of three courses desi...,"[This, course, first, three, course, designed,..."


### Initialize word2vec model from Google binaries

In [22]:
model = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin', binary=True)

### Leave only tokens which are present in a model vocab

In [23]:
def get_present_vocab(tokens):
    return [token for token in tokens if token in model.index_to_key]


df['previous_tokens'] = df['token_desc'].apply(len)
df['token_desc'] = df['token_desc'].apply(get_present_vocab)

### Check how many words we trimmed and drop NaN from zero divison, we do not need empty descriptions

In [24]:
df['tokens_share'] = [len(df['token_desc'].iloc[i]) / df['previous_tokens'].iloc[i] for i in range(len(df))]
df = df.dropna()

  df['tokens_share'] = [len(df['token_desc'].iloc[i]) / df['previous_tokens'].iloc[i] for i in range(len(df))]


## Save Aggregate

In [26]:
df.to_parquet('../data/courses_preprocessed.parquet')