In [1]:
%matplotlib inline
import pandas as pd
from ast import literal_eval
import numpy as np
from collections import OrderedDict
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer

In [45]:
required_features = ["of","in"]
def contains_required(x):
    """Return True if `x` contains any word listed in `required_features`.

    Matching is done on whole whitespace-separated tokens and is
    case-sensitive: "Master of Science" matches, whereas "Offline" and
    "Master Of Science" do not.

    Parameters
    ----------
    x : str
        Text to test.

    Returns
    -------
    bool
        False for any non-string input (e.g. None or NaN, which pandas uses
        for missing values and which has no `.split()`).
    """
    if not isinstance(x, str):
        return False
    # Split once instead of once per required word.
    words = set(x.split())
    return any(r in words for r in required_features)

# Read the course records; the file content is parsed as a Python literal.
with open("courses-tier0.json") as f:
    raw_text = f.read()
js = literal_eval(raw_text)

df = pd.DataFrame(js)
# Columns are relabelled purely by position.
# NOTE(review): this relies on the column order pandas produces for `js` — confirm.
df.columns = ['found_on_url', 'go_to_url', 'home_url', 'qualification', 'course_name']

# Keep rows that have a qualification and whose course name passes
# contains_required, then drop repeated (go_to_url, course_name) pairs.
has_qualification = ~pd.isnull(df.qualification)
df = df.loc[has_qualification]
name_matches = [contains_required(name) for name in df.course_name]
df = df.loc[name_matches]
df = df.drop_duplicates(["go_to_url","course_name"])

In [46]:
# Ordered (label, keywords) pairs: the first label with a matching keyword wins.
# "Science" appears twice on purpose of ordering: 'bio' is tested early, the
# generic 'science' keyword late, so "Bachelor of Science in Pharmacy" is
# labelled by the more specific "Medical" entry first. The empty keyword of
# "Other" is a substring of every string and therefore acts as the catch-all.
kws = list((("Education",('education','teaching')),
                   ("Science",('bio',)),
                   ("Comp/IT",('comp',' it ','info','network','software')),
                   ("Engineering",('eng',)),
                   ("Medical",('health','medic','nursing','pharma')),
                   ("Business/Finance",('business','manag','admin','financ','commerce','account')),
                   ("Science",('science',)),
                   ("Other",('',))))

def assign_limit(x):
    """Return the broad category label for the course name `x`.

    The name is lower-cased and each entry of `kws` is tried in order; the
    label of the first entry with a keyword occurring as a substring is
    returned.

    Parameters
    ----------
    x : str
        Course name.

    Returns
    -------
    str
        One of the labels in `kws` ("Other" when nothing else matches).
    """
    # Pad with spaces so that the whole-word keyword ' it ' also matches when
    # "IT" is the first or last word of the name (e.g. "Bachelor of IT").
    # The other keywords contain no spaces, so padding cannot affect them.
    text = " " + x.lower() + " "
    for label,kw in kws:
        if any(k in text for k in kw):
            return label
# Label every course with its broad category (first matching entry of kws).
df["broad_category"] = df.course_name.map(assign_limit)

In [47]:
lemmatizer = WordNetLemmatizer()
# English stop words; not referenced by tokenize_alphanum below.
_stops = set(stopwords.words("english"))

def tokenize_alphanum(text):
    """Split `text` into lower-cased, lemmatized alphabetic tokens.

    Every character outside a-z/A-Z (digits, whitespace, punctuation) acts
    as a separator, so despite the function's name digits never appear in
    the result. Useful for splitting URLs.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; empty if `text` has no letters.

    Raises
    ------
    TypeError
        Raised by `re` when `text` is not a string.
    """
    # Maximal runs of ASCII letters: the same tokens as splitting on
    # non-letters and discarding the empty pieces.
    return [lemmatizer.lemmatize(w.lower())
            for w in re.findall('[a-zA-Z]+', text)]

In [84]:
from bs4.element import Comment

def tag_visible(element):
    """Decide whether a parsed text node counts as visible page text.

    A node is rejected when its parent's tag name is one of the listed
    non-content names (text inside 'a' links is rejected too) or when the
    node itself is a Comment; every other node is accepted.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]', 'a')
    return (element.parent.name not in hidden_parents
            and not isinstance(element, Comment))

def len_sentence(x):
    """Count the tokens that tokenize_alphanum extracts from `x`.

    If tokenization raises TypeError, the offending value is printed and
    the same exception is re-raised.
    """
    try:
        tokens = tokenize_alphanum(x)
    except TypeError:
        print("Error with",x)
        raise
    return len(tokens)

def get_sentences(row, timeout=30):
    """Download a course page and return its visible text fragments.

    Parameters
    ----------
    row : mapping
        Must provide the keys "go_to_url", "home_url" and "found_on_url".
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so that one
        unresponsive server cannot stall the whole crawl.

    Returns
    -------
    list of str or None
        The stripped visible text nodes that contain at least one token
        (see tag_visible / len_sentence). None when the URL is missing,
        ends in "pdf" (any letter case), every request attempt raises, the
        response status is not 200, or the page has no visible text node.
    """
    url = row["go_to_url"]
    home_url = row["home_url"]
    found_on = row["found_on_url"]
    if pd.isnull(url):
        return None
    # Case-insensitive so links ending in "PDF" are skipped as well.
    if url.lower().endswith("pdf"):
        return None

    # Try the URL as given, then prefixed with the home URL, then prefixed
    # with the page it was found on. The target is built inside the try so
    # that a non-string prefix simply moves on to the next candidate.
    r = None
    for prefix in (None, home_url, found_on):
        try:
            target = url if prefix is None else prefix+"/"+url
            r = requests.get(target, timeout=timeout)
            break
        except Exception:
            continue
    if r is None:
        # Every attempt raised: report the inputs and give up on this row.
        print(url,found_on,home_url)
        return None
    if r.status_code != 200:
        return None

    soup = BeautifulSoup(r.text,"lxml")
    texts = soup.findAll(text=True)
    visible_texts = list(filter(tag_visible, texts))
    if len(visible_texts) == 0:
        return None

    return [t.strip() for t in visible_texts
            if len_sentence(t) > 0]

# Fetch the text of every course page, logging progress as "index / total : name".
print("starting")
total = len(df)
sentences = []
for i,(_,row) in enumerate(df.iterrows()):
    print(i,"/",total,":",row.course_name)
    sentences.append(get_sentences(row))
df["sentences"] = sentences

starting
0 / 453 : Master of Science in Clinical Pharmacy
1 / 453 : Master in Private Law
2 / 453 : Master in Public Law
3 / 453 : Master of Art in Teaching English to Speakers of Other Languages
4 / 453 : Master of Business Administration in:
5 / 453 : Bachelor of Science in Computer Engineering
6 / 453 : Bachelor of Science in Network and Communications Engineering
7 / 453 : Bachelor of Science in Computer Science
8 / 453 : Bachelor of Science in Software Engineering
9 / 453 : Bachelor of Science in Pharmacy
10 / 453 : Bachelor of Arts in Law
11 / 453 : Bachelor of Arts in Applied Psychology
12 / 453 : Bachelor of Arts in Applied Sociology
13 / 453 : Bachelor of Education in Special Education
14 / 453 : Bachelor of Business Management - Management
15 / 453 : Bachelor of Business Management - Accounting
16 / 453 : Bachelor of Business Management - Finance & Banking
17 / 453 : Bachelor of Business Management - Marketing
18 / 453 : Bachelor of Business Management - Human Resources Manag

In [85]:
import math
def get_middle(sentences):
    """Trim short leading and trailing entries from a list of sentences.

    An entry counts as "long" when splitting it on single spaces yields
    more than five pieces. The last two entries are always discarded
    first. From what remains, the result starts at the last short entry
    preceding the first long one (or at index 0 if the first entry is
    long) and ends with the last long entry.

    Parameters
    ----------
    sentences : list of str or None

    Returns
    -------
    list of str or None
        None if `sentences` is None; an empty list if no entry is long.
    """
    if sentences is None:
        return None
    sentences = sentences[:-2]

    def is_long(s):
        return len(s.split(' ')) > 5

    start = 0
    for i,s in enumerate(sentences):
        if is_long(s):
            break
        start = i

    # Use a positive end index: a negative one cannot express "drop
    # nothing", which made the previous default of -1 discard the final
    # entry even when it was long.
    end = len(sentences)
    for s in reversed(sentences):
        if is_long(s):
            break
        end -= 1
    return sentences[start:end]

# Store the trimmed version of every row's sentence list.
df["sentences_cut"] = df["sentences"].map(get_middle)

In [86]:
def special_contains(x):
    """Return True if any string in `x` contains "PDF" (case-sensitive).

    `x` may be None, in which case the answer is False.
    """
    if x is None:
        return False
    for sentence in x:
        if "PDF" in sentence:
            return True
    return False
# Flag the rows whose trimmed text mentions "PDF" and show them.
condition = df.sentences_cut.map(special_contains)
condition.sum()  # not the cell's last expression, so its value is not displayed
df[condition]

Unnamed: 0,found_on_url,go_to_url,home_url,qualification,course_name,broad_category,sentences,sentences_cut


In [87]:
# Merge the rows that describe the same course into one record, concatenating
# the sentence lists of all its rows (rows without sentences are skipped).
group_keys = ["home_url","course_name","qualification","broad_category"]
inputs = []
for (home_url,course_name,qualification,broad_category),_df in df.groupby(group_keys):
    all_sentences = [sentence
                     for page in _df.sentences if page is not None
                     for sentence in page]
    inputs.append(dict(home_url=home_url,course_name=course_name,sentences=all_sentences,
                       qualification=qualification,broad_category=broad_category))
new_df = pd.DataFrame(inputs)

In [100]:
# Keep one record per (home_url, course_name), then report the number of
# records and how many of them have at least one sentence.
new_df = new_df.drop_duplicates(["home_url","course_name"])
has_sentences = new_df.sentences.apply(lambda x : len(x)>0)
len(new_df),has_sentences.sum()

(375, 184)

In [95]:
import numpy as np
# Average number of sentences among the records that have any.
non_empty = new_df.sentences.apply(lambda x : len(x)>0)
sentence_counts = new_df.loc[non_empty,"sentences"].apply(len)
np.mean(sentence_counts)

334.04347826086956

In [32]:
# For the first found_on_url group only (note the trailing break), print the
# URL, the number of rows and the summed n_sentences value, followed by every
# non-missing sentence list separated by a dashed line.
# NOTE(review): n_sentences is not defined in any visible cell of this
# notebook — presumably it counts the sentences of one row; confirm and define
# it before this cell, otherwise a fresh Run All raises NameError here.
for _url,_df in df.groupby("found_on_url"):
    n = _df.sentences.apply(n_sentences)
    print(_url,len(_df),n.sum())
    for s in _df.sentences.values:
        if s is None:
            continue
        print(s)
        print()
        print("--------------")
        print()
    break

http://buid.ac.ae/CLDR-Finance-Scholarships 27 121
['Search this site', 'BUiD Masters Graudates Testimonials', 'Nesrin Abdul Zaher Tantawy', 'Master of Education- TESOL 2016', 'Lecturer at Al Dar University College', '“All I can say is that when I first set out on this journey, I was an entirely different person from what I have become now.\xa0 Before joining BUiD, I was a self-contained individual who would shrink into herself among strangers and would feel intimidated in the limelight, but now I am a different person.\xa0 I graduated from BUiD as a person who feels quite comfortable being in the spotlight. This whole programme with the amazing professors helped me tap into my academic skills which I never knew were that profound. I would like to take that opportunity to thank Prof. Eman Gaad along with all my dear professors for assisting me in laying such a strong foundation towards a more professional academic career”.', 'Mina Ghassan Radhwan', 'Master of Education- Science Educati

In [23]:
# Inspect the current column names of df.
# NOTE(review): the saved output lists a 'text' column where the loading cell
# assigns 'course_name' — looks like a stale output; re-run to confirm.
df.columns

Index(['found_on_url', 'go_to_url', 'home_url', 'qualification', 'text',
       'broad_category', 'sentences'],
      dtype='object')

In [65]:
# Scratch check of slice semantics: [:-1] drops the last element.
["a","b","c"][:-1]

['a', 'b']

In [97]:
# Save new_df to course_info.json using the "records" orientation.
new_df.to_json("course_info.json",orient="records")

In [101]:
# Preview the first rows of new_df.
new_df.head()

Unnamed: 0,broad_category,course_name,home_url,qualification,sentences
0,Education,Doctor of Education,http://www.buid.ac.ae,doctor,[]
1,Business/Finance,Master of Business Administration,http://www.buid.ac.ae,master,[]
2,Business/Finance,Master of Business Administration (Finance),http://www.buid.ac.ae,master,[]
3,Business/Finance,Master of Business Administration (General),http://www.buid.ac.ae,master,[]
4,Business/Finance,Master of Business Administration (Human Resou...,http://www.buid.ac.ae,master,[]
