In [1]:
%matplotlib inline
import pandas as pd
from ast import literal_eval
import numpy as np
from collections import OrderedDict
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer

In [45]:
required_features = ["of","in"]
def contains_required(x):
    return any(r in x.split() 
               for r in required_features)

with open("courses-tier0.json") as f:
    js = literal_eval(f.read())

df = pd.DataFrame(js)    
df.columns = ['found_on_url', 'go_to_url', 'home_url', 'qualification', 'course_name']

df = df.loc[~pd.isnull(df.qualification)]
df = df.loc[list(map(contains_required,df.course_name))]
df = df.drop_duplicates(["go_to_url","course_name"])

In [46]:
kws = list((("Education",('education','teaching')),
                   ("Science",('bio',)),
                   ("Comp/IT",('comp',' it ','info','network','software')),
                   ("Engineering",('eng',)),
                   ("Medical",('health','medic','nursing','pharma')),
                   ("Business/Finance",('business','manag','admin','financ','commerce','account')),
                   ("Science",('science',)),
                   ("Other",('',))))

def assign_limit(x):
    for label,kw in kws:
        if any(k in x.lower() for k in kw):
            return label
df["broad_category"] = df.course_name.apply(assign_limit)

In [47]:
lemmatizer = WordNetLemmatizer()
_stops = set(stopwords.words("english"))

'''
Split words by tokens, including numbers as tokens. Useful for splitting URLs.
'''
def tokenize_alphanum(text):
    words = list(filter(('').__ne__, re.split('[^a-zA-Z]',text)))
    _words = []
    for w in words:
        w = w.lower()
        w = lemmatizer.lemmatize(w)
        _words.append(w)
    return _words

In [48]:
from bs4.element import Comment

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]','a']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def len_sentence(x):
    try:
        return len(tokenize_alphanum(x))
    except TypeError as err:
        print("Error with",x)
        raise err      

def get_sentences(row):
    #irow,row = row_info
    url = row["go_to_url"]
    home_url = row["home_url"] 
    found_on = row["found_on_url"]    
    if pd.isnull(url):
        return None
    try:
        r = requests.get(url)
    except Exception:
        try:
            r = requests.get(home_url+"/"+url)
        except Exception:
            try:
                r = requests.get(found_on+"/"+url)
            except Exception as err:
                print(url,found_on,home_url)
                return None
    if r.status_code != 200:
        #print(url,"not found")
        return None

    soup = BeautifulSoup(r.text,"lxml")
    texts = soup.findAll(text=True)
    visible_texts = list(filter(tag_visible, texts))
    if len(visible_texts) == 0:
        return None
    
    return [t.strip() for t in visible_texts
            if len_sentence(t) > 0]

print("starting")
sentences = []
for i,(_,row) in enumerate(df.iterrows()):
    print(i,"/",len(df),":",row.course_name)
    s = get_sentences(row) 
    sentences.append(s)
df["sentences"] = sentences 

starting
0 / 453 : Master of Science in Clinical Pharmacy
1 / 453 : Master in Private Law
2 / 453 : Master in Public Law
3 / 453 : Master of Art in Teaching English to Speakers of Other Languages
4 / 453 : Master of Business Administration in:
5 / 453 : Bachelor of Science in Computer Engineering
6 / 453 : Bachelor of Science in Network and Communications Engineering
7 / 453 : Bachelor of Science in Computer Science
8 / 453 : Bachelor of Science in Software Engineering
9 / 453 : Bachelor of Science in Pharmacy
10 / 453 : Bachelor of Arts in Law
11 / 453 : Bachelor of Arts in Applied Psychology
12 / 453 : Bachelor of Arts in Applied Sociology
13 / 453 : Bachelor of Education in Special Education
14 / 453 : Bachelor of Business Management - Management
15 / 453 : Bachelor of Business Management - Accounting
16 / 453 : Bachelor of Business Management - Finance & Banking
17 / 453 : Bachelor of Business Management - Marketing
18 / 453 : Bachelor of Business Management - Human Resources Manag

In [71]:
import math
def get_middle(sentences):
    if sentences is None:
        return None
    sentences = sentences[:-2]
    start = 0
    for i,s in enumerate(sentences):
        start = i
        if len(s.split()) > 5:
            break
    if start == 210:
        print(sentences)
    end = -1
    for i,s in enumerate(reversed(sentences)):
        end = -(i+1)
        if len(s.split()) > 5:
            break
    print(start,end,len(sentences))
    return sentences[start:end]

df["sentences_cut"] = df["sentences"].apply(get_middle)

17 -8 44
4 -8 12
4 -8 12
4 -8 12
4 -8 12
4 -8 12
4 -8 12
4 -8 12
15 -16 16
4 -8 12
4 -8 12
4 -8 13
4 -8 12
4 -8 12
4 -8 12
9 -10 10
9 -10 10
21 -10 107
21 -10 72
21 -10 106
21 -10 200
21 -10 207
21 -10 206
21 -10 149
28 -10 136
28 -10 143
28 -10 139
25 -10 130
25 -10 127
21 -10 72
21 -10 71
21 -10 71
28 -10 71
21 -10 72
28 -10 88
29 -10 86
21 -10 79
26 -10 91
26 -10 79
27 -10 93
21 -10 86
21 -10 77
21 -10 200
21 -10 207
21 -10 206
21 -10 123
21 -10 116
9 -1 16
4 -1 30
4 -2 91
4 -2 93
4 -2 80
4 -2 85
4 -2 92
4 -2 86
4 -2 86
4 -2 91
0 -4 61
0 -4 47
0 -4 53
0 -4 44
0 -4 175
0 -4 177
0 -4 163
0 -4 53
5 -6 29
2 -6 28
2 -6 31
2 -6 28
2 -6 32
2 -6 26
2 -6 26
2 -6 25
2 -6 39
5 -6 23
5 -6 21
5 -6 17
5 -6 15
2 -6 20
2 -6 22
2 -6 21
2 -6 21
5 -6 28
5 -6 29
2 -8 37
5 -6 20
2 -6 47
2 -6 48
2 -6 49
2 -6 50
2 -6 46
2 -6 48
2 -6 57
5 -6 13
2 -6 18
2 -6 13
2 -6 19
2 -6 25
2 -6 26
2 -6 38
5 -15 393
4 -15 367
4 -15 367
4 -15 367
4 -15 367
4 -15 367
1 -15 390
2 -15 424
0 -11 12
0 -1 15
0 -12 13
0 -12 13
0

In [69]:
df.sentences_cut.values[0]

['Admission Requirements for M.Sc. in Clinical Pharmacy',
 '1. General Admission Requirements:',
 'To be accepted for the M.Sc. in clinical pharmacy program, the applicant must:',
 'Hold a Bachelor degree of pharmacy obtained from a university recognized by MOHESR. A\xa0Higher diploma is not equivalent to a baccalaureate degree and does not qualify an applicant for\xa0admission to the M.Sc. program.',
 'Have a minimum Cumulative Grade Point Average (CGPA) of 3.0 on a 4.0 point scale (or its\xa0established equivalent) in the applicant’s Bachelor degree program. Applicants must provide\xa0official transcripts of all earned undergraduate credits.',
 'have a minimum TOEFL score of 550 on the Paper-Based, 213 on the Computer-Based, or 79\xa0on the Internet Based test, or the equivalent score on another standardized test approved by\xa0MOHESR such as IELTS score of 6.0, the following are excepted:',
 'A native speaker of English who has completed his/her undergraduate education in an English

In [50]:
inputs = []
for _,_df in df.groupby(["home_url","course_name"]):
    print(_df)
    break
    all_sentences = []
    for sentences in _df.sentences:
        if sentences is None:
            continue
        all_sentences +=  sentences
    

                                     found_on_url go_to_url  \
5143  http://buid.ac.ae/CLDR-Finance-Scholarships      None   

                   home_url qualification          course_name broad_category  \
5143  http://www.buid.ac.ae        doctor  Doctor of Education      Education   

     sentences sentences_cut  
5143      None          None  


In [32]:
for _url,_df in df.groupby("found_on_url"):
    n = _df.sentences.apply(n_sentences)
    print(_url,len(_df),n.sum())
    for s in _df.sentences.values:
        if s is None:
            continue
        print(s)
        print()
        print("--------------")
        print()
    break

http://buid.ac.ae/CLDR-Finance-Scholarships 27 121
['Search this site', 'BUiD Masters Graudates Testimonials', 'Nesrin Abdul Zaher Tantawy', 'Master of Education- TESOL 2016', 'Lecturer at Al Dar University College', '“All I can say is that when I first set out on this journey, I was an entirely different person from what I have become now.\xa0 Before joining BUiD, I was a self-contained individual who would shrink into herself among strangers and would feel intimidated in the limelight, but now I am a different person.\xa0 I graduated from BUiD as a person who feels quite comfortable being in the spotlight. This whole programme with the amazing professors helped me tap into my academic skills which I never knew were that profound. I would like to take that opportunity to thank Prof. Eman Gaad along with all my dear professors for assisting me in laying such a strong foundation towards a more professional academic career”.', 'Mina Ghassan Radhwan', 'Master of Education- Science Educati

In [23]:
df.columns

Index(['found_on_url', 'go_to_url', 'home_url', 'qualification', 'text',
       'broad_category', 'sentences'],
      dtype='object')

In [65]:
["a","b","c"][:-1]

['a', 'b']