## add custom data, generate tooltips & drop probable duplicates 

---

In [1]:
import os
import time

import geopandas
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm.notebook import tqdm
flatten = lambda l: [i for s in l for i in s]

In [2]:
%%capture
%%tqdm.pandas()

### read data

In [3]:
boot_locs = pd.read_csv('boot_locs.csv')
uni_locs = pd.read_csv('degs_gcoded.csv')
len(boot_locs), len(uni_locs)

(973, 2740)

# --> add more data here <--

In [4]:
boot_locs = pd.concat([boot_locs,
                       
                      # additional bootcamp data here <----
                       
                      ]).reset_index(drop=True)

uni_locs = pd.concat([uni_locs,
                      
                      # additional college data here <-----
                      
                      ]).reset_index(drop=True)

---

## Format features for tooltips (University data)

In [5]:
online_map = {'No': 'In Person', 'Yes': 'Online', 'Blended':'In Person + Online (blended)'}
uni_locs.online = uni_locs.online.map(online_map)

In [6]:
def parse_enroll(en):
    splt = str(en).split(' & ')
    if len(splt)==1: return en
    return ' & '.join(set(splt))
uni_locs.enrollment = uni_locs.enrollment.apply(parse_enroll).value_counts()

In [7]:
uni_locs.dropna(subset=['latitude'], inplace=True)

for row in uni_locs.index:
    
    dp = uni_locs.loc[row, 'dur_period']
    dn = uni_locs.loc[row, 'dur_number']
       
    try:
        dn = int(dn) # adjust months/yeasr
        if dn<=18:
            leng = f'{int(dn)}'
            dp = 'month'
        else:
            leng = f'{round(int(dn)/12, 1)}'.rstrip('.0')
            dp = 'year'

        uni_locs.loc[row, 'duration'] = f'{leng} {dp}s'            
    
    except:
        uni_locs.loc[row, 'duration'] = f'(?)'
        
    deg_spec = uni_locs.loc[row, 'degree'] # pretty degree
    deg_class = uni_locs.loc[row, 'class']
    commitment = uni_locs.loc[row, 'enrollment']
    commitment = (str(commitment)+' ') if 'nan'!=str(commitment).lower()!='(?)' else ''
    online = uni_locs.loc[row, 'online']  
    online = '| '+str(online) if 'nan'!=str(online).lower()!='(?)' else ''
    if str(deg_spec).lower()!='nan' and deg_spec[:5].lower() != deg_class[:5].lower():
        uni_locs.loc[row, 'pretty_degree'] = f'<b>{commitment}{deg_class} ({deg_spec})</b> {online}'
    else:
        uni_locs.loc[row, 'pretty_degree'] = f'<b>{commitment}{deg_class} </b> {online}'
    
uni_locs = uni_locs.fillna('(?)')

In [8]:
uni_locs.pretty_degree.value_counts().sample(10)

<b>Bachelor's </b> | In Person + Online (blended)            7
<b>Master's (M.Eng.)</b> | In Person + Online (blended)      1
<b>Master's (M.A.)</b> | In Person                          36
<b>Bachelor's (B.Sc.)</b> | In Person                      610
<b>Master's (M.Litt.)</b> | In Person                        3
<b>Ph.D. </b> | In Person + Online (blended)                 2
<b>Master's (M.Sc.)</b> | Online                            46
<b>Master's (M.A.)</b> | In Person + Online (blended)        2
<b>Master's (M.B.A.)</b> | In Person                        15
<b>Bachelor's (B.A.)</b> | In Person                        80
Name: pretty_degree, dtype: int64

#### fix online indicator:

In [9]:
for ridx in uni_locs.index:
    
    if 'online' in uni_locs.loc[ridx, 'course_name'].lower():
        uni_locs.loc[ridx, 'online'] = 'Yes'
    

## add custom degrees data from African countries

In [10]:
pd.read_csv('africa_degs.csv',)[uni_locs.columns]

Unnamed: 0,course_name,institution,location,description,cost,dur_period,dur_number,class,degree,enrollment,online,in_person,blended_learning,geo_search,gmaps_info,coord,latitude,longitude,duration,pretty_degree
0,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"Limbe, Cameroon",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,4.019991,9.196396,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
1,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"Biriwa, Ghana",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,5.168048,-1.144657,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
2,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"MBour, Sénégal",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,14.392066,16.958084,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
3,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"Muizenberg, South Africa",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,-34.107178,18.470513,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
4,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"Bagamoyo, Tanzania",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,-6.445638,38.898731,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
5,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"Kigali, Rwanda",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,-1.961283,30.112964,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
6,Data Analytics and Business Intelligence,University of the Western Cape,"Cape Town, South Africa",,R50 000,,,Master's,,Part-time,In Person,Yes,No,,,,-33.933529,18.628591,18 months,<b>Part-time Master's</b> | In Person


In [15]:
# I generated this data manually through research:
a_degs = pd.read_csv('african_masters_degs.csv',)[uni_locs.columns]

uni_locs = pd.concat([uni_locs, a_degs]).reset_index(drop=True)

a_boot = pd.read_csv('africa_boots.csv',)[boot_locs.columns]
boot_locs = pd.concat([boot_locs, a_boot]).reset_index(drop=True)


uni_locs.latitude = uni_locs.latitude.astype(float)
uni_locs.longitude = uni_locs.longitude.astype(float)

boot_locs.latitude = boot_locs.latitude.astype(float)
boot_locs.longitude = boot_locs.longitude.astype(float)

---

## Format features for tooltips (Bootcamp data)

In [17]:
boot_locs['pretty_camp'] = '<b>'+boot_locs['Commitment'].fillna('').apply(lambda x: x.replace(' ', '-' ).capitalize()
                                        )+ ' Bootcamp</b> | ' + boot_locs['Course Type'].fillna('')
boot_locs = boot_locs.fillna('(?)')

In [18]:
# drop bootcamps without one of these terms in the title

def topic_check(c):
    c = c.lower().replace('-', ' ')
    
    hot_words = ['data', 'stat', 'analy', 'busin', 'python'
                 'intell', 'informat', 'fintech', 'cybersecur',
                 'machine learn', 'machine-learn', 'optimiz',
                 'geosp', 'ter sci', 'guage proc']
    
    if any(kword in c for kword in hot_words):
        return True
    return False

for row_idx in boot_locs.index:
    c_name = boot_locs.loc[row_idx, 'course_name']
    if not topic_check(c_name): boot_locs.drop(row_idx, inplace=True)

---

## final duplicates check:

universities are the main problem

In [19]:
##### drop english language courses
uni_locs = uni_locs[~uni_locs.course_name.str.contains('English Language')]

# isolatae african progs (to save because manually added)
african_progs = uni_locs[uni_locs.institution=='African Institute for Mathematical Sciences']

In [20]:
len(uni_locs) + len(boot_locs)

3387

In [21]:
# check for duplicates

verb = False # activate for tuning

def parse_title(title):
    title = str(title).split('(')[0].lower()
    return title

dropped = 0
scanned = 0
for ridx in uni_locs.index: # compare each uni program against all others
    scanned += 1 # to print progress 
    if ridx not in uni_locs.index: continue # already dropped
    
    # parameters to compare:
    rcourse = parse_title(uni_locs.loc[ridx, 'course_name']) # track name
    rinst = uni_locs.loc[ridx, 'institution']
    rclass = uni_locs.loc[ridx, 'class'] # bachelors, masters or phd
    blat = round(uni_locs.loc[ridx, 'latitude'], 3) # generalized location
    blng = round(uni_locs.loc[ridx, 'latitude'], 3)
    
    if 'African Institute for Mathematical Sciences' in rinst: continue
        # these were all manually added and should be preserved
    
    for cidx in uni_locs.drop(ridx).index:
        if ridx not in uni_locs.index: continue # (already dropped, don't check)
        
        # parameters to compare:
        ccourse = parse_title(uni_locs.loc[cidx, 'course_name'])
        cinst = uni_locs.loc[cidx, 'institution']
        cclass = uni_locs.loc[cidx, 'class']
        clat = round(uni_locs.loc[cidx, 'latitude'], 3) # same general location
        clng = round(uni_locs.loc[cidx, 'latitude'], 3)
        
        # combined match ratios:
        if fuzz.ratio(rinst, cinst) > 95 and rclass==cclass:
            if fuzz.partial_ratio(rcourse, ccourse) > 95:
                if blat==blng and clat==clng:
                    rnan = uni_locs.loc[ridx].isna().sum() + (uni_locs.loc[ridx]=='(?)').sum()
                    cnan = uni_locs.loc[cidx].isna().sum() + (uni_locs.loc[cidx]=='(?)').sum()
                    if rnan>cnan:
                        uni_locs.drop(ridx, inplace=True)
                        keeping = f'{cclass[:2]}: {ccourse} ({cinst}) - ({cnan} NA)'
                        dropping = f'{rclass[:2]}: {rcourse} ({rinst}) - ({rnan} NA)'
                    else:
                        uni_locs.drop(cidx, inplace=True)
                        dropping = f'{cclass[:2]}: {ccourse} ({cinst}) - ({cnan} NA)'
                        keeping = f'{rclass[:2]}: {rcourse} ({rinst}) - ({rnan} NA)'
                    dropped +=1
                    
                    if verb:
                        print('keeping:', keeping)
                        print('dropping:', dropping)
                        print('—'*80)
    
    if ridx%10==0 and not verb:
        print(f'\rDropped: {dropped} | Scanned: {scanned}/{len(uni_locs)+dropped}', end='')

Dropped: 0 | Scanned: 1/2735

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Dropped: 274 | Scanned: 2712/2735

# SAVE

In [23]:
uni_locs.to_csv('degs_to_map.csv', index=False)
boot_locs.to_csv('boot_to_map.csv', index=False)

In [24]:
len(uni_locs) + len(boot_locs)

3112

In [25]:
uni_locs

Unnamed: 0,course_name,institution,location,description,cost,dur_period,dur_number,class,degree,enrollment,online,in_person,blended_learning,geo_search,gmaps_info,coord,latitude,longitude,duration,pretty_degree
0,"Business Administration, Concentration in Mana...",San José State University,"San Jose, California, United States","Business Administration, Concentration in Mana...","4,963 EUR / year",month,48,Bachelor's,B.Sc.,(?),In Person,No,No,"San José State University, San Jose, Californi...","[{'address_components': [{'long_name': '1', 's...","(37.3351874, -121.8810715)",37.335187,-121.881072,4 years,<b>Bachelor's (B.Sc.)</b> | In Person
1,Information Engineering,Hamburg University of Applied Sciences,"Hamburg, Germany",Today‘s complex information processing systems...,0 EUR / year,month,36,Bachelor's,B.Sc.,(?),In Person,No,No,"Hamburg University of Applied Sciences, Hambur...","[{'address_components': [{'long_name': '5', 's...","(53.5560114, 10.0227163)",53.556011,10.022716,3 years,<b>Bachelor's (B.Sc.)</b> | In Person
2,Health Care Informatics,University of Illinois Springfield,"Springfield, Illinois, United States",The Health Care Informatics bachelor’s degree ...,"26,437 EUR / year",month,48,Bachelor's,B.Sc.,(?),In Person,No,No,"University of Illinois Springfield, Springfiel...",[{'address_components': [{'long_name': 'Spring...,"(39.7301376, -89.61852689999999)",39.730138,-89.618527,4 years,<b>Bachelor's (B.Sc.)</b> | In Person
3,Information Systems Technology - Information S...,Regent University,Online,The Bachelor of Science in Information Systems...,496 EUR / credit,month,48,Bachelor's,B.Sc.,(?),In Person,No,No,"Regent University, Online","[{'address_components': [{'long_name': '1000',...","(36.7993593, -76.1925395)",36.799359,-76.192539,4 years,<b>Bachelor's (B.Sc.)</b> | In Person
4,Mathematics Operational Research and Statistics,Cardiff University,"Cardiff, Wales, United Kingdom",Combine statistics and operational research wi...,"24,160 EUR / year",month,36,Bachelor's,B.Sc.,(?),In Person,No,No,"Cardiff University, Cardiff, Wales, United Kin...",[{'address_components': [{'long_name': 'Cardif...,"(51.48662710000001, -3.1788641)",51.486627,-3.178864,3 years,<b>Bachelor's (B.Sc.)</b> | In Person
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2738,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"Biriwa, Ghana",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,5.168048,-1.144657,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
2739,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"MBour, Sénégal",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,14.392066,-16.958084,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
2740,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"Muizenberg, South Africa",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,-34.107178,18.470513,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
2741,African Master’s in Machine Intelligence,African Institute for Mathematical Sciences,"Bagamoyo, Tanzania",,(?),,,Master's,,Full-time,In Person,Yes,No,,,,-6.445638,38.898731,1 year,<b>Full-time Master's (AMMI)</b> | In Person &...
