In [3]:
import pandas as pd
from sqlalchemy import create_engine

PROJECT_ID = '702148157522'
LOCATION = 'us-central1'

# initialize a connection to Postgres SQL database
engine = create_engine(
   'postgresql://debug:debug@localhost/grant_api'
)


In [4]:
# Concatenate broad categories into labels, e.g. "disability" "area of study" "heritage"
OUT_FILE = '../data/scholarship_category_labels.csv'

# writes out a dataset of program name/description text and tag category

df = pd.read_sql("""
    SELECT CONCAT(program_name, ' ',program_description) as text, string_agg(distinct(category), ',') as labels
    FROM scholarships_tag
    join scholarships_scholarship_tags on scholarships_tag.id = scholarships_scholarship_tags.tag_id
    join scholarships_scholarship on scholarships_scholarship.id = scholarships_scholarship_tags.scholarship_id
    where scholarships_tag.category in (
    'religion',
    'academic_level',
    'hobby',
    'disciplines',
    'gender',
    'disability',
    'employment_sectors',
    'gpa',
    'heritage',
    'military_service'
    ) or scholarships_tag.category is null

    group by scholarships_scholarship.id

""", con=engine)


with open(OUT_FILE, 'a') as out_file:
    
    for idx, row in df.iterrows():
        # replace all instances of \" in text string
        # wrap text string in \"
        out_file.write("\"" + row['text'].replace('"', '') +"\"" + ',')
        if row['labels']:
            out_file.write(row['labels'])
        out_file.write('\n')
print('finished categories')

FileNotFoundError: [Errno 2] No such file or directory: '../data/scholarship_category_labels.csv'

In [None]:
# Aggregate raw tag text into labels
# This didn't work out well, because the labels contained duplication
# originally written to '../data/scholarship_tag_labels.csv'
OUT_FILE = 'data/scholarship_app_req_labels_v1.csv' '

# writes out a dataset of program name/description text and tag text
# long-tail tags that appear less than 100 times are omitted

df = pd.read_sql("""
    SELECT CONCAT(program_name, ' ',program_description) as text, array_to_string(array_agg(text),',') as labels
    FROM scholarships_tag
    join scholarships_scholarship_tags on scholarships_tag.id = scholarships_scholarship_tags.tag_id
    join scholarships_scholarship on scholarships_scholarship.id = scholarships_scholarship_tags.scholarship_id
    where scholarships_tag.text in (SELECT text from scholarships_tag
    join scholarships_scholarship_tags on scholarships_tag.id = scholarships_scholarship_tags.tag_id
    join scholarships_scholarship on scholarships_scholarship.id = scholarships_scholarship_tags.scholarship_id
    group by scholarships_tag.id
    having count(*) > 100
    order by count(*) DESC)

    group by scholarships_scholarship.id

""", con=engine)


with open(OUT_FILE, 'a') as out_file:
    
    for idx, row in df.iterrows():
        
        out_file.write("\"" + row['text'].replace('"', '') +"\"" + ',')
        if row['labels']:
            out_file.write(row['labels'])
        out_file.write('\n')
print('finished tag text')

df

In [None]:
# manually create highest-level labels, like "engineering" "science" "humanities" "undergrad"
# originally written to '../data/scholarship_top_level_v1.csv' 
OUT_FILE = 'data/scholarship_app_req_labels_v2.csv' 

# writes out a dataset of program name/description text and tag text
# long-tail tags that appear less than 100 times are omitted

df = pd.read_sql("""

SELECT CONCAT(program_name, ' ',program_description) as text, array_to_string(array_agg(text),',') as labels

from scholarships_scholarship
join scholarships_scholarship_training_labels on scholarships_scholarship_training_labels.scholarship_id =  scholarships_scholarship.id
join scholarships_traininglabel on scholarships_traininglabel.id = scholarships_scholarship_training_labels.traininglabel_id
where scholarships_traininglabel.text in (
SELECT text from scholarships_scholarship
join scholarships_scholarship_training_labels on scholarships_scholarship_training_labels.scholarship_id =  scholarships_scholarship.id
join scholarships_traininglabel on scholarships_traininglabel.id = scholarships_scholarship_training_labels.traininglabel_id
group by scholarships_traininglabel.id
having count(*) > 100
order by count(*) DESC
)
group by scholarships_scholarship.id

""", con=engine)


with open(OUT_FILE, 'a') as out_file:
    
    for idx, row in df.iterrows():
        
        out_file.write("\"" + row['text'].replace('"', '') +"\"" + ',')
        if row['labels']:
            out_file.write(row['labels'])
        out_file.write('\n')
print('finished tag text')

df

In [13]:
import pandas as pd
from sqlalchemy import create_engine

# initialize a connection to Postgres SQL database
engine = create_engine(
   'postgresql://debug:debug@localhost/grant_api'
)

# Basic application requirements labels, most are boolean
# transcript, bio, test scores, interview, essay, no essay, community service, 
# high school, undergrad, graduate, freshman, sophomore, junior, senior, two year college, four year college, trade school, 
# financial need

# stretch goals: state residence / state use

OUT_FILE = 'data/scholarship_app_req_labels_v3.csv' 

# writes out a dataset of program name/description text and tag text
# long-tail tags that appear less than 100 times are omitted

df = pd.read_sql("""

SELECT 
CONCAT(program_name, ' ',program_description) as text,
-- boolean fields
transcript,
bio as autobiography,
test_scores,
essay,
community_service,
high_school,
financial_need,
application_form,
application_fee,
interview,
"references",
contest,

-- varchar
gpa,


CASE WHEN military_service is not null THEN true
    ELSE false
END as military_service,

CASE WHEN disability is not null THEN true
    ELSE false
END as disability,

-- array fields
academic_level,
-- label 'undergraduate' if scholarship includes f/s/j/sr
CASE WHEN (
'FRESHMAN'=ANY(academic_level) or
'SOPHMORE'=ANY(academic_level) or
'JUNIOR'=ANY(academic_level) or
'SENIOR'=ANY(academic_level)
) THEN true
    ELSE false
END as undergrad,
institution_type



FROM 
scholarships_scholarship
join
scholarships_applicationrequirements on scholarships_applicationrequirements.id = scholarships_scholarship.application_requirements_id


""", con=engine)


with open(OUT_FILE, 'a') as out_file:
    
    for idx, row in df.iterrows():
        labels = []
        for column in df.columns:
            # exclude 'text' column, which contains program_name and program_description
            if column == 'text':
                continue
            # <class 'bool'>
            elif type(row[column]) is bool and row[column]:
                labels.append(column)

            # <class 'list'>
            elif type(row[column]) is list:
                labels = labels + row[column]
            # class <class 'str'> (gpa)
            elif type(row[column]) is str:
                labels.append(f'{column}_{row[column]}')

        out_file.write("\"" + row['text'].replace('"', '') +"\"" + ',')
        if len(labels) > 0:
            out_file.write(','.join(labels))
        out_file.write('\n')
print(f'finished writing {OUT_FILE}')

df

finished writing data/scholarship_app_req_labels_v3.csv


Unnamed: 0,text,transcript,autobiography,test_scores,essay,community_service,high_school,financial_need,application_form,application_fee,interview,references,contest,gpa,military_service,disability,academic_level,undergrad,institution_type
0,Wildlife Leadership Awards Program established...,False,False,False,False,False,False,False,True,False,False,False,False,,False,False,"[JUNIOR, SENIOR]",True,[FOUR-YEAR COLLEGE]
1,National High School Journalist of the Year/Si...,False,False,False,True,False,True,False,True,False,False,False,False,3.0,False,False,[FRESHMAN],True,[FOUR-YEAR COLLEGE]
2,Association for Iron and Steel Technology Balt...,True,False,True,True,False,False,False,True,False,False,False,False,,False,False,"[FRESHMAN, SOPHMORE, JUNIOR, SENIOR]",True,[FOUR-YEAR COLLEGE]
3,WOCN Accredited Nursing Education Program Scho...,False,False,False,False,False,False,False,True,False,False,False,False,,False,False,"[FRESHMAN, SOPHMORE, JUNIOR, SENIOR]",True,"[TWO-YEAR COLLEGE, FOUR-YEAR COLLEGE, TRADE OR..."
4,Appraisal Institute Education Trust Education ...,True,False,False,True,False,False,False,True,False,False,True,False,,False,False,"[SOPHMORE, JUNIOR, SENIOR, GRADUATE]",True,[FOUR-YEAR COLLEGE]
5,Father James B. Macelwane Annual Awards Availa...,False,False,False,False,False,False,False,False,False,False,False,False,3.0,False,False,"[SOPHMORE, JUNIOR, SENIOR]",True,"[TWO-YEAR COLLEGE, FOUR-YEAR COLLEGE]"
6,Astrid G. Cates and Myrtle Beinhauer Scholarsh...,True,False,True,True,True,False,False,True,False,False,True,False,3.0,False,False,"[FRESHMAN, SOPHMORE, JUNIOR, SENIOR]",True,"[TWO-YEAR COLLEGE, FOUR-YEAR COLLEGE, TRADE OR..."
7,Archaeology of Portugal Fellowship One-time aw...,False,False,False,True,False,False,False,True,False,False,False,False,,False,False,[GRADUATE],False,
8,Family Protection Specialist Social Worker For...,False,False,False,False,False,False,False,True,False,False,False,False,,False,False,[GRADUATE],False,[FOUR-YEAR COLLEGE]
9,Caleb L. Butler Scholarship Scholarship for gr...,True,False,False,True,False,True,True,True,False,False,False,False,,False,False,[FRESHMAN],True,"[TWO-YEAR COLLEGE, FOUR-YEAR COLLEGE]"


In [None]:
from google.cloud import automl_v1beta1 as automl

client = automl.AutoMlClient()
automl_account = client.location_path('get-grant', 'us-central1')

# scholarship_tagged
# TCN8770850244659049285

DATASET_ID='TCN8770850244659049285'
dataset_name = f'projects/{PROJECT_ID}/locations/{LOCATION}/datasets/{DATASET_ID}'

scholarship_tagged = client.get_dataset(dataset_name)

print(type(scholarship_tagged))
print(dir(scholarship_tagged))


scholarship_tagged.example_count
    


In [5]:
# Scraping scholarships by major

# Step 1:
# Scrape urls of each major

# https://www.scholarships.com/financial-aid/college-scholarships/scholarships-by-major/
# ul containing "$MajorName Scholarships" links
# class=bot10
# traverse through li
# <a href="/financial-aid/college-scholarships/scholarships-by-major/accounting-scholarships/">Accounting Scholarships</a>

import requests
import pandas as pd
from bs4 import BeautifulSoup

BASE_URL = 'https://www.scholarships.com'
OUT_FILE = 'data/scholarships_com_by_major_urls_v1.csv'

print(f'begin {OUT_FILE}')
# initialize DataFrame
scholarships_by_major_df = None

# get html
scholarships_by_majors_url = f'{BASE_URL}/financial-aid/college-scholarships/scholarships-by-major/'
scholarships_by_majors_page = requests.get(scholarships_by_majors_url)

# parse html
scholarships_by_majors_soup = BeautifulSoup(scholarships_by_majors_page.text, 'html.parser')

# finds all links in our element of interest (<ul class="bot10"></ul>)
majors_ul = scholarships_by_majors_soup.find('ul', attrs={'class': 'bot10'})
ahrefs = majors_ul.find_all('a')

for a in ahrefs:
    data = {
        'label': a.getText().replace(' Scholarships', '').lstrip().lower(),
        # e.g. "Accounting Scholarships" => "accounting"
        'url': f"{BASE_URL}{a['href']}",
        'label_type': 'major'
    }
    # initialize/concatenate data into dataframe
    if scholarships_by_major_df is None:
        scholarships_by_major_df = pd.DataFrame.from_records([data], columns=data.keys(), index='label')
    else:
        _df = pd.DataFrame.from_records([data], columns=data.keys(), index='label')
        scholarships_by_major_df = pd.concat([_df, scholarships_by_major_df], axis =0, sort=True)

scholarships_by_major_df.to_csv(OUT_FILE)
print(f'finished writing {OUT_FILE}')
scholarships_by_major_df


begin data/scholarships_com_by_major_urls_v1.csv
finished writing data/scholarships_com_by_major_urls_v1.csv


Unnamed: 0_level_0,label_type,url
label,Unnamed: 1_level_1,Unnamed: 2_level_1
social work,major,https://www.scholarships.com/financial-aid/col...
psychology,major,https://www.scholarships.com/financial-aid/col...
nursing,major,https://www.scholarships.com/financial-aid/col...
math,major,https://www.scholarships.com/financial-aid/col...
history,major,https://www.scholarships.com/financial-aid/col...
english,major,https://www.scholarships.com/financial-aid/col...
computer science,major,https://www.scholarships.com/financial-aid/col...
communications,major,https://www.scholarships.com/financial-aid/col...
business,major,https://www.scholarships.com/financial-aid/col...
biology,major,https://www.scholarships.com/financial-aid/col...


In [None]:
# Scraping scholarships by major

# Step 2:
# Scrape urls of each major

import pandas as pd
import requests
from bs4 import BeautifulSoup
import os.path
import re


scholarships_by_major_urls = pd.read_csv('data/scholarships_com_by_major_urls_v1.csv')
BASE_URL = 'https://www.scholarships.com'

def find_scholarship_divs_in_list(tag):
    '''
    https://www.crummy.com/software/BeautifulSoup/bs4/doc/#a-function
    
    Every scholarship.com list view uses the same format for scholarships.
    
    <div>
        <h3>program_name</h3>
        <ul>
            <li>application deadline</li>
            <li>amount</li>
            <li>description (truncated)
        </ul>
    </div>
    '''
    children = [child for child in tag.children if child != '\n']
    
    return len(children) == 2 and children[0].name == 'h3' and children[1].name == 'ul'

def clean_text(text):
    '''
        Prepares text for csv storage
    '''
    # escape quotes
    result = text.replace('"', '""')

    # remove whitespace
    return ''.join([char for char in result if char not in '"\n\r\t'])

def extract_contact_info(tag):
    '''
        Any of the contact fields can be empty  
    '''
    contact_title = None
    if tag.next_sibling and tag.next_sibling.next_sibling:
        contact_title_el = tag.next_sibling.next_sibling
        # replace whitespace
        contact_title = clean_text(contact_title_el.getText())

    contact_address1 = None
    contact_address1_el = details_soup.find('li', attrs={'id': 'liAddress1Text'})

    if contact_address1_el is not None:
        contact_address1 = clean_text(contact_address1_el.getText())

    contact_address2_el = details_soup.find('li', attrs={'id': 'liAddress2Text'})
    contact_address2 = None
    if contact_address2_el:
        contact_address2 = clean_text(contact_address2_el.getText())

    contact_city_state_zip_el = details_soup.find('li', attrs={'id': 'liCityStateZIPText'})
    contact_city_state_zip = None
    if contact_city_state_zip_el:
        contact_city_state_zip = clean_text(contact_city_state_zip_el.getText())

    contact_email = None
    contact_email_el = None
    if contact_city_state_zip_el and contact_city_state_zip_el.next_sibling:
        # occasionally, there's a div with \n as the inner content inserted here
        if contact_city_state_zip_el.next_sibling == '\n':
            if contact_city_state_zip_el.next_sibling.next_sibling:
                contact_email_el = contact_city_state_zip_el.next_sibling.next_sibling.find('a')
        else:
            contact_email_el = contact_city_state_zip_el.next_sibling.find('a')

        if contact_email_el:
            contact_email = clean_text(contact_email_el.getText())

    contact_phone = None

    if contact_email_el and contact_email_el.next_sibling:
        if contact_email_el.next_sibling == '\n':
            contact_phone_el = contact_email_el.next_sibling.next_sibling
        else:
            contact_phone_el = contact_email_el.next_sibling

        if contact_phone_el:  
            contact_phone = clean_text(contact_phone_el.getText())
    return {
        'contact_title': contact_title,
        'contact_address1': contact_address1,
        'contact_address2': contact_address2,
        'contact_city_state_zip': contact_city_state_zip,
        'contact_email': contact_email,
        'contact_phone': contact_phone 
    }

# for each major, scrape scholarships from major index page
for idx, row in scholarships_by_major_urls.iterrows():
    # get html
    page_html = requests.get(row['url']).text
    scholarships_details_df = None
    
    OUT_FILE = f"data/scholarships.com/{row['label_type']}-{(row['label'].replace(' ', '_'))}.csv"
    # skip file if it already exists
    if os.path.isfile(OUT_FILE):
        continue
    print(f'starting {OUT_FILE}')
    # parse html
    page_soup = BeautifulSoup(page_html, 'html.parser')
    section_div = page_soup.find('div', attrs={'class': 'innercontent'})
    
    scholarship_divs = [div for div in page_soup.find_all('div') if find_scholarship_divs_in_list(div)]
    
    for scholarship_div in scholarship_divs:
        # the scholarship description text in the list view is truncated, so we need to follow each url to the details view
        details_url = scholarship_div.find('a')['href']
        details_url = f"{BASE_URL}{details_url}"
        details_html = requests.get(details_url).text
        details_soup = BeautifulSoup(details_html, 'html.parser')
        
        program_name = details_soup.find('div', attrs={'class': 'innercontent'}).find('h1').getText()
        
        amount_raw, due_date_raw, num_available_raw = [
            h3.getText() for h3 in
            details_soup.find('div', attrs={'class': 'innercontent'}).find_all('h3')
        ]
        
        text_el = details_soup.find('li', attrs={'class': 'scholdescrip'}).find('div')
        program_description = text_el.getText()
        contact_info = extract_contact_info(text_el)
        
        data = {
            'details_url': details_url,
            'program_name': program_name,
            'program_description': program_description,
            **contact_info
        }
        if scholarships_details_df is None:
            scholarships_details_df = pd.DataFrame.from_records([data], columns=data.keys())
        else:
            _df = pd.DataFrame.from_records([data], columns=data.keys())
            scholarships_details_df = pd.concat([_df, scholarships_details_df], axis =0, sort=True)

    scholarships_details_df.to_csv(OUT_FILE)
    print(f'finished writing {OUT_FILE}')
    scholarships_details_df
    

starting data/scholarships.com/major-communications.csv
finished writing data/scholarships.com/major-communications.csv
starting data/scholarships.com/major-business.csv
