# Imports

In [1]:
## imports
import pandas as pd
import re
import numpy as np

## print multiple things from same cell
## (with "all", every top-level expression in a cell is displayed, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load data and show examples

In [2]:
## location of the schools file, relative to this notebook
schools_csv_path = "../public_data/schools_df.csv"

## read it in and preview the first few rows
schools_df = pd.read_csv(schools_csv_path)
schools_df.head()

Unnamed: 0,schoolname,individualispjune2020,participatingincepsy2021,is_elem_exercise,is_charter_exercise,is_highschool_exercise
0,stove prairie elementary school,0.0,N,True,False,False
1,stewart county elementary school,0.7603,Y,True,False,False
2,desert springs elementary school,,N,True,False,False
3,saunemin elem school,0.3893999999999999,N,True,False,False
4,fifth district elementary,0.0275,N,True,False,False


# re.sub illustration

**Task**: 

- Use the `schools_df` dataset and filter to `is_elem_exercise` == True 
- Using the `schoolname` field, replace the different varieties of elementary school in the data with `elemschool` 

## Incorrect approach 

This approach returns incorrect results, as the output below shows.

In [3]:
## keep only the rows flagged for the elementary exercise;
## .copy() gives an independent frame we can add columns to
is_elem_row = schools_df["is_elem_exercise"]
elem_ex = schools_df.loc[is_elem_row].copy()
elem_ex.head()

Unnamed: 0,schoolname,individualispjune2020,participatingincepsy2021,is_elem_exercise,is_charter_exercise,is_highschool_exercise
0,stove prairie elementary school,0.0,N,True,False,False
1,stewart county elementary school,0.7603,Y,True,False,False
2,desert springs elementary school,,N,True,False,False
3,saunemin elem school,0.3893999999999999,N,True,False,False
4,fifth district elementary,0.0275,N,True,False,False


In [4]:
## alternatives are tried left to right, so "elementary" and "elem" win before
## "elem\." or "elementary school" get a chance -- this is why the trailing
## " school" and "." are left behind in the output below
elem_pattern = r"elementary|elem|elem\.|elementary school"

## substitute in every school name
new_schools = list(map(lambda name: re.sub(elem_pattern, "elemschool", name),
                       elem_ex.schoolname))

## store next to the original name and compare
elem_ex['cleaned_name_try1'] = new_schools
elem_ex[["schoolname", "cleaned_name_try1"]]

Unnamed: 0,schoolname,cleaned_name_try1
0,stove prairie elementary school,stove prairie elemschool school
1,stewart county elementary school,stewart county elemschool school
2,desert springs elementary school,desert springs elemschool school
3,saunemin elem school,saunemin elemschool school
4,fifth district elementary,fifth district elemschool
5,paint branch elementary,paint branch elemschool
6,oak hill elem.,oak hill elemschool.
7,lewis and clark elem.,lewis and clark elemschool.
8,linden elementary school,linden elemschool school
9,winchester avenue elementary school,winchester avenue elemschool school


In [None]:
## Better way to apply to each row in the column:
## Series.apply runs the substitution on every name without an explicit loop
elem_ex["schoolname"].apply(lambda name: re.sub(elem_pattern, "elemschool", name))

## A correct approach

Addresses issues above with `elementary school` and `elem.`

In [5]:
## "elem", "elem." or "elementary", optionally followed by " school".
## Each variant is spelled out instead of using a greedy `elem.*`, which would
## swallow (and delete) everything that comes after "elem" in the name.
## The \b anchors keep "elem" from matching inside a longer word, and the
## alternatives are ordered so that "elem." takes its period along.
elem_pattern_try2 = r"\belem(?:entary\b|\.|\b)(?:\s+school)?"

new_schools_try2 = [re.sub(elem_pattern_try2, "elemschool", school)
                    for school in elem_ex.schoolname]

## store next to the first attempt so the two can be compared side by side
elem_ex['cleaned_name_try2'] = new_schools_try2
elem_ex[["schoolname", "cleaned_name_try1", "cleaned_name_try2"]]

Unnamed: 0,schoolname,cleaned_name_try1,cleaned_name_try2
0,stove prairie elementary school,stove prairie elemschool school,stove prairie elemschool
1,stewart county elementary school,stewart county elemschool school,stewart county elemschool
2,desert springs elementary school,desert springs elemschool school,desert springs elemschool
3,saunemin elem school,saunemin elemschool school,saunemin elemschool
4,fifth district elementary,fifth district elemschool,fifth district elemschool
5,paint branch elementary,paint branch elemschool,paint branch elemschool
6,oak hill elem.,oak hill elemschool.,oak hill elemschool
7,lewis and clark elem.,lewis and clark elemschool.,lewis and clark elemschool
8,linden elementary school,linden elemschool school,linden elemschool
9,winchester avenue elementary school,winchester avenue elemschool school,winchester avenue elemschool


# re.findall and re.search illustrations

**Task**: 

- Filter to `is_charter_exercise` == True; note that this contains a mix of schools with charter in the name and schools without
- Construct a pattern that, for charter schools, gets the school name prior to appearance of the word charter. School names without charter will not have matches (so Hanover Charter becomes Hanover; Hanover High stays Hanover High)


## re.findall 

In [6]:
## rows flagged for the charter exercise
## (a mix of names with and without "charter")
in_charter_ex = schools_df["is_charter_exercise"]
charter_ex = schools_df.loc[in_charter_ex].copy()
charter_ex.head(6)

Unnamed: 0,schoolname,individualispjune2020,participatingincepsy2021,is_elem_exercise,is_charter_exercise,is_highschool_exercise
10,frontier elementary school,8.51%,N,False,True,False
11,life source international charter,0.7201946472019465,Y,False,True,False
12,east valley senior high,0.45807770961145194,Y,False,True,False
13,children's community charter,0.8888888888888888,Y,False,True,False
14,south fork elementary,0.49640287769784175,Y,False,True,False
15,thomas edison charter academy ...,0.2855191256830601,N,False,True,False


In [13]:
## charter pattern:
## group 1 = name before "charter", group 2 = the word "charter",
## group 3 = optional whitespace after it, group 4 = optional next word
charter_pattern = r"(.*)\s+(charter)(\s+)?(\w+)?"

## findall: one list per school holding a tuple of the groups,
## or an empty list when the name has no match
test_charter_findall = list(map(lambda name: re.findall(charter_pattern, name),
                                charter_ex.schoolname))

## print result
test_charter_findall

[[],
 [('life source international', 'charter', '', '')],
 [],
 [("children's community", 'charter', '', '')],
 [],
 [('thomas edison', 'charter', ' ', 'academy')],
 [('moving everest', 'charter', ' ', 'school')],
 [],
 [],
 [('south valley academy', 'charter', ' ', 'school')],
 [('brighter choice', 'charter', ' ', 'school')],
 [('buffalo collegiate', 'charter', ' ', 'school')],
 [('neighborhood', 'charter', ' ', 'school')],
 [],
 [],
 []]

In [14]:
## show example of one:
## school at position 1 -> its first match tuple -> first group (pre-charter name)
first_charter_hit = test_charter_findall[1][0]
print(first_charter_hit[0])

life source international


## re.search

In [15]:
## get matches: re.search gives one Match object per school,
## or None when the pattern is not found
test_charter_search = list(map(lambda name: re.search(charter_pattern, name),
                               charter_ex.schoolname))

test_charter_search


[None,
 <re.Match object; span=(0, 33), match='life source international charter'>,
 None,
 <re.Match object; span=(0, 28), match="children's community charter">,
 None,
 <re.Match object; span=(0, 29), match='thomas edison charter academy'>,
 <re.Match object; span=(0, 29), match='moving everest charter school'>,
 None,
 None,
 <re.Match object; span=(0, 35), match='south valley academy charter school'>,
 <re.Match object; span=(0, 30), match='brighter choice charter school'>,
 <re.Match object; span=(0, 33), match='buffalo collegiate charter school'>,
 <re.Match object; span=(0, 27), match='neighborhood charter school'>,
 None,
 None,
 None]

In [16]:
## extract matches

### position 5 is the 6th school in the list and the 3rd one that matched
### (thomas edison charter academy)
thomas_match = test_charter_search[5]
thomas_match

### indexing a Match object with 1 is the same as calling .group(1):
### here, the name of the school before "charter"
thomas_firstgroup = thomas_match[1]
thomas_firstgroup

<re.Match object; span=(0, 29), match='thomas edison charter academy'>

'thomas edison'

In [17]:
### group 0 is the whole match; groups 1..n are the parenthesised pieces
n_groups = len(thomas_match.groups())
for i in range(n_groups + 1):
    print(f"Group {i} is: ")
    print(thomas_match.group(i))

## asking for a group beyond n_groups raises an error,
## e.g. thomas_match.group(5)

Group 0 is: 
thomas edison charter academy
Group 1 is: 
thomas edison
Group 2 is: 
charter
Group 3 is: 
 
Group 4 is: 
academy


In [18]:
## .groups() hands back every captured group at once, as a tuple
## (useful when we want e.g. group 1 and group 2 together)
thomas_groups_all = thomas_match.groups()
thomas_groups_all

## first two entries of the tuple
thomas_groups_all[:2]


('thomas edison', 'charter', ' ', 'academy')

('thomas edison', 'charter')

In [19]:
## can generalize to the full list with ifelse
def get_precharter_name(one_matchobj):
    """Return the text captured before "charter" (group 1), or "" if there was no match."""
    return one_matchobj.group(1) if one_matchobj else ""

## run the helper over every search result (a Match object or None)
all_charter_match = list(map(get_precharter_name, test_charter_search))

all_charter_match

['',
 'life source international',
 '',
 "children's community",
 '',
 'thomas edison',
 'moving everest',
 '',
 '',
 'south valley academy',
 'brighter choice',
 'buffalo collegiate',
 'neighborhood',
 '',
 '',
 '']

# Group activity

## Part 1: Subsetting
Filter the data to only those rows where `is_highschool_exercise` is True.

In [92]:
# keep only the rows flagged for the high school exercise
hs_rows = schools_df["is_highschool_exercise"]
highschool_df = schools_df.loc[hs_rows].copy()
highschool_df

Unnamed: 0,schoolname,individualispjune2020,participatingincepsy2021,is_elem_exercise,is_charter_exercise,is_highschool_exercise
26,mount pleasant area jshs,,N,False,False,True
27,huron high school,0.2867,N,False,False,True
28,thomson high school,0.4065,Y,False,False,True
29,kings county office of education highland faci...,,N,False,False,True
30,clovis east high,0.2634956587391468,N,False,False,True
31,camden jr. high school,0.0,N,False,False,True
32,jackson junior high,0.3253333333333333,Y,False,False,True
33,emmett junior high school,,N,False,False,True
34,atkins high,0.1024,N,False,False,True
35,lexington senior high,0.485,Y,False,False,True


## Part 2: Standardizing names
To find the names of high schools, try out some regex patterns to standardize the high school names (e.g., 'high school' and 'high' could both become 'highschool'). AKA make everything 'highschool.'

**Hint:** Look at the school names for hints on what to avoid matching--e.g., 'highland facility'. To avoid things like this, after 'high' or 'hs', have your pattern look for a space (`\s`) or the end of the string (`$`). 

In [93]:
# "high", "hs" or "jshs" as a whole word, optionally followed by " school".
# - \b keeps "hs" from matching the tail of another word (e.g. "paths")
# - the lookahead (?=\s|$) requires a space or end-of-string after the keyword
#   (so "highland" is skipped) WITHOUT consuming that space; consuming it would
#   glue the replacement to the next word ("xx high yy" -> "xx highschoolyy")
hs_sub_pattern = r"\b(?:high|hs|jshs)(?=\s|$)(?:\s+school)?"

# Try testing it on a few strings first
test_string = "xx high"
re.sub(hs_sub_pattern, "highschool", test_string)


'xx highschool'

In [94]:
# standardize every school name with the pattern defined above
cleaned_names = [re.sub(hs_sub_pattern, "highschool", name)
                 for name in highschool_df["schoolname"]]
highschool_df["cleaned_name"] = cleaned_names
highschool_df.head()

Unnamed: 0,schoolname,individualispjune2020,participatingincepsy2021,is_elem_exercise,is_charter_exercise,is_highschool_exercise,cleaned_name
26,mount pleasant area jshs,,N,False,False,True,mount pleasant area highschool
27,huron high school,0.2867,N,False,False,True,huron highschool
28,thomson high school,0.4065,Y,False,False,True,thomson highschool
29,kings county office of education highland faci...,,N,False,False,True,kings county office of education highland faci...
30,clovis east high,0.2634956587391468,N,False,False,True,clovis east highschool


## Part 3: Match schools
Using some example results, try writing a regex pattern and using `re.match` to get the name of the school that precedes the 'highschool' part of the name (e.g., 'new trier highschool' -> 'new trier')

In [97]:
# group 1 = everything before " highschool"; group 2 = " highschool" itself
match_pattern = r"(.+)(\shighschool)"

# re.match anchors at the start of each cleaned name; no match gives None
match_list = list(map(lambda name: re.match(match_pattern, name),
                      highschool_df["cleaned_name"]))
# spot check on the second school in the frame
match_list[1].group(1)

'huron'

In [102]:
# NOTE: a plain filter such as
#     [m.group(1) for m in match_list if m != None]
# drops the schools that did not match, so the result would be shorter than
# the frame and could not be assigned as a column -- hence the helper below.

def extract(m):
    """Return group 1 of a match object, or a placeholder when there is no match.

    Parameters
    ----------
    m : re.Match or None
        Result of `re.match` for one school name.

    Returns
    -------
    str
        The text captured by the first group, or the placeholder
        "Nuclear Facility" when `m` is None.
    """
    # identity check is the idiomatic (PEP 8) way to test for None
    if m is None:
        return "Nuclear Facility"
    return m.group(1)

# one short name per row, in the same order as the frame
highschool_df["short_name"] = list(map(extract, match_list))
highschool_df["short_name"]

26    mount pleasant area
27                  huron
28                thomson
29       Nuclear Facility
30            clovis east
31             camden jr.
32         jackson junior
33          emmett junior
34                 atkins
35       lexington senior
36                 temple
37            forest hill
38             pittsfield
39               matanzas
40                pontiac
Name: short_name, dtype: object

In [None]:
# Hermia
# Alternative solution: capture everything before " highschool".
# Runs on the frame/column built above (`highschool_df["cleaned_name"]`);
# the names `hs_ex` / `hs_name_cleaned` are not defined anywhere in this notebook.
match_pattern = r"(.*)\s(highschool)(.*)"

matched_school = [re.match(match_pattern, school) for school in highschool_df["cleaned_name"]]
# re.match returns None for names without "highschool" (e.g. the "highland faci..." row),
# so guard before calling .group(1); unmatched names get an empty string
highschool_df["short_name"] = [m.group(1) if m is not None else "" for m in matched_school]
highschool_df["short_name"]