## 1. Trimming Raw Data - Extract Los Angeles metropolitan data only

Note: Raw data is not uploaded to GitHub. Do not run this section without downloading the raw_data folder!

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Pandas display settings: allow up to 1000 rows in printed output and format
# floats with thousands separators and two decimals.
# The magic on the next line makes matplotlib render charts inside the notebook.
pd.options.display.max_rows = 1000
pd.set_option('display.float_format', '{:,.2f}'.format)
%matplotlib inline


In [3]:
# Detect the raw file's encoding before importing it: reading it as 'utf-8'
# raises a decode error.
import chardet

with open('raw_data/school_location_raw.txt', 'rb') as raw_file:
    raw_bytes = raw_file.read()

# chardet inspects the raw bytes and reports its best guess with a confidence score
result = chardet.detect(raw_bytes)
result

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [8]:
# Load the raw school location file (comma separated, ISO-8859-1 encoded)
school_raw = pd.read_csv(
    'raw_data/school_location_raw.txt',
    sep=',',
    encoding='ISO-8859-1',
)

# Quick look at the data: first rows, dimensions, and the counties it covers
print(
    school_raw.head(),
    school_raw.shape,
    school_raw['County Name'].describe(),
    school_raw['County Name'].unique(),
    sep='\n',
)

   County Code  District Code  School Code  Filler  Test Year  Type Id  \
0           37          68056       114686     nan       2019        7   
1           37          68056      6038111     nan       2019        7   
2           37          68056      6088983     nan       2019        7   
3           37          68056      6110696     nan       2019        7   
4           37          68056      6115620     nan       2019        7   

  County Name             District Name                 School Name Zip Code  
0   San Diego  Del Mar Union Elementary                   Ocean Air    92130  
1   San Diego  Del Mar Union Elementary  Del Mar Heights Elementary    92014  
2   San Diego  Del Mar Union Elementary    Del Mar Hills Elementary    92014  
3   San Diego  Del Mar Union Elementary   Carmel Del Mar Elementary    92130  
4   San Diego  Del Mar Union Elementary     Ashley Falls Elementary    92130  
(11389, 10)
count           11389
unique             59
top       Los Angeles
fre

In [10]:
# Trim the data to the Los Angeles metropolitan area:
# keep only the rows for Los Angeles County and Orange County
school_raw = school_raw[school_raw['County Name'].isin(['Los Angeles', 'Orange'])].copy()

# Inspect the trimmed frame: first rows, dimensions, and rows per county
print(
    school_raw.head(),
    school_raw.shape,
    school_raw['County Name'].value_counts(),
    sep='\n',
)

# Save the trimmed data as csv (without the row index)
school_raw.to_csv('data/2_school_location.csv', index=False)

      County Code  District Code  School Code  Filler  Test Year  Type Id  \
5435           19              0            0     nan       2019        5   
5436           19          10199            0     nan       2019        6   
5437           19          10199       100776     nan       2019        9   
5438           19          10199       109660     nan       2019        9   
5439           19          10199       109942     nan       2019        9   

      County Name                            District Name  \
5435  Los Angeles                                      NaN   
5436  Los Angeles   Los Angeles County Office of Education   
5437  Los Angeles  North Valley Military Institute College   
5438  Los Angeles        Aspire Antonio Maria Lugo Academy   
5439  Los Angeles   Los Angeles International Charter High   

                                            School Name Zip Code  
5435                                                NaN           
5436                          

In [11]:
# Sanity check on the trimmed frame: count rows per county code.
# Code 19 appears on the Los Angeles rows; 30 is presumably Orange County — confirm against the raw data.
print(school_raw['County Code'].value_counts())

19    2331
30     661
Name: County Code, dtype: int64


## 2. Process data - Public & Charter School only

The school types below are not considered standard public schools:
- Continuation Schools (Having "Continuation" in the school name)
- Alternative Schools (Having "Alternative" in the school name)
- Home & Hospital Instruction (Having "Home" in the school name)
- Juvenile Court Schools (Having "Juvenile" in the school name)
- Opportunity Education (Having "Opportunity" in the school name)
- Community Day Schools (Having "Community Day" in the school name)

In [44]:
# Function to check if a school is a normal public school
def normal_public_school(series):
    '''Flag which school names belong to normal public schools.

    A name is considered "normal" when it contains none of the special terms
    (compared case-insensitively): continuation, alternative, home, juvenile,
    (opportunity), community day, not school.

    Parameters
    ----------
    series : pandas.Series (or any iterable) of school names.
        Missing or non-string entries (NaN/None) are treated as not normal,
        matching the 'not school' sentinel term used for unnamed rows.

    Returns
    -------
    pandas.Series of bool
        True for a normal public school, False otherwise. When the input is a
        Series the result carries the same index, so it can be used directly
        as a boolean mask on the originating DataFrame.
    '''

    terms = ('continuation', 'alternative', 'home', 'juvenile', '(opportunity)',
             'community day', 'not school')

    def _is_normal(name):
        # Missing names cannot be classified, so they are flagged as non-normal
        if not isinstance(name, str):
            return False
        lowered = name.lower()
        return not any(term in lowered for term in terms)

    # Iterate over the values instead of looking up labels 0..n-1: label lookups
    # raise KeyError on a filtered frame whose index no longer starts at 0.
    flags = [_is_normal(name) for name in series]

    # Keep the caller's index when there is one; plain iterables get a default index
    return pd.Series(flags, index=getattr(series, 'index', None), dtype=bool)

# Quick check of normal_public_school: one ordinary name and one flagged name
school1, school2 = "Aspire Antonio Maria Lugo Academy", 'KIPP Academy (Opportunity)'
test_school = pd.Series([school1, school2])

# Show the container type, its length, and the first entry ...
print(type(test_school), test_school.size, test_school[0], sep='\n')
# ... then the flags computed for both names
print(normal_public_school(test_school))

<class 'pandas.core.series.Series'>
2
Aspire Antonio Maria Lugo Academy
0     True
1    False
dtype: bool


In [55]:
# Filter out non-standard public schools to reduce result biases.
# Rows with a missing school name get the 'Not School' sentinel so they are dropped as well.
# Assign the filled column back explicitly: a chained `fillna(..., inplace=True)` on a
# selected column does not reliably update the DataFrame (it has no effect under
# pandas Copy-on-Write).
school_raw['School Name'] = school_raw['School Name'].fillna('Not School')

# Build the regex from a list of terms. A backslash line continuation inside a
# string literal keeps the next line's indentation, which would silently glue
# a run of spaces in front of one alternative (so it never matches).
non_standard_pattern = '|'.join([
    'Continuation',
    'Alternative',
    'Home',
    'Juvenile',
    'Opportunity',
    'Community Day',
    'Not School',
])

# Keep only the rows whose school name matches none of the non-standard terms
# (case-sensitive substring match); copy so later edits do not touch school_raw
school = school_raw[~school_raw['School Name'].str.contains(non_standard_pattern)].copy()
print(school.head(10))
print(school.shape)

school.to_csv('data/8_public_school.csv', index=False)

      County Code  District Code  School Code  Filler  Test Year  Type Id  \
5437           19          10199       100776     nan       2019        9   
5438           19          10199       109660     nan       2019        9   
5439           19          10199       109942     nan       2019        9   
5440           19          10199       112128     nan       2019        9   
5441           19          10199       115030     nan       2019        9   
5442           19          10199       115212     nan       2019        9   
5443           19          10199       121772     nan       2019        9   
5444           19          10199       121822     nan       2019        7   
5447           19          10199       121905     nan       2019        7   
5448           19          10199       121921     nan       2019        7   

      County Name                             District Name  \
5437  Los Angeles   North Valley Military Institute College   
5438  Los Angeles         