## Clean Schools.csv
1. Split address column
2. Look at length - this displays discrepancies in addresses (looking for lengths 1, 3, 4)
3. Create a city and state column - consistency with other data
4. Create columns for schools categories - pk, k, elementary, middle, and high school

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 10)



In [2]:
df = pd.read_csv('csv/schools.csv')

In [3]:
print(df.shape)
df.head()

(58782, 10)


Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District
0,0,Akron Early College High School,10/10,Top rated,"225 South Main Street, Akron, OH, 44308",Public district,9-12,384.0,34:1,Akron City School District
1,1,Revere Middle School,9/10,Above average,"3195 Spring Valley Road, Akron, OH, 44333",Public district,6-8,624.0,13:1,Revere Local School District
2,2,Arrowhead Primary Elementary School,8/10,Above average,"1600 Raleigh Boulevard, Akron, OH, 44321",Public district,K-4,345.0,20:1,Copley-Fairlawn City School District
3,3,Manchester Middle School,8/10,Above average,"760 West Nimisila Road, Akron, OH, 44319",Public district,5-8,387.0,16:1,Manchester Local School District
4,4,Nolley Elementary School,8/10,Above average,"6285 Renninger Rd, Akron, OH, 44319",Public district,K-4,483.0,17:1,Manchester Local School District


## 1. Split Address Column

In [4]:
df['City, State']  = df['Address'].str.split(',')

## 2. Create lengths to find discrepancies in 'Address' column


In [5]:
# Finding length because there are anomalies with the information in the address column
df['Length'] = df['City, State'].apply(lambda x: len(x) if x != None else 0 )
# 4 is the expected length 
df['Length'].unique()

array([4, 5, 6, 7, 1])

## Cleaning

In [6]:
df['Address'] = df['Address'].str.replace('2100 Morse Road, Suite 4609, Columbus, OH 43229, Columbus, OH, 43211', '2100 Morse Road, Suite 4609, Columbus, OH 43229')
df['Address'] = df['Address'].str.replace('2501 Syracuse Street, Denver, Colorado, 80238, Denver, CO, 80238', '2501 Syracuse Street, Denver, Colorado, 80238')
df['Address'] = df['Address'].str.replace('1018 Harding Street, Suite 112, Lafayette, La 70503, Lafayette, LA, 70507', '1018 Harding Street, Suite 112, Lafayette, LA 70503')
df['Address'] = df['Address'].str.replace('4450 West Eau Gallie Boulevard, Suite 180, Melbourne, FL 32934, Melbourne, FL, 32934', '4450 West Eau Gallie Boulevard, Suite 180, Melbourne, FL 32934')
df['Address'] = df['Address'].str.replace('4530 MacArthur Blvd, NW, Washington, DC, Washington, DC, 20007', '4530 MacArthur Blvd, NW, Washington, DC, 20007')
df['Address'] = df['Address'].str.replace('1075 New Scotland Road, Albany NY 12208, Albany, NY, 12208', '1075 New Scotland Road, Albany NY 12208')
df['Address'] = df['Address'].str.replace('216 Shelburne Road Asheville, NC 28806, Asheville, NC, 28806', '216 Shelburne Road, Asheville, NC 28806')
df['Address'] = df['Address'].str.replace('26450 RR 12 Dripping Springs, TX 78620, Austin, TX, 78736', '26450 RR 12, Dripping Springs, TX 78620')
df['Address'] = df['Address'].str.replace('NE Stoneridge Loop, Prineville OR 97754, Bend, OR, 97702', 'NE Stoneridge Loop, Prineville OR 97754')
df['Address'] = df['Address'].str.replace('5225 - Seventy seven Center Dr, Charlotte NC 28217, Charlotte, NC, 28217', '5225 77 Center Dr, Charlotte, NC 28217')
df['Address'] = df['Address'].str.replace('3375 W. 99th Street Cleveland, OH 44102, Cleveland, OH, 44111', '3375 W. 99th Street, Cleveland, OH 44102')
df['Address'] = df['Address'].str.replace('21 Broadmoor Avenue Colorado Springs, CO 80906, Colorado Springs, CO, 80906', '21 Broadmoor Avenue, Colorado Springs, CO 80906')
df['Address'] = df['Address'].str.replace('220 Stoneridge Drive Suite 403 Columbia, SC 29210 , Columbia, SC, 29210', '220 Stoneridge Drive, Suite 403, Columbia, SC 29210')
df['Address'] = df['Address'].str.replace('2247 South Ridgewood South Daytona, Florida 32119, Daytona Beach, FL, 32117', '2247 South Ridgewood, South Daytona, FL 32119')
df['Address'] = df['Address'].str.replace('7005 Woodbine Ave Sacramento, Ca. 95822, Fairfield, CA, 94534', '7005 Woodbine Ave, Sacramento, CA 95822')
df['Address'] = df['Address'].str.replace('4424 Innovation Drive Fort Collins, Colorado 80525, Fort Collins, CO, 80525', '4424 Innovation Drive, Fort Collins, CO 80525')
df['Address'] = df['Address'].str.replace('5300 El Camino Road Las Vegas, NV 89118, Henderson, NV, 89014', '5300 El Camino Road, Las Vegas, NV 89118')
df['Address'] = df['Address'].str.replace('9039 Beach Blvd Jacksonville, FL 32216, Jacksonville, FL, 32207', '9039 Beach Blvd, Jacksonville, FL 32216')
df['Address'] = df['Address'].str.replace('390 New Holland Pike, Lancaster PA 17601, Lancaster, PA, 17601', '390 New Holland Pike, Lancaster PA 17601')
df['Address'] = df['Address'].str.replace('4801. S. Sandhill Drive LV, NV 89121, Las Vegas, NV, 89123', '4801. S. Sandhill Drive, Las Vegas, NV, 89123')
df['Address'] = df['Address'].str.replace('2727 Stinson Blvd. NE Minneapolis, MN 55418, Minneapolis, MN, 55418', '2727 Stinson Blvd. NE, Minneapolis, MN, 55418')
df['Address'] = df['Address'].str.replace('3000 53rd St SW Naples, FL 34116, Naples, FL, 34116', '3000 53rd St SW, Naples, FL 34116')
df['Address'] = df['Address'].str.replace('177 W Klein Rd. New Braunfels, TX 78130, New Braunfels, TX, 78130', '177 W Klein Rd., New Braunfels, TX 78130')
df['Address'] = df['Address'].str.replace('500 Soraparu St. New Orleans, La 70130, New Orleans, LA, 70130', '500 Soraparu St., New Orleans, LA 70130')
df['Address'] = df['Address'].str.replace('2162 Mountain Blvd, Oakland CA 94611, Oakland, CA, 94605', '2162 Mountain Blvd, Oakland CA 94611')
df['Address'] = df['Address'].str.replace('13231 N. 22nd St. Phoenix, AZ 85022, Phoenix, AZ, 85028', '13231 N. 22nd St., Phoenix, AZ 85022')
df['Address'] = df['Address'].str.replace('14124 SE Mill St, Portland OR 97233, Portland, OR, 97266', '14124 SE Mill St, Portland OR 97233')
df['Address'] = df['Address'].str.replace('555 Double Eagle Ct. Suite 2000 Reno, NV 89521 , Reno, NV, 89521', '555 Double Eagle Ct., Suite 2000, Reno, NV 89521')
df['Address'] = df['Address'].str.replace('3422 Rustin Ave Riverside, CA 92507, Riverside, CA, 92504', '3422 Rustin Ave Riverside, CA 92507')
df['Address'] = df['Address'].str.replace('2800 19th Stree NW Rochester, MN 55901, Rochester, MN, 55902', '2800 19th Stree NW, Rochester, MN 55901')
df['Address'] = df['Address'].str.replace('9510 Carmel Mountain Road, San Diego CA 92129, San Diego, CA, 92129', '9510 Carmel Mountain Road, San Diego CA 92129')
df['Address'] = df['Address'].str.replace('5531 Cancha De Golf Ste 202 Rancho Santa Fe, CA 92091, San Diego, CA, 92130', '5531 Cancha de Golf, Ste 202, Rancho, Santa Fe, CA 92091')
df['Address'] = df['Address'].str.replace('6540 Flanders Drive. San Diego, CA 92121, San Diego, CA, 92127', '6540 Flanders Drive., San Diego, CA 92121')
df['Address'] = df['Address'].str.replace('725 Washington St. Santa Clara, Ca 95050, Santa Clara, CA, 95050', '725 Washington St., Santa Clara, CA, 95050')
df['Address'] = df['Address'].str.replace('6715 S Boe Lane Sioux Falls, SD 57108, Sioux Falls, SD, 57105', '6715 S Boe Lane, Sioux Falls, SD 57108')
df['Address'] = df['Address'].str.replace('8740 Asheville Hwy Spartanburg, S.C. 29316, Spartanburg, SC, 29316', '8740 Asheville Hwy, Spartanburg, SC 29316')
df['Address'] = df['Address'].str.replace('12611 N. Wilson St. Mead, WA 99021, Spokane, WA, 99218', '12611 N. Wilson St., Mead, WA 99021')
df['Address'] = df['Address'].str.replace('1450 Newfield Avenue Stamford, CT 06905, Stamford, CT, 06905', '1450 Newfield Avenue, Stamford, CT 06905')
df['Address'] = df['Address'].str.replace('2525 Gold Brook Dr Stockton, CA 95212, Stockton, CA, 95212', '2525 Gold Brook Dr, Stockton, CA 95212')
df['Address'] = df['Address'].str.replace('1112 North G Street | Tacoma, WA 98403, Tacoma, WA, 98403', '1112 North G Street, Tacoma, WA 98403')
df['Address'] = df['Address'].str.replace('1250 Erbes Rd. Thousand Oaks, CA 91362, Thousand Oaks, CA, 91302', '1250 Erbes Rd., Thousand Oaks, CA 91362')
df['Address'] = df['Address'].str.replace('3201 N. Eastman Rd. Longview, TX 75605, Tyler, TX, 75799', '3201 N. Eastman Rd., Longview, TX 75605')
df['Address'] = df['Address'].str.replace('St. Catherine of Siena School, 3460 Tennessee Street, Vallejo, CA, 94591', '3460 Tennessee Street, Vallejo, CA, 94591')
df['Address'] = df['Address'].str.replace('1650 Godfrey Ave. Wyoming,Mi 49509 , Wyoming, MI, 49509', '1650 Godfrey Ave., Wyoming, MI 49509' )
df['Address'] = df['Address'].str.replace('3422 Rustin Ave Riverside, CA 92507', '3422 Rustin Ave, Riverside, CA 92507')
df['Address'] = df['Address'].str.replace('1018 Harding Street, Suite 112, Lafayette, La 70503, Lafayette, LA 70507', '1018 Harding Street, Suite 112, Lafayette, LA 70503') # 27205
df['Address'] = df['Address'].str.replace('San Martin De Porres Clinic: Kallumadanda Vinnie MD Mission, TX 78572', 'San Martin De Porres Clinic: Kallumadanda Vinnie MD, Mission, TX 78572') # 33396
df['Address'] = df['Address'].str.replace('Rockwood Plastic Surgery Center: Gardner Glenn P MD Spokane, WA 99204', 'Rockwood Plastic Surgery Center: Gardner Glenn P MD, Spokane, WA 99204' ) # 50841

In [7]:
df['Address'] = df['Address'].str.replace('  Cleveland, OH 44102, Cleveland, OH, 44111', ', Cleveland, OH 44102') # 10584
df['Address'] = df['Address'].str.replace('  Columbia, SC 29210 , Columbia, SC, 29210', ', Columbia, SC 29210 ') # 11668
df['Address'] = df['Address'].str.replace('  Las Vegas, NV 89118, Henderson, NV, 89014', ', Las Vegas, NV 89118' ) # 23334
df['Address'] = df['Address'].str.replace('  Naples, FL 34116, Naples, FL, 34116', ', Naples, FL 34116') # 34536
df['Address'] = df['Address'].str.replace('  Reno, NV 89521 , Reno, NV, 89521', ', Reno, NV 89521') # 42400
df['Address'] = df['Address'].str.replace('  Spartanburg, S.C. 29316, Spartanburg, SC, 29316', ', Spartanburg, S.C. 29316')
df['Address'] = df['Address'].str.replace('Lafayette, La 70503, Lafayette, LA 70507', 'Lafayette, LA 70503') # 27205

In [8]:
df['Address'] = df['Address'].str.replace('Washington, DC, Washington, DC,', 'Washington, DC ')
df['Address'] = df['Address'].str.replace('New Orleans, LA, New Orleans, LA,', 'New Orleans, LA ')
df['Address'] = df['Address'].str.replace('Albuquerque, NM, Albuquerque, NM,', 'Albuquerque, NM ' )
df['Address'] = df['Address'].str.replace('Chelsea, MA, Boston, MA,', 'Chelsea, MA ' )
df['Address'] = df['Address'].str.replace('Franklin, TN, Franklin, TN,', 'Franklin, TN ')
df['Address'] = df['Address'].str.replace('Hales Corners, WI, Milwaukee, WI', 'Hales Corners, WI') # 50525
df['Address'] = df['Address'].str.replace('Albany NY', 'Albany, NY' )
df['Address'] = df['Address'].str.replace('Prineville OR', 'Prineville, OR')
df['Address'] = df['Address'].str.replace('Lancaster PA', 'Lancaster, PA')
df['Address'] = df['Address'].str.replace('Portland OR', 'Portland, OR')
df['Address'] = df['Address'].str.replace('San Diego CA', 'San Diego, CA')

In [9]:
df['Address'] = df['Address'].str.replace('AL,', 'AL')
df['Address'] = df['Address'].str.replace('AK,', 'AK')
df['Address'] = df['Address'].str.replace('AR,', 'AR')
df['Address'] = df['Address'].str.replace('AZ,', 'AZ')
df['Address'] = df['Address'].str.replace('CA,', 'CA')
df['Address'] = df['Address'].str.replace('CO,', 'CO')
df['Address'] = df['Address'].str.replace('CT,', 'CT')
df['Address'] = df['Address'].str.replace('DE,', 'DE')
df['Address'] = df['Address'].str.replace('DC,', 'DC')
df['Address'] = df['Address'].str.replace('FL,', 'FL')
df['Address'] = df['Address'].str.replace('GA,', 'GA')
df['Address'] = df['Address'].str.replace('HI,', 'HI')
df['Address'] = df['Address'].str.replace('IA,', 'IA')
df['Address'] = df['Address'].str.replace('ID,', 'ID')
df['Address'] = df['Address'].str.replace('IL,', 'IL')
df['Address'] = df['Address'].str.replace('IN,', 'IN')
df['Address'] = df['Address'].str.replace('KS,', 'KS')
df['Address'] = df['Address'].str.replace('KY,', 'KY')
df['Address'] = df['Address'].str.replace('LA,', 'LA')
df['Address'] = df['Address'].str.replace('MA,', 'MA')
df['Address'] = df['Address'].str.replace('MD,', 'MD')
df['Address'] = df['Address'].str.replace('ME,', 'ME')
df['Address'] = df['Address'].str.replace('MI,', 'MI')
df['Address'] = df['Address'].str.replace('MN,', 'MN')
df['Address'] = df['Address'].str.replace('MO,', 'MO')
df['Address'] = df['Address'].str.replace('MS,', 'MS')
df['Address'] = df['Address'].str.replace('MT,', 'MT')
df['Address'] = df['Address'].str.replace('NC,', 'NC')
df['Address'] = df['Address'].str.replace('ND,', 'ND')
df['Address'] = df['Address'].str.replace('NH,', 'NH')
df['Address'] = df['Address'].str.replace('NJ,', 'NJ')
df['Address'] = df['Address'].str.replace('NM,', 'NM')
df['Address'] = df['Address'].str.replace('NV,', 'NV')
df['Address'] = df['Address'].str.replace('NY,', 'NY')
df['Address'] = df['Address'].str.replace('OH,', 'OH')
df['Address'] = df['Address'].str.replace('OK,', 'OK')
df['Address'] = df['Address'].str.replace('OR,', 'OR')
df['Address'] = df['Address'].str.replace('PA,', 'PA')
df['Address'] = df['Address'].str.replace('RI,', 'RI')
df['Address'] = df['Address'].str.replace('SC,', 'SC')
df['Address'] = df['Address'].str.replace('SD,', 'SD')
df['Address'] = df['Address'].str.replace('TN,', 'TN')
df['Address'] = df['Address'].str.replace('TX,', 'TX')
df['Address'] = df['Address'].str.replace('UT,', 'UT')
df['Address'] = df['Address'].str.replace('VA,', 'VA')
df['Address'] = df['Address'].str.replace('VT,', 'VT')
df['Address'] = df['Address'].str.replace('WA,', 'WA')
df['Address'] = df['Address'].str.replace('WI,', 'WI')
df['Address'] = df['Address'].str.replace('WV,', 'WV')
df['Address'] = df['Address'].str.replace('WY,', 'WY')

## Check - Recalculate Length

In [10]:
df = df.drop(columns = ['City, State', 'Length'])

In [11]:
df['City, State']  = df['Address'].str.split(',')

In [12]:
# Checking string lengths after cleaning
df['Length'] = df['City, State'].apply(lambda x: len(x) if x != None else 0 )
# 1, 3, 4 are the expected length 
df['Length'].unique()

array([3, 5, 4, 1, 6, 2])

In [13]:
df.loc[df['Length'] == 2]

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,"City, State",Length
30412,30412,Pine Tree Middle School,3/10,Below average,"600 PT PKWY Longview, TX 75604",Public district,5-6,725.0,14:1,Pine Tree Independent School District,"[600 PT PKWY Longview, TX 75604]",2
33396,33396,Building Blocks Child Care Center,0/10,Currently unrated,"San Martin De Porres Clinic: Kallumadanda Vinnie MD Mission, TX 78572",Private,PK,,,,"[San Martin De Porres Clinic: Kallumadanda Vinnie MD Mission, TX 78572]",2
50570,50570,Montessori West Christian School,0/10,Currently unrated,"8800 WARREN H. ABERNATHY HWY Spartanburg, SC 29301",Private,PK-K,68.0,,,"[8800 WARREN H. ABERNATHY HWY Spartanburg, SC 29301]",2
50841,50841,Happy Kids Day Care,0/10,Currently unrated,"Rockwood Plastic Surgery Center: Gardner Glenn P MD Spokane, WA 99204",Private,PK,,,,"[Rockwood Plastic Surgery Center: Gardner Glenn P MD Spokane, WA 99204]",2


In [14]:
df['Address'] = df['Address'].str.replace('600 PT PKWY Longview, TX 75604', '600 PT PKWY, Longview, TX 75604')
df['Address'] = df['Address'].str.replace('San Martin De Porres Clinic: Kallumadanda Vinnie MD Mission, TX 78572', 'San Martin De Porres Clinic: Kallumadanda Vinnie MD, Mission, TX 78572')
df['Address'] = df['Address'].str.replace('8800 WARREN H. ABERNATHY HWY Spartanburg, SC 29301', '8800 WARREN H. ABERNATHY HWY, Spartanburg, SC 29301')
df['Address'] = df['Address'].str.replace('Rockwood Plastic Surgery Center: Gardner Glenn P MD Spokane, WA 99204', 'Rockwood Plastic Surgery Center: Gardner Glenn P MD, Spokane, WA 99204')

## Check 

In [15]:
df = df.drop(columns = ['City, State', 'Length'])

In [16]:
df['City, State']  = df['Address'].str.split(',')

In [17]:
# Checking string lengths after cleaning
df['Length'] = df['City, State'].apply(lambda x: len(x) if x != None else 0 )
# 1, 3, 4 are the expected length 
df['Length'].unique()

array([3, 5, 4, 1, 6])

## Create new dataframe with rows with lengths 5, 6, and a specific row

### LENGTH 5

In [18]:
length_5 = df.loc[df['Length'] == 5]

In [19]:
length_5['City, State'] = length_5['City, State'].str[0:3]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  length_5['City, State'] = length_5['City, State'].str[0:3]


In [20]:
length_5['Address'] = length_5['City, State'].str.join(',')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  length_5['Address'] = length_5['City, State'].str.join(',')


In [21]:
# Clean specific addr
length_5['Address'] = length_5['Address'].str.replace('2950 East 29th Street, Long Beach, CA', '2950 E 29th St, Long Beach, CA 90806')	
length_5['Address'] = length_5['Address'].str.replace('2585 Business Park Drive, Vista, 92081', '2585 Business Park Dr, Vista, CA 92081')
length_5['Address'] = length_5['Address'].str.replace('401 E Arrowood Rd, Charlotte, Nc', '401 E Arrowood Rd, Charlotte, NC 28217')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  length_5['Address'] = length_5['Address'].str.replace('2950 East 29th Street, Long Beach, CA', '2950 E 29th St, Long Beach, CA 90806')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  length_5['Address'] = length_5['Address'].str.replace('2585 Business Park Drive, Vista, 92081', '2585 Business Park Dr, Vista, CA 92081')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-do

In [22]:
length_5 = length_5.drop(columns = ['City, State', 'Length'])

In [23]:
length_5['City, State']  = length_5['Address'].str.split(',')

In [24]:
# Checking string lengths after cleaning
length_5['Length'] = length_5['City, State'].apply(lambda x: len(x) if x != None else 0 )
# 1, 3, 4 are the expected length 
length_5['Length'].unique()

array([3])

In [25]:
print(length_5.shape)
length_5.head()

(79, 12)


Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,"City, State",Length
553,553,Mountain Mahogany Community School,6/10,Average,"5014 4th Street NW, Albuquerque, NM 87107",Public charter,K-8,187.0,17:1,Albuquerque Public Schools,"[5014 4th Street NW, Albuquerque, NM 87107]",3
1006,1006,Acton Academy Albuquerque,0/10,Currently unrated,"3100 Menaul Blvd NE, Albuquerque, NM 87107",Private,1-6,,,,"[3100 Menaul Blvd NE, Albuquerque, NM 87107]",3
2096,2096,Anderson Christian School,0/10,Currently unrated,"5401 S. Madison Ave, Anderson, IN 46013",Private,"PK-9, 11-12",56.0,,,"[5401 S. Madison Ave, Anderson, IN 46013]",3
2148,2148,Go Like The Wind Montessori School,0/10,Currently unrated,"8845 Main St., Whitmore Lake, MI 48189",Private,PK-9,131.0,,,"[8845 Main St., Whitmore Lake, MI 48189]",3
2382,2382,Newman International Academy Of Ar,6/10,Average,"2011 S Fielder Rd, Arlington, TX 76013",Public charter,PK-12,870.0,13:1,Newman International Academy Of Arlington,"[2011 S Fielder Rd, Arlington, TX 76013 ]",3


### LENGTH 6

In [26]:
length_6 = df.loc[df['Length'] == 6]

In [27]:
length_6['City, State'] = length_6['City, State'].str[0:4]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  length_6['City, State'] = length_6['City, State'].str[0:4]


In [28]:
length_6['Address'] = length_6['City, State'].str.join(',')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  length_6['Address'] = length_6['City, State'].str.join(',')


In [29]:
length_6 = length_6.drop(columns = ['City, State', 'Length'])

In [30]:
length_6['City, State']  = length_6['Address'].str.split(',')

In [31]:
# Checking string lengths after cleaning
length_6['Length'] = length_6['City, State'].apply(lambda x: len(x) if x != None else 0 )
# 1, 3, 4 are the expected length 
length_6['Length'].unique()

array([4])

In [32]:
print(length_6.shape)
length_6.head()

(1, 12)


Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,"City, State",Length
27205,27205,Upper Lafayette Academy-Louisiana Education Corporation,0/10,Currently unrated,"1018 Harding Street, Suite 112, Lafayette, La 70503",Private,11,,,,"[1018 Harding Street, Suite 112, Lafayette, La 70503]",4


### Specific rows

In [33]:
specific = df.loc[df['Unnamed: 0'] == 52126]

In [34]:
specific['Address'] = specific['Address'].str[:-38]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specific['Address'] = specific['Address'].str[:-38]


In [35]:
specific.head()

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,"City, State",Length
52126,52126,St. Patrick Catholic School,0/10,Currently unrated,"1112 North G Street, Tacoma, WA 98403",Private,PK-8,445.0,,,"[1112 North G Street, Tacoma, WA 98403|1112 North G Street, Tacoma, WA 98403]",5


In [36]:
specific = specific.drop(columns = ['City, State', 'Length'])

In [37]:
specific['City, State']  = specific['Address'].str.split(',')

In [38]:
# Checking string lengths after cleaning
specific['Length'] = specific['City, State'].apply(lambda x: len(x) if x != None else 0 )
# 1, 3, 4 are the expected length 
specific['Length'].unique()

array([3])

In [39]:
print(specific.shape)
specific.head()

(1, 12)


Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,"City, State",Length
52126,52126,St. Patrick Catholic School,0/10,Currently unrated,"1112 North G Street, Tacoma, WA 98403",Private,PK-8,445.0,,,"[1112 North G Street, Tacoma, WA 98403]",3


## Merge

In [40]:
df = df.append(length_5)
df = df.append(length_6)
df = df.append(specific)

In [41]:
print(df.shape)
df[df.duplicated(subset=['Unnamed: 0', 'School', 'Score', 'Rating', 'Type', 'Grades', 'Total Students Enrolled', 'Students per teacher', 'District'], keep=False)]

(58863, 12)


Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,"City, State",Length
553,553,Mountain Mahogany Community School,6/10,Average,"5014 4th Street NW, Albuquerque, NM 87107, Albuquerque, NM 87107",Public charter,K-8,187.0,17:1,Albuquerque Public Schools,"[5014 4th Street NW, Albuquerque, NM 87107, Albuquerque, NM 87107]",5
1006,1006,Acton Academy Albuquerque,0/10,Currently unrated,"3100 Menaul Blvd NE, Albuquerque, NM 87107, Albuquerque, NM 87109",Private,1-6,,,,"[3100 Menaul Blvd NE, Albuquerque, NM 87107, Albuquerque, NM 87109]",5
2096,2096,Anderson Christian School,0/10,Currently unrated,"5401 S. Madison Ave, Anderson, IN 46013, Anderson, IN 46012",Private,"PK-9, 11-12",56.0,,,"[5401 S. Madison Ave, Anderson, IN 46013, Anderson, IN 46012]",5
2148,2148,Go Like The Wind Montessori School,0/10,Currently unrated,"8845 Main St., Whitmore Lake, MI 48189, Ann Arbor, MI 48105",Private,PK-9,131.0,,,"[8845 Main St., Whitmore Lake, MI 48189, Ann Arbor, MI 48105]",5
2382,2382,Newman International Academy Of Ar,6/10,Average,"2011 S Fielder Rd, Arlington, TX 76013 , Arlington, TX 76003",Public charter,PK-12,870.0,13:1,Newman International Academy Of Arlington,"[2011 S Fielder Rd, Arlington, TX 76013 , Arlington, TX 76003]",5
...,...,...,...,...,...,...,...,...,...,...,...,...
56786,56786,Georgetown Day School - Upper School,0/10,Currently unrated,"4200 Davenport Street NW, Washington, DC 20009",Private,9-12,,,,"[4200 Davenport Street NW, Washington, DC 20009]",3
57658,57658,Classical School of Wichita,0/10,Currently unrated,"6355 Willowbrook St., Wichita, Ks 67208",Private,K-12 & Ungraded,,,,"[6355 Willowbrook St., Wichita, Ks 67208]",3
58140,58140,Wilmington Academy Of Arts & Sci,0/10,Currently unrated,"6201 Myrtle Grove Road, Wilmington, NC 28412",Private,4-8,70.0,,,"[6201 Myrtle Grove Road, Wilmington, NC 28412]",3
27205,27205,Upper Lafayette Academy-Louisiana Education Corporation,0/10,Currently unrated,"1018 Harding Street, Suite 112, Lafayette, La 70503",Private,11,,,,"[1018 Harding Street, Suite 112, Lafayette, La 70503]",4


In [42]:
df = df.drop_duplicates(subset=['Unnamed: 0', 'School', 'Score', 'Rating', 'Type', 'Grades', 'Total Students Enrolled', 'Students per teacher', 'District' ], keep='last')
print(df.shape)

(58782, 12)


## Check Length Again

In [43]:
df = df.drop(columns = ['City, State', 'Length'])

In [44]:
df['City, State']  = df['Address'].str.split(',')

In [45]:
# Checking string lengths after cleaning
df['Length'] = df['City, State'].apply(lambda x: len(x) if x != None else 0 )
# 1, 3, 4 are the expected length 
df['Length'].unique()

array([3, 4, 1])

Managed to clean most of the addresses

## 3. Create City, State column

In [46]:
df['City'] = df['City, State'].str[-2]

In [47]:
df['State'] = df['City, State'].str[-1]
df['State'] = df['State'].str[:-6]

In [48]:
df['State'] = df['State'].str.strip()
df['City'] = df['City'].str.strip()
df['Address'] = df['Address'].str.strip()

## Clean up empty state column

In [49]:
df.loc[df['State'] == '']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,"City, State",Length,City,State
15188,15188,Primrose School At Stapleton,0/10,Currently unrated,"2501 Syracuse Street, Denver, Colorado, 80238",Private,PK-K,73.0,,,"[2501 Syracuse Street, Denver, Colorado, 80238]",4,Colorado,
29323,29323,Adams Elementary School,8/10,Above average,"7401 Jacobs Creek Drive, Lincoln, NE, 68512",Public district,PK-5,827.0,18:1,Lincoln Public Schools,"[7401 Jacobs Creek Drive, Lincoln, NE, 68512]",4,NE,
29324,29324,Kloefkorn Elementary School,8/10,Above average,"6601 Glass Ridge Drive, Lincoln, NE, 68526",Public district,PK-5,497.0,17:1,Lincoln Public Schools,"[6601 Glass Ridge Drive, Lincoln, NE, 68526]",4,NE,
29325,29325,Cavett Elementary School,8/10,Above average,"7701 South 36th Street, Lincoln, NE, 68516",Public district,PK-5,692.0,16:1,Lincoln Public Schools,"[7701 South 36th Street, Lincoln, NE, 68516]",4,NE,
29326,29326,Lux Middle School,8/10,Above average,"7800 High Street, Lincoln, NE, 68506",Public district,6-8,917.0,16:1,Lincoln Public Schools,"[7800 High Street, Lincoln, NE, 68506]",4,NE,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48710,48710,Jackson's Child Care Center,0/10,Currently unrated,"305 West 15th Street, Scottsbluff, NE, 69361",Private,PK,,,,"[305 West 15th Street, Scottsbluff, NE, 69361]",4,NE,
48711,48711,Reach For The Stars Ps & Dc,0/10,Currently unrated,"518 Broadway, Scottsbluff, NE, 69361",Private,PK,,,,"[518 Broadway, Scottsbluff, NE, 69361]",4,NE,
48712,48712,Lil Angels Playhouse Child Care Center,0/10,Currently unrated,"1927 Broadway, Scottsbluff, NE, 69361",Private,PK,,,,"[1927 Broadway, Scottsbluff, NE, 69361]",4,NE,
48713,48713,Western Nebrask Child Development Center (Wncdc),0/10,Currently unrated,"3118 17th Avenue, Scottsbluff, NE, 69361",Public district,PK,,,Educational Service Unit 13,"[3118 17th Avenue, Scottsbluff, NE, 69361]",4,NE,


In [50]:
df['Address'] = df['Address'].str.replace('Colorado,', 'CO')
df['Address'] = df['Address'].str.replace('Lincoln, NE,', 'Lincoln, NE ')
df['Address'] = df['Address'].str.replace('Scottsbluff, NE, ', 'Scottsbluff, NE ')
df['Address'] = df['Address'].str.replace('5531 Cancha de Golf, Ste 202, Rancho', '5531 Cancha de Golf Ste 202, Rancho Santa Fe, CA 92091')

In [51]:
df = df.drop(columns = ['City, State',	'Length'] )
df['City, State']  = df['Address'].str.split(',')

In [52]:
# Checking string lengths after cleaning
df['Length'] = df['City, State'].apply(lambda x: len(x) if x != None else 0 )
# 1, 3, 4 are the expected length 
df['Length'].unique()

array([3, 4, 1, 2])

In [53]:
print(df['State'].nunique())
df['State'].unique()

59


array(['OH', 'GA', 'NY', 'OR', 'NM', 'LA', 'PA', 'TX', 'WA', 'CA', 'AK',
       'IN', 'MI', 'WI', 'NC', 'KY', 'NJ', 'ME', 'CO', 'MD', 'MT', 'MS',
       'ND', 'VA', 'IL', 'MA', 'FL', 'CT', 'TN', 'SD', 'VT', 'WY', 'IA',
       'SC', 'UT', 'MO', 'AR', '', 'AZ', 'DE', 'MN', 'NV', 'OK', 'HI',
       'ID', 'Unava', 'NH', 'WV', 'S.C.', 'KS', 'DC', 'Missouri',
       'New Jersey', 'Nevada', 'Oregon', 'Washington', 'OK.', 'Ks', 'La'],
      dtype=object)

In [54]:
df.loc[df['State'] == 'Unava']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
24558,24558,Ms School For The Deaf Sped,0/10,Currently unrated,Unavailable,Public district,Ungraded,,,Ms School For The Deaf,,Unava,[Unavailable],1
25987,25987,Se AK Friends of Montessori,0/10,Currently unrated,Unavailable,Private,PK-K,54.0,,,,Unava,[Unavailable],1
29560,29560,Kathleen O'Neel Day Care,0/10,Currently unrated,Unavailable,Private,PK,,,,,Unava,[Unavailable],1


In [55]:
df.loc[df['State'] == 'S.C.']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
50525,50525,Spartanburg Christian Academy,0/10,Currently unrated,"8740 Asheville Hwy, Spartanburg, S.C. 29316",Private,PK-12,465.0,,,Spartanburg,S.C.,"[8740 Asheville Hwy, Spartanburg, S.C. 29316]",3


In [56]:
df.loc[df['State'] == 'Missouri']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
11531,11531,Heritage Academy,0/10,Currently unrated,"2900 Barberry Avenue, Columbia, Missouri 65202",Private,K-12,,,,Columbia,Missouri,"[2900 Barberry Avenue, Columbia, Missouri 65202]",3


In [57]:
df.loc[df['State'] == 'New Jersey']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
25851,25851,St Dominic Academy,0/10,Currently unrated,"2572 John F Kennedy Boulevard, Jersey City, New Jersey 07304",Private,9-12,255.0,,,Jersey City,New Jersey,"[2572 John F Kennedy Boulevard, Jersey City, New Jersey 07304]",3


In [58]:
df.loc[df['State'] == 'Nevada']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
28538,28538,Anderson Academy of Mathematics and Science,0/10,Currently unrated,"4656 N. Rancho Drive, Las Vegas, Nevada 89130",Private,1-8,,,,Las Vegas,Nevada,"[4656 N. Rancho Drive, Las Vegas, Nevada 89130]",3


In [59]:
df.loc[df['State'] == 'Oregon']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
41320,41320,Tabor Tots Pre-School,0/10,Currently unrated,"6415 SE Morrison street, Portland, Oregon 97215",Private,PK,,,,Portland,Oregon,"[6415 SE Morrison street, Portland, Oregon 97215]",3


In [60]:
df.loc[df['State'] == 'Washington']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
49183,49183,Giddens School,0/10,Currently unrated,"2120 21st Avenue South, Seattle, Washington 98144",Private,PK-5,176.0,,,Seattle,Washington,"[2120 21st Avenue South, Seattle, Washington 98144]",3


In [61]:
df.loc[df['State'] == 'OK.']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
54901,54901,Solid Foundation Preparatory Academy,0/10,Currently unrated,"4025 N. Hartford Ave., Tulsa, OK. 74106",Private,PK-5,46.0,,,Tulsa,OK.,"[4025 N. Hartford Ave., Tulsa, OK. 74106]",3


In [62]:
df.loc[df['State'] == 'Ks']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
57658,57658,Classical School of Wichita,0/10,Currently unrated,"6355 Willowbrook St., Wichita, Ks 67208",Private,K-12 & Ungraded,,,,Wichita,Ks,"[6355 Willowbrook St., Wichita, Ks 67208]",3


In [63]:
df.loc[df['State'] == 'La']

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
27205,27205,Upper Lafayette Academy-Louisiana Education Corporation,0/10,Currently unrated,"1018 Harding Street, Suite 112, Lafayette, La 70503",Private,11,,,,Lafayette,La,"[1018 Harding Street, Suite 112, Lafayette, La 70503]",4


In [64]:
df = df[df['State'] != 'Unava']

In [65]:
# Clean Addresses
df['Address'] = df['Address'].str.replace('8740 Asheville Hwy, Spartanburg, S.C. 29316', '8740 Asheville Hwy, Spartanburg, SC 29316')
df['Address'] = df['Address'].str.replace('2900 Barberry Avenue, Columbia, Missouri 65202', '2900 Barberry Avenue, Columbia, MO 65202')
df['Address'] = df['Address'].str.replace('2572 John F Kennedy Boulevard, Jersey City, New Jersey 07304', '2572 John F Kennedy Boulevard, Jersey City, NJ 07304')
df['Address'] = df['Address'].str.replace('4656 N. Rancho Drive, Las Vegas, Nevada 89130', '4656 N. Rancho Drive, Las Vegas, NV 89130')
df['Address'] = df['Address'].str.replace('6415 SE Morrison street, Portland, Oregon 97215', '6415 SE Morrison street, Portland, OR 97215')
df['Address'] = df['Address'].str.replace('2120 21st Avenue South, Seattle, Washington 98144', '2120 21st Avenue South, Seattle, WA 98144')
df['Address'] = df['Address'].str.replace('4025 N. Hartford Ave., Tulsa, OK. 74106', '4025 N. Hartford Ave., Tulsa, OK 74106')
df['Address'] = df['Address'].str.replace('6355 Willowbrook St., Wichita, Ks 67208', '6355 Willowbrook St., Wichita, KS 67208')
df['Address'] = df['Address'].str.replace('1018 Harding Street, Suite 112, Lafayette, La 70503', '1018 Harding Street, Suite 112, Lafayette, LA 70503')

# Clean State
df['State'] = df['State'].str.replace('S.C.', 'SC')
df['State'] = df['State'].str.replace('Missouri', 'MO')
df['State'] = df['State'].str.replace('New Jersey', 'NJ')
df['State'] = df['State'].str.replace('Nevada', 'NV')
df['State'] = df['State'].str.replace('Oregon', 'OR')
df['State'] = df['State'].str.replace('Washington', 'WA')
df['State'] = df['State'].str.replace('OK.', 'OK')
df['State'] = df['State'].str.replace('Ks', 'KS')
df['State'] = df['State'].str.replace('La', 'LA')

In [66]:
print(df['State'].nunique())
df['State'].unique()

49


array(['OH', 'GA', 'NY', 'OR', 'NM', 'LA', 'PA', 'TX', 'WA', 'CA', 'AK',
       'IN', 'MI', 'WI', 'NC', 'KY', 'NJ', 'ME', 'CO', 'MD', 'MT', 'MS',
       'ND', 'VA', 'IL', 'MA', 'FL', 'CT', 'TN', 'SD', 'VT', 'WY', 'IA',
       'SC', 'UT', 'MO', 'AR', '', 'AZ', 'DE', 'MN', 'NV', 'OK', 'HI',
       'ID', 'NH', 'WV', 'KS', 'DC'], dtype=object)

In [67]:
empty = df.loc[df['State'] == '']

In [68]:
empty['City'] = empty['City, State'].str[-2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty['City'] = empty['City, State'].str[-2]


In [69]:
empty['State'] = empty['City, State'].str[-1]
empty['State'] = empty['State'].str[:-6]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty['State'] = empty['City, State'].str[-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty['State'] = empty['State'].str[:-6]


In [70]:
empty.head()

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
15188,15188,Primrose School At Stapleton,0/10,Currently unrated,"2501 Syracuse Street, Denver, CO 80238",Private,PK-K,73.0,,,Denver,CO,"[2501 Syracuse Street, Denver, CO 80238]",3
29323,29323,Adams Elementary School,8/10,Above average,"7401 Jacobs Creek Drive, Lincoln, NE 68512",Public district,PK-5,827.0,18:1,Lincoln Public Schools,Lincoln,NE,"[7401 Jacobs Creek Drive, Lincoln, NE 68512]",3
29324,29324,Kloefkorn Elementary School,8/10,Above average,"6601 Glass Ridge Drive, Lincoln, NE 68526",Public district,PK-5,497.0,17:1,Lincoln Public Schools,Lincoln,NE,"[6601 Glass Ridge Drive, Lincoln, NE 68526]",3
29325,29325,Cavett Elementary School,8/10,Above average,"7701 South 36th Street, Lincoln, NE 68516",Public district,PK-5,692.0,16:1,Lincoln Public Schools,Lincoln,NE,"[7701 South 36th Street, Lincoln, NE 68516]",3
29326,29326,Lux Middle School,8/10,Above average,"7800 High Street, Lincoln, NE 68506",Public district,6-8,917.0,16:1,Lincoln Public Schools,Lincoln,NE,"[7800 High Street, Lincoln, NE 68506]",3


## Merge

In [71]:
df = df.append(empty)

In [72]:
df[df.duplicated(subset=['Unnamed: 0', 'School', 'Score', 'Rating', 'Type', 'Grades', 'Total Students Enrolled', 'Students per teacher', 'District'], keep=False)]

Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,"City, State",Length
15188,15188,Primrose School At Stapleton,0/10,Currently unrated,"2501 Syracuse Street, Denver, CO 80238",Private,PK-K,73.0,,,Colorado,,"[2501 Syracuse Street, Denver, CO 80238]",3
29323,29323,Adams Elementary School,8/10,Above average,"7401 Jacobs Creek Drive, Lincoln, NE 68512",Public district,PK-5,827.0,18:1,Lincoln Public Schools,NE,,"[7401 Jacobs Creek Drive, Lincoln, NE 68512]",3
29324,29324,Kloefkorn Elementary School,8/10,Above average,"6601 Glass Ridge Drive, Lincoln, NE 68526",Public district,PK-5,497.0,17:1,Lincoln Public Schools,NE,,"[6601 Glass Ridge Drive, Lincoln, NE 68526]",3
29325,29325,Cavett Elementary School,8/10,Above average,"7701 South 36th Street, Lincoln, NE 68516",Public district,PK-5,692.0,16:1,Lincoln Public Schools,NE,,"[7701 South 36th Street, Lincoln, NE 68516]",3
29326,29326,Lux Middle School,8/10,Above average,"7800 High Street, Lincoln, NE 68506",Public district,6-8,917.0,16:1,Lincoln Public Schools,NE,,"[7800 High Street, Lincoln, NE 68506]",3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48710,48710,Jackson's Child Care Center,0/10,Currently unrated,"305 West 15th Street, Scottsbluff, NE 69361",Private,PK,,,,Scottsbluff,NE,"[305 West 15th Street, Scottsbluff, NE 69361]",3
48711,48711,Reach For The Stars Ps & Dc,0/10,Currently unrated,"518 Broadway, Scottsbluff, NE 69361",Private,PK,,,,Scottsbluff,NE,"[518 Broadway, Scottsbluff, NE 69361]",3
48712,48712,Lil Angels Playhouse Child Care Center,0/10,Currently unrated,"1927 Broadway, Scottsbluff, NE 69361",Private,PK,,,,Scottsbluff,NE,"[1927 Broadway, Scottsbluff, NE 69361]",3
48713,48713,Western Nebrask Child Development Center (Wncdc),0/10,Currently unrated,"3118 17th Avenue, Scottsbluff, NE 69361",Public district,PK,,,Educational Service Unit 13,Scottsbluff,NE,"[3118 17th Avenue, Scottsbluff, NE 69361]",3


## Deleting duplicates

In [73]:
df = df.drop_duplicates(subset=['Unnamed: 0', 'School', 'Score', 'Rating', 'Type', 'Grades', 'Total Students Enrolled', 'Students per teacher', 'District' ], keep='last')

In [74]:
df = df.drop(columns = ['Unnamed: 0', 'City, State', 'Length'])

In [75]:
print(df.shape)
df.head()

(58779, 11)


Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
0,Akron Early College High School,10/10,Top rated,"225 South Main Street, Akron, OH 44308",Public district,9-12,384.0,34:1,Akron City School District,Akron,OH
1,Revere Middle School,9/10,Above average,"3195 Spring Valley Road, Akron, OH 44333",Public district,6-8,624.0,13:1,Revere Local School District,Akron,OH
2,Arrowhead Primary Elementary School,8/10,Above average,"1600 Raleigh Boulevard, Akron, OH 44321",Public district,K-4,345.0,20:1,Copley-Fairlawn City School District,Akron,OH
3,Manchester Middle School,8/10,Above average,"760 West Nimisila Road, Akron, OH 44319",Public district,5-8,387.0,16:1,Manchester Local School District,Akron,OH
4,Nolley Elementary School,8/10,Above average,"6285 Renninger Rd, Akron, OH 44319",Public district,K-4,483.0,17:1,Manchester Local School District,Akron,OH


## Cleaning State Information

In [76]:
print(df['State'].nunique())
df['State'].unique()

52


array(['OH', 'GA', 'NY', 'OR', 'NM', 'LA', 'PA', 'TX', 'WA', 'CA', 'AK',
       'IN', 'MI', 'WI', 'NC', 'KY', 'NJ', 'ME', 'CO', 'MD', 'MT', 'MS',
       'ND', 'VA', 'IL', 'MA', 'FL', 'CT', 'TN', 'SD', 'VT', 'WY', 'IA',
       'SC', 'UT', 'MO', 'AR', 'AZ', 'DE', 'MN', 'NV', 'OK', 'HI', 'ID',
       'NH', 'WV', 'KS', 'DC', ' CO', ' NE ', ' NE', ' CA'], dtype=object)

In [77]:
df['State'] = df['State'].str.strip()
df['City'] = df['City'].str.strip()

In [78]:
print(df['State'].nunique())
df['State'].unique()

49


array(['OH', 'GA', 'NY', 'OR', 'NM', 'LA', 'PA', 'TX', 'WA', 'CA', 'AK',
       'IN', 'MI', 'WI', 'NC', 'KY', 'NJ', 'ME', 'CO', 'MD', 'MT', 'MS',
       'ND', 'VA', 'IL', 'MA', 'FL', 'CT', 'TN', 'SD', 'VT', 'WY', 'IA',
       'SC', 'UT', 'MO', 'AR', 'AZ', 'DE', 'MN', 'NV', 'OK', 'HI', 'ID',
       'NH', 'WV', 'KS', 'DC', 'NE'], dtype=object)

## Cleaning City Information

In [79]:
print(df['City'].nunique())
df.City.unique()

405


array(['Akron', 'Albany', 'Albuquerque', 'Alexandria', 'Allentown',
       'Altoona', 'Amarillo', 'Anacortes', 'Anaheim', 'Anchorage',
       'Anderson', 'Ann Arbor', 'Appleton', 'Arlington', 'Arroyo Grande',
       'Asheville', 'Ashland', 'Athens', 'Atlantic City', 'Auburn',
       'Aurora', 'Austin', 'Dripping Springs', 'austin', 'Bakersfield',
       'Baltimore', 'Bangor', 'Baton Rouge', 'Bay City', 'Beaumont',
       'Bellevue', 'Bellingham', 'Beloit', 'Bend', 'Prineville',
       'Benton Harbor', 'Billings', 'Biloxi', 'Bismarck', 'Blacksburg',
       'Bloomington', 'Bossier City', 'Boston', 'Chelsea', 'Boulder',
       'Bowling Green', 'Bradenton', 'Bremerton', 'Bridgeport',
       'Bridgeton', 'Bristol', 'Brookings', 'Brownsville', 'Bryan',
       'Buffalo', 'Burlington', 'Cambridge', 'Canton', 'Cape Coral',
       'Carlsbad', 'Carmel', 'CARMEL', 'Casper', 'Cedar Rapids',
       'Chambersburg', 'Champaign', 'Chapel Hill', 'Charleston',
       'Charlotte', 'Charlottesville', 'Chey

In [80]:
df.loc[df['City'] == 'austin']

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
3799,Indigo childcare,0/10,Currently unrated,"4314 clarno dr, austin, TX 78749",Private,PK,,,,austin,TX


In [81]:
df.loc[df['City'] == 'CARMEL']

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
8429,S.H.A.N.D.A. Preparatory Academy,0/10,Currently unrated,"9764 Cedar Point Drive, CARMEL, IN 46032",Private,10-12 & Ungraded,,,,CARMEL,IN


In [82]:
df.loc[df['City'] == 'COLORADO SPRINGS']

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
11423,Calvary Preschool,0/10,Currently unrated,"4210 Austin Bluffs Pkwy, COLORADO SPRINGS, CO 80918",Private,PK,,,,COLORADO SPRINGS,CO


In [83]:
df.loc[df['City'] == 'GREENSBORO']

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
22095,Bryan School ( Charter Pending),0/10,Currently unrated,"324 S. Elm Street, GREENSBORO, NC 27401",Private,"K-7, 9-12",,,,GREENSBORO,NC


In [84]:
df.loc[df['City'] == 'SAN DIEGO']

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
46580,Global Math Art Science & Technology Academy,0/10,Currently unrated,"4395 Kamloop Ave, SAN DIEGO, CA 92117",Private,6-12,,,,SAN DIEGO,CA


In [85]:
df.loc[df['City'] == 'NW  Washington']

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
56607,The River School,0/10,Currently unrated,"4880 MacArthur Blvd., NW Washington, DC 20007",Private,PK-3,235.0,,,NW Washington,DC


In [86]:
df.loc[df['City'] == 'Cherry Hill/Baltimore']

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
4295,Bay-Brook Elementary School,2/10,Below average,"2501 Seabury Road, Cherry Hill/Baltimore, MD 21225",Public district,PK-8,455.0,16:1,Baltimore City Public Schools,Cherry Hill/Baltimore,MD


In [87]:
df.loc[df['City'] == 'Suite 117 Phoenix']

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
40342,Kaleidoscope Preschool,0/10,Currently unrated,"3434 W. Greenway Rd., Suite 117 Phoenix, AZ 85053",Private,PK,,,,Suite 117 Phoenix,AZ


In [88]:
df['City'] = df['City'].str.replace('austin', 'Austin')
df['City'] = df['City'].str.replace('CARMEL', 'Carmel')
df['City'] = df['City'].str.replace('COLORADO SPRINGS', 'Colorado Springs')
df['City'] = df['City'].str.replace('GREENSBORO', 'Greensboro')
df['City'] = df['City'].str.replace('SAN DIEGO', 'San Diego')
df['City'] = df['City'].str.replace('Cherry Hill/Baltimore', 'Cherry Hill')
df['City'] = df['City'].str.replace('Suite 117 Phoenix', ' Phoenix')

df['Address'] = df['Address'].str.replace('4314 clarno dr, austin, TX 78749', '4314 Clarno Dr, Austin, TX 78749')
df['Address'] = df['Address'].str.replace('CARMEL,', 'Carmel')
df['Address'] = df['Address'].str.replace('COLORADO SPRINGS,', 'Colorado Springs')
df['Address'] = df['Address'].str.replace('GREENSBORO', 'Greensboro')
df['Address'] = df['Address'].str.replace('SAN DIEGO', 'San Diego')
df['Address'] = df['Address'].str.replace('Cherry Hill/Baltimore', 'Cherry Hill')
df['Address'] = df['Address'].str.replace('Suite 117', 'Suite 117,')

## Fix NW Washington

In [89]:
washington =  df.loc[df['City'] == 'NW  Washington']

In [90]:
washington.head()

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
56607,The River School,0/10,Currently unrated,"4880 MacArthur Blvd., NW Washington, DC 20007",Private,PK-3,235.0,,,NW Washington,DC


In [91]:
washington['City'] = washington['City'].str[3:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  washington['City'] = washington['City'].str[3:]


In [92]:
washington['City'] = washington['City'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  washington['City'] = washington['City'].str.strip()


In [93]:
washington.head()

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
56607,The River School,0/10,Currently unrated,"4880 MacArthur Blvd., NW Washington, DC 20007",Private,PK-3,235.0,,,Washington,DC


In [94]:
df = df[df['City'] != 'NW  Washington']

In [95]:
df = df.append(washington)

In [96]:
df.shape

(58779, 11)

## Filling NaNs and trying to clean a specific row

In [97]:
df.isna().sum()

School                         0
Score                          0
Rating                         0
Address                        0
Type                           0
                           ...  
Total Students Enrolled    33754
Students per teacher       40918
District                   39386
City                           0
State                          0
Length: 11, dtype: int64

In [98]:
df[['Total Students Enrolled', 'Students per teacher']] = df[['Total Students Enrolled', 'Students per teacher']].fillna(0)

In [99]:
df['District'] = df['District'].fillna('Unavailable')

In [100]:
df.isna().sum()

School                     0
Score                      0
Rating                     0
Address                    0
Type                       0
                          ..
Total Students Enrolled    0
Students per teacher       0
District                   0
City                       0
State                      0
Length: 11, dtype: int64

## Cleaning Extra Spaces

In [101]:
df['School'] = df['School'].str.strip()
df['Score'] = df['Score'].str.strip()
df['Rating'] = df['Rating'].str.strip()
df['Address'] = df['Address'].str.strip()
df['Type'] = df['Type'].str.strip()
df['Grades'] = df['Grades'].str.strip()
df['Students per teacher'] = df['Students per teacher'].str.strip()
df['District'] = df['District'].str.strip()

In [102]:
df.to_csv('schools_cleaned.csv', index = False)

In [103]:
df.head()

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State
0,Akron Early College High School,10/10,Top rated,"225 South Main Street, Akron, OH 44308",Public district,9-12,384.0,34:1,Akron City School District,Akron,OH
1,Revere Middle School,9/10,Above average,"3195 Spring Valley Road, Akron, OH 44333",Public district,6-8,624.0,13:1,Revere Local School District,Akron,OH
2,Arrowhead Primary Elementary School,8/10,Above average,"1600 Raleigh Boulevard, Akron, OH 44321",Public district,K-4,345.0,20:1,Copley-Fairlawn City School District,Akron,OH
3,Manchester Middle School,8/10,Above average,"760 West Nimisila Road, Akron, OH 44319",Public district,5-8,387.0,16:1,Manchester Local School District,Akron,OH
4,Nolley Elementary School,8/10,Above average,"6285 Renninger Rd, Akron, OH 44319",Public district,K-4,483.0,17:1,Manchester Local School District,Akron,OH


### Separating into PK, K, Elementary, Middle, High School
- https://stackoverflow.com/questions/61877712/check-if-an-item-in-a-list-is-available-in-a-column-which-is-of-type-list

In [104]:
import pandas as pd

In [105]:
df = pd.read_csv('schools_cleaned.csv')

In [106]:
print(df.shape)

(58779, 11)


In [107]:
def parse_grades(grades_string):
    GRADES = ['PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'Ungraded']

    # Remove & for grades list
    grades_string = grades_string.replace(' &', ',')

    # Grades list - will add to separated grade string to grades
    grades = []

    # split strings based on ','
    string_list = grades_string.split(',')

    # look for sections of list with '-'
    dash = "-"
    for i in range(len(string_list)):
        clean_string = string_list[i].strip()
        if dash in clean_string:
            #  split using '-', loop and add to grades variable
            start_grade, end_grade = clean_string.split(dash)
            grades += GRADES[GRADES.index(start_grade) : GRADES.index(end_grade)+ 1]
        else:
            # add string to grades
            grades.append(clean_string)

    return grades

In [108]:
print(df['Grades'].nunique())
unique_grades_combination  = df['Grades'].unique()

302


In [109]:
def test_complete_dataset(unique_grades_combination):
    # create a loop that goes thru dataset and invoke parse_grades with each element
    separated_grades_list = []
    for i in unique_grades_combination:
        separated_grades_list.append(parse_grades(i))

    dictionary_grade_list = dict(zip(unique_grades_combination, separated_grades_list))

    return dictionary_grade_list

In [110]:
dictionary = test_complete_dataset(unique_grades_combination)

In [111]:
df['Clean_Grades'] = df['Grades'].map(dictionary)

In [112]:
high_school = ['9', '10', '11', '12']
middle_school = ['6', '7', '8']
elementary = ['K', '1', '2', '3', '4', '5']
pre_k = ['PK']

In [113]:
set1 = set(high_school)
df['High School (9-12)'] = [set1.issubset(set(row)) for row in df.Clean_Grades]

In [114]:
set2 = set(middle_school)
df['Middle School (6-8)'] = [set2.issubset(set(row)) for row in df.Clean_Grades]

In [115]:
set3 = set(elementary)
df['Elementary (K-5)'] = [set3.issubset(set(row)) for row in df.Clean_Grades]

In [116]:
set4 = set(pre_k)
df['Pre-Kindergarten (PK)'] = [set4.issubset(set(row)) for row in df.Clean_Grades]

In [117]:
df[['High School (9-12)', 'Middle School (6-8)', 'Elementary (K-5)', 'Pre-Kindergarten (PK)']] = df[['High School (9-12)', 'Middle School (6-8)', 'Elementary (K-5)', 'Pre-Kindergarten (PK)']] * 1

In [118]:
df['Grades'] = df['Grades'].str.replace(' & Ungraded', '')
df = df.drop(columns = ['Clean_Grades'])

In [119]:
df.head()

Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District,City,State,High School (9-12),Middle School (6-8),Elementary (K-5),Pre-Kindergarten (PK)
0,Akron Early College High School,10/10,Top rated,"225 South Main Street, Akron, OH 44308",Public district,9-12,384.0,34:1,Akron City School District,Akron,OH,1,0,0,0
1,Revere Middle School,9/10,Above average,"3195 Spring Valley Road, Akron, OH 44333",Public district,6-8,624.0,13:1,Revere Local School District,Akron,OH,0,1,0,0
2,Arrowhead Primary Elementary School,8/10,Above average,"1600 Raleigh Boulevard, Akron, OH 44321",Public district,K-4,345.0,20:1,Copley-Fairlawn City School District,Akron,OH,0,0,0,0
3,Manchester Middle School,8/10,Above average,"760 West Nimisila Road, Akron, OH 44319",Public district,5-8,387.0,16:1,Manchester Local School District,Akron,OH,0,1,0,0
4,Nolley Elementary School,8/10,Above average,"6285 Renninger Rd, Akron, OH 44319",Public district,K-4,483.0,17:1,Manchester Local School District,Akron,OH,0,0,0,0


In [120]:
df.to_csv('schools_cleaned.csv', index = False)

### Update School['Scores']
- change to int so data can be sorted by the value

In [1]:
import pandas as pd
school = pd.read_csv('schools_cleaned.csv')



In [2]:
school['Score'] = school['Score'].str.replace('/10', '')
school['Score'] = school['Score'].astype(int)

In [3]:
school.to_csv('schools_cleaned.csv', index = False)