### The Anatomy Of A Dataframe
![Dataframe Anatomy](../images/dataframe-anatomy.png)

### pandas is a python library for working with dataframes
- get familiar with using the [API reference](https://pandas.pydata.org/pandas-docs/stable/reference/index.html#api), which gives information about the many objects, functions and methods for working with dataframes and series



In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

Matplotlib is building the font cache; this may take a moment.


### Let's explore these pandas methods, attributes, and accessors
 - read_csv( )
 - .shape
 - .head( )
 - .tail( )
 - .columns
 - .drop( )
 - .rename( )
 - .loc[]
 - .isin( )
 - .iloc[ ]
 - [[ ]]

### Read in the schools data, a CSV file, and examine the shape, head, and tail

In [3]:
schools = pd.read_csv('../data/schools_clean.csv')
schools.head(2)

Unnamed: 0,level,name,zipcode,grade_k,grade_1,grade_2,grade_3,grade_4,grade_5,grade_6,...,hisp,p_islander,white,male,female,econ_disadv,disabled,limited_eng,lat,lng
0,Elementary School,A. Z. Kelley Elementary,37013,153.0,145.0,149.0,180.0,184.0,,,...,206,1.0,212.0,431,421,261,75.0,298.0,36.021817,-86.658848
1,Elementary School,Alex Green Elementary,37189,42.0,50.0,44.0,38.0,24.0,,,...,29,1.0,21.0,115,119,153,21.0,25.0,36.252961,-86.832229


In [4]:
schools.tail(2)

Unnamed: 0,level,name,zipcode,grade_k,grade_1,grade_2,grade_3,grade_4,grade_5,grade_6,...,hisp,p_islander,white,male,female,econ_disadv,disabled,limited_eng,lat,lng
165,Middle School,William Henry Oliver Middle,37211,,,,,,231.0,271.0,...,158,3.0,437.0,487,498,252,112.0,231.0,36.020174,-86.712207
166,Middle School,Wright Middle,37211,,,,,,188.0,216.0,...,534,1.0,104.0,443,367,400,75.0,536.0,36.100109,-86.734133


In [5]:
schools.shape

(167, 29)

#### the `columns` attribute shows the column names for the DataFrame

In [6]:
schools.columns

Index(['level', 'name', 'zipcode', 'grade_k', 'grade_1', 'grade_2', 'grade_3',
       'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9',
       'grade_10', 'grade_11', 'grade_12', 'native_amer', 'asian', 'black',
       'hisp', 'p_islander', 'white', 'male', 'female', 'econ_disadv',
       'disabled', 'limited_eng', 'lat', 'lng'],
      dtype='object')

#### The `iloc[ ]` accessor gets the specified rows and columns by their _index_ values

In [7]:
first_five = schools.iloc[0:5, 0:2]

In [8]:
first_five

Unnamed: 0,level,name
0,Elementary School,A. Z. Kelley Elementary
1,Elementary School,Alex Green Elementary
2,Elementary School,Amqui Elementary
3,Elementary School,Andrew Jackson Elementary
4,High School,Antioch High School


#### The `loc[ ]` accessor gets the specified rows and columns by their _names_

In [9]:
middle_schools = schools.loc[schools['level'] == 'Middle School'].head()
middle_schools.shape

(5, 29)

In [10]:
middle_schools.head()

Unnamed: 0,level,name,zipcode,grade_k,grade_1,grade_2,grade_3,grade_4,grade_5,grade_6,...,hisp,p_islander,white,male,female,econ_disadv,disabled,limited_eng,lat,lng
5,Middle School,Antioch Middle,37013,,,,,,97.0,251.0,...,407,3.0,104.0,413,407,391,105.0,415.0,36.055379,-86.67183
6,Middle School,Apollo Middle,37013,,,,,,178.0,214.0,...,428,,137.0,421,414,340,97.0,417.0,36.066878,-86.66407
7,Middle School,Bellevue Middle,37221,,,,,,144.0,175.0,...,85,1.0,300.0,328,303,211,93.0,67.0,36.070749,-86.93482
21,Middle School,Creswell Middle School of the Arts,37218,,,,,,88.0,105.0,...,11,,30.0,138,224,163,41.0,4.0,36.196063,-86.837539
23,Middle School,Croft Middle,37211,,,,,,174.0,214.0,...,379,1.0,207.0,387,363,262,80.0,332.0,36.087475,-86.734148


In [11]:
econ_disadv_over_200 = schools.loc[schools.econ_disadv > 200]
econ_disadv_over_200.shape

(78, 29)

In [12]:
econ_disadv_over_200.head()

Unnamed: 0,level,name,zipcode,grade_k,grade_1,grade_2,grade_3,grade_4,grade_5,grade_6,...,hisp,p_islander,white,male,female,econ_disadv,disabled,limited_eng,lat,lng
0,Elementary School,A. Z. Kelley Elementary,37013,153.0,145.0,149.0,180.0,184.0,,,...,206,1.0,212.0,431,421,261,75.0,298.0,36.021817,-86.658848
2,Elementary School,Amqui Elementary,37115,84.0,93.0,77.0,87.0,84.0,,,...,209,1.0,57.0,223,235,246,51.0,143.0,36.273766,-86.703832
4,High School,Antioch High School,37013,,,,,,,,...,806,1.0,387.0,1026,889,622,185.0,634.0,36.046675,-86.599418
5,Middle School,Antioch Middle,37013,,,,,,97.0,251.0,...,407,3.0,104.0,413,407,391,105.0,415.0,36.055379,-86.67183
6,Middle School,Apollo Middle,37013,,,,,,178.0,214.0,...,428,,137.0,421,414,340,97.0,417.0,36.066878,-86.66407


#### Use the isin method to reference an external data structure

In [13]:
my_list = [37201, 37203]
downtown_schools = schools.loc[schools.zipcode.isin(my_list)]
downtown_schools.shape

(6, 29)

In [14]:
downtown_schools.head()

Unnamed: 0,level,name,zipcode,grade_k,grade_1,grade_2,grade_3,grade_4,grade_5,grade_6,...,hisp,p_islander,white,male,female,econ_disadv,disabled,limited_eng,lat,lng
14,Elementary School,Carter-Lawrence Elementary,37203,38.0,46.0,50.0,51.0,58.0,,,...,20,2.0,25.0,135,128,159,25.0,24.0,36.143653,-86.785853
37,Elementary School,Fall-Hamilton Elementary,37203,49.0,56.0,60.0,49.0,37.0,,,...,69,1.0,47.0,172,140,170,63.0,43.0,36.133472,-86.766206
55,Middle School,Head Middle,37203,,,,,,170.0,179.0,...,31,,133.0,286,265,124,31.0,43.0,36.158605,-86.804406
62,High School,Hume-Fogg High,37203,,,,,,,,...,68,,534.0,346,555,60,27.0,8.0,36.159525,-86.781536
98,High School,Martin Luther King Jr School,37203,,,,,,,,...,111,1.0,539.0,635,633,129,24.0,23.0,36.161676,-86.800474


In [15]:
school_and_type = schools[['name', 'level']]
school_and_type.head(2)

Unnamed: 0,name,level
0,A. Z. Kelley Elementary,Elementary School
1,Alex Green Elementary,Elementary School


#### Drop columns from a DataFrame with the `.drop( )` method; be sure to specify `columns = ` and pass a list of columns to the method

In [16]:
schools.columns

Index(['level', 'name', 'zipcode', 'grade_k', 'grade_1', 'grade_2', 'grade_3',
       'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9',
       'grade_10', 'grade_11', 'grade_12', 'native_amer', 'asian', 'black',
       'hisp', 'p_islander', 'white', 'male', 'female', 'econ_disadv',
       'disabled', 'limited_eng', 'lat', 'lng'],
      dtype='object')

In [17]:
school_and_gender_counts = schools.drop(columns = ['native_amer', 'asian', 'black', 'hisp', 'p_islander', 'white', 
                                              'econ_disadv', 'disabled', 'limited_eng', 'lat', 'lng'])

In [18]:
school_and_gender_counts.columns

Index(['level', 'name', 'zipcode', 'grade_k', 'grade_1', 'grade_2', 'grade_3',
       'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9',
       'grade_10', 'grade_11', 'grade_12', 'male', 'female'],
      dtype='object')

#### If the column list is short and you are feeling lazy, you can assign new column names (as a list _with every column in the right order_ ) to the columns attribute

In [19]:
school_and_type.columns

Index(['name', 'level'], dtype='object')

In [20]:
school_and_type.columns = ['school', 'type']
school_and_type.head()

Unnamed: 0,school,type
0,A. Z. Kelley Elementary,Elementary School
1,Alex Green Elementary,Elementary School
2,Amqui Elementary,Elementary School
3,Andrew Jackson Elementary,Elementary School
4,Antioch High School,High School


#### If you only want to change the name of a subset of columns, use the df.rename() function

In [21]:
school_and_gender_counts = school_and_gender_counts.rename(columns = {'level': 'type', 'name': 'school'})
school_and_gender_counts.head()

Unnamed: 0,type,school,zipcode,grade_k,grade_1,grade_2,grade_3,grade_4,grade_5,grade_6,grade_7,grade_8,grade_9,grade_10,grade_11,grade_12,male,female
0,Elementary School,A. Z. Kelley Elementary,37013,153.0,145.0,149.0,180.0,184.0,,,,,,,,,431,421
1,Elementary School,Alex Green Elementary,37189,42.0,50.0,44.0,38.0,24.0,,,,,,,,,115,119
2,Elementary School,Amqui Elementary,37115,84.0,93.0,77.0,87.0,84.0,,,,,,,,,223,235
3,Elementary School,Andrew Jackson Elementary,37138,95.0,93.0,85.0,88.0,103.0,,,,,,,,,258,238
4,High School,Antioch High School,37013,,,,,,,,,,499.0,486.0,482.0,448.0,1026,889


# End of Instruction

### Starting with the schools dataframe filter out just the High Schools and create a new datafrom called "high_schools".

In [22]:
high_schools = schools.loc[schools.level == 'High School']

In [23]:
high_schools.head()

Unnamed: 0,level,name,zipcode,grade_k,grade_1,grade_2,grade_3,grade_4,grade_5,grade_6,...,hisp,p_islander,white,male,female,econ_disadv,disabled,limited_eng,lat,lng
4,High School,Antioch High School,37013,,,,,,,,...,806,1.0,387.0,1026,889,622,185.0,634.0,36.046675,-86.599418
13,High School,Cane Ridge High School,37013,,,,,,,,...,655,3.0,313.0,980,878,635,190.0,573.0,36.030288,-86.62123
35,High School,East Nashville School,37206,,,,,,,,...,13,,24.0,308,393,253,55.0,3.0,36.180626,-86.750471
40,High School,Glencliff High School,37211,,,,,,,,...,742,,175.0,655,512,439,104.0,621.0,36.101401,-86.727469
59,High School,Hillsboro High,37215,,,,,,,,...,75,1.0,451.0,597,570,313,165.0,42.0,36.107066,-86.812229


### Now drop the columns named grade_k trhough grade_8 and overwrite you high_schools dataframe with the results

In [24]:
high_schools = high_schools.drop(columns = ['grade_k', 'grade_1', 'grade_2', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_8'])

In [25]:
high_schools.head()

Unnamed: 0,level,name,zipcode,grade_7,grade_9,grade_10,grade_11,grade_12,native_amer,asian,...,hisp,p_islander,white,male,female,econ_disadv,disabled,limited_eng,lat,lng
4,High School,Antioch High School,37013,,499.0,486.0,482.0,448.0,1.0,78.0,...,806,1.0,387.0,1026,889,622,185.0,634.0,36.046675,-86.599418
13,High School,Cane Ridge High School,37013,,474.0,449.0,513.0,422.0,2.0,133.0,...,655,3.0,313.0,980,878,635,190.0,573.0,36.030288,-86.62123
35,High School,East Nashville School,37206,19.0,153.0,164.0,167.0,175.0,1.0,5.0,...,13,,24.0,308,393,253,55.0,3.0,36.180626,-86.750471
40,High School,Glencliff High School,37211,,349.0,309.0,332.0,177.0,,46.0,...,742,,175.0,655,512,439,104.0,621.0,36.101401,-86.727469
59,High School,Hillsboro High,37215,,314.0,319.0,280.0,252.0,2.0,42.0,...,75,1.0,451.0,597,570,313,165.0,42.0,36.107066,-86.812229


### Create a list name my_zip_codes which contains the zipcodes 37203 and 37013.  Next, use the isin() method to filter the datafrome and save it to itself.

In [26]:
my_zip_codes = [37203, 37013]

In [27]:
high_schools = high_schools.loc[high_schools.zipcode.isin(my_zip_codes)]

### Rename the level column "high_school"

In [31]:
high_schools = high_schools.rename(columns = {'level': 'high_school'})
high_schools.head()

Unnamed: 0,high_school,name,zipcode,grade_7,grade_9,grade_10,grade_11,grade_12,native_amer,asian,...,hisp,p_islander,white,male,female,econ_disadv,disabled,limited_eng,lat,lng
4,High School,Antioch High School,37013,,499.0,486.0,482.0,448.0,1.0,78.0,...,806,1.0,387.0,1026,889,622,185.0,634.0,36.046675,-86.599418
13,High School,Cane Ridge High School,37013,,474.0,449.0,513.0,422.0,2.0,133.0,...,655,3.0,313.0,980,878,635,190.0,573.0,36.030288,-86.62123
62,High School,Hume-Fogg High,37203,,231.0,224.0,226.0,220.0,3.0,100.0,...,68,,534.0,346,555,60,27.0,8.0,36.159525,-86.781536
98,High School,Martin Luther King Jr School,37203,154.0,266.0,248.0,226.0,230.0,,125.0,...,111,1.0,539.0,635,633,129,24.0,23.0,36.161676,-86.800474


### Use the shape attribute to see how many rows and columns exist in the current dataframe.

In [32]:
high_schools.shape

(4, 21)