# About

This module will run through the basics of creating columns and sorting data.

In [1]:
# import our tools
import pandas as pd
import numpy as np

In [2]:
# URL of the IPEDS college dataset (an Excel workbook on public.tableau.com)
ipeds_url = "https://public.tableau.com/s/sites/default/files/media/Resources/IPEDS_data.xlsx"

# read the workbook into a DataFrame, then print its size, dtypes, and memory use
ipeds = pd.read_excel(io=ipeds_url)
ipeds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1534 entries, 0 to 1533
Columns: 145 entries, ID number to Endowment assets (year end) per FTE enrollment (FASB)
dtypes: float64(116), int64(2), object(27)
memory usage: 1.7+ MB


# Creating Columns



In [3]:
# first, which columns do we actually have? (shown as a plain Python list)
ipeds.columns.tolist()

['ID number',
 'Name',
 'year',
 'ZIP code',
 'Highest degree offered',
 'County name',
 'Longitude location of institution',
 'Latitude location of institution',
 'Religious affiliation',
 'Offers Less than one year certificate',
 'Offers One but less than two years certificate',
 "Offers Associate's degree",
 'Offers Two but less than 4 years certificate',
 "Offers Bachelor's degree",
 'Offers Postbaccalaureate certificate',
 "Offers Master's degree",
 "Offers Post-master's certificate",
 "Offers Doctor's degree - research/scholarship",
 "Offers Doctor's degree - professional practice",
 "Offers Doctor's degree - other",
 'Offers Other degree',
 'Applicants total',
 'Admissions total',
 'Enrolled total',
 'Percent of freshmen submitting SAT scores',
 'Percent of freshmen submitting ACT scores',
 'SAT Critical Reading 25th percentile score',
 'SAT Critical Reading 75th percentile score',
 'SAT Math 25th percentile score',
 'SAT Math 75th percentile score',
 'SAT Writing 25th percentil

In [4]:
# let's make this a much smaller dataset: keep only the ID, the name, and the
# applicant / admission / enrollment counts
COLS_2_KEEP = ['ID number', 'Name', 'Applicants total', 'Admissions total', 'Enrolled total']
# .copy() makes ipeds2 an independent frame, so the modifications made to it later
# cannot affect `ipeds` or raise a SettingWithCopyWarning
ipeds2 = ipeds.loc[:, COLS_2_KEEP].copy()
ipeds2.head()

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total
0,100654,Alabama A & M University,6142.0,5521.0,1104.0
1,100663,University of Alabama at Birmingham,5689.0,4934.0,1773.0
2,100690,Amridge University,,,
3,100706,University of Alabama in Huntsville,2054.0,1656.0,651.0
4,100724,Alabama State University,10245.0,5251.0,1479.0


In [5]:
# that's much easier to read - how many missing values does each column have?
ipeds2.isna().sum(axis=0)

ID number             0
Name                  0
Applicants total    157
Admissions total    157
Enrolled total      157
dtype: int64

In [6]:
# remove every row that has at least 1 missing value
# (the result is assigned back instead of using inplace=True, so we never mutate
# a frame that was sliced out of `ipeds`)
ipeds2 = ipeds2.dropna()

In [7]:
# confirm: every column should now report 0 missing values
ipeds2.isna().sum(axis=0)

ID number           0
Name                0
Applicants total    0
Admissions total    0
Enrolled total      0
dtype: int64

In [8]:
# create a simple column: the single value 1 is repeated for every row
ipeds2['just1'] = 1
ipeds2.head(n=5)

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total,just1
0,100654,Alabama A & M University,6142.0,5521.0,1104.0,1
1,100663,University of Alabama at Birmingham,5689.0,4934.0,1773.0,1
3,100706,University of Alabama in Huntsville,2054.0,1656.0,651.0,1
4,100724,Alabama State University,10245.0,5251.0,1479.0,1
5,100751,The University of Alabama,30975.0,17515.0,6454.0,1


In [0]:
# summary statistics for the dataset (describe() covers the numeric columns by default)
ipeds2.describe()



---



***What just happened***

Pandas broadcasts the calculation to every row in our dataset, so there is no need to work through the rows one by one. This is the equivalent of dragging a formula down every row in Excel.




---



In [16]:
# new columns are not limited to constants: this one is computed from an existing column
ipeds2['double_apps'] = 2 * ipeds2['Applicants total']
ipeds2.head(n=5)

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total,just1,double_apps
0,100654,Alabama A & M University,6142.0,5521.0,1104.0,1,12284.0
1,100663,University of Alabama at Birmingham,5689.0,4934.0,1773.0,1,11378.0
3,100706,University of Alabama in Huntsville,2054.0,1656.0,651.0,1,4108.0
4,100724,Alabama State University,10245.0,5251.0,1479.0,1,20490.0
5,100751,The University of Alabama,30975.0,17515.0,6454.0,1,61950.0


In [20]:
# columns can be combined with each other too:
# yield rate = students enrolled / students admitted, computed row by row
ipeds2['yield_rate'] = (
    ipeds2['Enrolled total'] / ipeds2['Admissions total']
)
ipeds2.head(n=5)

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total,just1,double_apps,yield_rate
0,100654,Alabama A & M University,6142.0,5521.0,1104.0,1,12284.0,0.199964
1,100663,University of Alabama at Birmingham,5689.0,4934.0,1773.0,1,11378.0,0.359343
3,100706,University of Alabama in Huntsville,2054.0,1656.0,651.0,1,4108.0,0.393116
4,100724,Alabama State University,10245.0,5251.0,1479.0,1,20490.0,0.281661
5,100751,The University of Alabama,30975.0,17515.0,6454.0,1,61950.0,0.368484


# Sorting

Remember `value_counts()`? Well, there is also `sort_values()`.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html

^^ Documentation is our friend

In [10]:
# get help on the DataFrame method sort_values
# (the trailing `?` is IPython/Jupyter help syntax, not plain Python)
pd.DataFrame.sort_values?

In [18]:
## sort the data by double_apps ascending (smallest first) and show the first 5 rows
ipeds2.sort_values('double_apps', ascending=True).head()

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total,just1,double_apps,yield_rate
99,115083,Golden Gate University-San Francisco,0.0,0.0,0.0,1,0.0,
1343,230889,Goddard College,4.0,4.0,2.0,1,8.0,0.5
331,148849,Shimer College,28.0,8.0,8.0,1,56.0,1.0
1500,414878,Trine University-Regional/Non-Traditional Camp...,33.0,10.0,7.0,1,66.0,0.7
1479,262086,Brandman University,34.0,34.0,24.0,1,68.0,0.705882


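> Note that even though we removed all missing values, we still get a NaN in `yield_rate`: this school has 0 enrolled and 0 admitted students, and 0 / 0 is undefined, so pandas stores NaN instead of raising an error. It's pretty great that pandas didn't yell at us, right?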

In [21]:
## same ascending sort by double_apps, but show the last 5 rows (the largest values)
ipeds2.sort_values('double_apps', ascending=True).tail()

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total,just1,double_apps,yield_rate
83,110653,University of California-Irvine,56515.0,23956.0,5077.0,1,113030.0,0.21193
841,193900,New York University,57845.0,15066.0,5207.0,1,115690.0,0.345613
86,110680,University of California-San Diego,60832.0,22812.0,4575.0,1,121664.0,0.200552
81,110635,University of California-Berkeley,61717.0,11108.0,4162.0,1,123434.0,0.374685
84,110662,University of California-Los Angeles,72676.0,15981.0,5620.0,1,145352.0,0.351668


In [22]:
## sorting on several columns: order by yield_rate first, then by double_apps within ties
ipeds2.sort_values(by=['yield_rate', 'double_apps']).tail(n=5)

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total,just1,double_apps,yield_rate
1476,245883,Antioch University-Seattle,42.0,33.0,30.0,1,84.0,0.909091
690,178697,College of the Ozarks,3006.0,391.0,357.0,1,6012.0,0.913043
1484,367884,Hodges University,221.0,183.0,170.0,1,442.0,0.928962
331,148849,Shimer College,28.0,8.0,8.0,1,56.0,1.0
99,115083,Golden Gate University-San Francisco,0.0,0.0,0.0,1,0.0,


In [14]:
## the sorts above were only temporary: ipeds2 itself still has its original row order
## (passing inplace=True is what saves a sort)
ipeds2.head(n=5)

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total,just1,double_apps
0,100654,Alabama A & M University,6142.0,5521.0,1104.0,1,12284.0
1,100663,University of Alabama at Birmingham,5689.0,4934.0,1773.0,1,11378.0
3,100706,University of Alabama in Huntsville,2054.0,1656.0,651.0,1,4108.0
4,100724,Alabama State University,10245.0,5251.0,1479.0,1,20490.0
5,100751,The University of Alabama,30975.0,17515.0,6454.0,1,61950.0


In [23]:
# same nested sort as before, but inplace=True stores the new row order in ipeds2 itself
ipeds2.sort_values(by=['yield_rate', 'double_apps'], inplace=True)

In [24]:
# the last 5 rows now reflect the saved sort order
ipeds2.tail(n=5)

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total,just1,double_apps,yield_rate
1476,245883,Antioch University-Seattle,42.0,33.0,30.0,1,84.0,0.909091
690,178697,College of the Ozarks,3006.0,391.0,357.0,1,6012.0,0.913043
1484,367884,Hodges University,221.0,183.0,170.0,1,442.0,0.928962
331,148849,Shimer College,28.0,8.0,8.0,1,56.0,1.0
99,115083,Golden Gate University-San Francisco,0.0,0.0,0.0,1,0.0,


> `inplace=True` is like assigning the output of the sort back to the same dataframe (`ipeds2 = ipeds2.sort_values(...)`); pandas just does it for us, in place.

# Combine this lesson together

In [25]:
# create a column that holds the length (number of characters) of each school's name,
# using the .str accessor to apply len() to every value in the Name column
ipeds2['name_length'] = ipeds2['Name'].str.len()

In [26]:
# peek at the first 5 rows to see the new name_length column
ipeds2.head(n=5)

Unnamed: 0,ID number,Name,Applicants total,Admissions total,Enrolled total,just1,double_apps,yield_rate,name_length
1530,454184,The Kingâ€™s College,3033.0,2158.0,127.0,1,6066.0,0.058851,20
736,182980,New England College,5723.0,5149.0,388.0,1,11446.0,0.075354,19
1011,206491,Wilberforce University,1534.0,895.0,69.0,1,3068.0,0.077095,22
1374,232672,Mary Baldwin College,3089.0,2729.0,218.0,1,6178.0,0.079883,20
1080,212054,Drexel University,43945.0,35815.0,3040.0,1,87890.0,0.084881,17


> Note: I corrected this in a previous notebook, but notice that we have to go through the `.str` accessor (named for the string type) on a Series of strings in order to reach the string methods such as `len`, `lower`, etc.

In [0]:
## order the rows by name_length, longest first (descending is just the ascending=False argument),
## save the result in place, and view the 5 longest names
ipeds2.sort_values(by="name_length", ascending=False, inplace=True)
ipeds2.head(n=5)