# Dataframes:
#### - Dataframes represent tabular, 2-dimensional data, and provide a number of facilities for manipulating and transforming the data.

In [2]:
import pandas as pd
import numpy as np

np.random.seed(123)

students = ['Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John', 'Thomas',
            'Marie', 'Albert', 'Richard', 'Isaac', 'Alan']

# randomly generate scores for each student for each subject
# note that all the values need to have the same length here

math_grades = np.random.randint(low=60, high=100, size=len(students))
english_grades = np.random.randint(low=60, high=100, size=len(students))
reading_grades = np.random.randint(low=60, high=100, size=len(students))

df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades})

type(df)


pandas.core.frame.DataFrame

In [3]:
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [None]:
## a dataframe is composed of series
# each series is a column
#but the df itself is more than a list of series
#the df itself has its own functionality on top

In [4]:
#df.column_name returns that specific series
df.name

0       Sally
1        Jane
2       Suzie
3       Billy
4         Ada
5        John
6      Thomas
7       Marie
8      Albert
9     Richard
10      Isaac
11       Alan
Name: name, dtype: object

In [5]:
#the df itself has its own method and functionality
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     12 non-null     object
 1   math     12 non-null     int64 
 2   english  12 non-null     int64 
 3   reading  12 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 512.0+ bytes


In [6]:
df.columns #looking at columns & index

Index(['name', 'math', 'english', 'reading'], dtype='object')

In [7]:
df.index

RangeIndex(start=0, stop=12, step=1)

In [8]:
#.describe provides summary stats on our numeric columns
df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,84.833333,77.666667,86.5
std,11.134168,13.371158,9.643651
min,62.0,62.0,67.0
25%,78.5,63.75,80.75
50%,90.0,77.5,89.0
75%,92.25,86.75,93.25
max,98.0,99.0,98.0


## Why DataFrames?
### -Rectangular data consisting of columns and rows
### -Dataframes enjoy functionality above/beyond series
### -Dataframes are a container for series

In [11]:
df.columns = [column.upper() for column in df.columns]
df

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [14]:
df.index = df.NAME

df

Unnamed: 0_level_0,NAME,MATH,ENGLISH,READING
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sally,Sally,62,85,80
Jane,Jane,88,79,67
Suzie,Suzie,94,74,95
Billy,Billy,98,96,88
Ada,Ada,77,92,98
John,John,79,76,93
Thomas,Thomas,82,64,81
Marie,Marie,93,63,90
Albert,Albert,92,62,87
Richard,Richard,69,80,94


In [21]:
##no index was named/set, so defaults to numeric (see code at top)
##index can be the name of lists in df, 

df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades}, index=students)

In [22]:
df

Unnamed: 0,name,math,english,reading
Sally,Sally,62,85,80
Jane,Jane,88,79,67
Suzie,Suzie,94,74,95
Billy,Billy,98,96,88
Ada,Ada,77,92,98
John,John,79,76,93
Thomas,Thomas,82,64,81
Marie,Marie,93,63,90
Albert,Albert,92,62,87
Richard,Richard,69,80,94


In [26]:
## How to access a specific column?
#df.column_name
# or df.['column_name']

df['math']

Sally      62
Jane       88
Suzie      94
Billy      98
Ada        77
John       79
Thomas     82
Marie      93
Albert     92
Richard    69
Isaac      92
Alan       92
Name: math, dtype: int64

In [27]:
df.math

Sally      62
Jane       88
Suzie      94
Billy      98
Ada        77
John       79
Thomas     82
Marie      93
Albert     92
Richard    69
Isaac      92
Alan       92
Name: math, dtype: int64

In [29]:
## making a NEW column, we need to use the ['column_name'] syntax

#df.['column_name'] = value

df['cohort'] = 'Jemison'
df

Unnamed: 0,name,math,english,reading,cohort
Sally,Sally,62,85,80,Jemison
Jane,Jane,88,79,67,Jemison
Suzie,Suzie,94,74,95,Jemison
Billy,Billy,98,96,88,Jemison
Ada,Ada,77,92,98,Jemison
John,John,79,76,93,Jemison
Thomas,Thomas,82,64,81,Jemison
Marie,Marie,93,63,90,Jemison
Albert,Albert,92,62,87,Jemison
Richard,Richard,69,80,94,Jemison


In [32]:
df['campus'] = 'San Antonio'
df

Unnamed: 0,name,math,english,reading,cohort,campus
Sally,Sally,62,85,80,Jemison,San Antonio
Jane,Jane,88,79,67,Jemison,San Antonio
Suzie,Suzie,94,74,95,Jemison,San Antonio
Billy,Billy,98,96,88,Jemison,San Antonio
Ada,Ada,77,92,98,Jemison,San Antonio
John,John,79,76,93,Jemison,San Antonio
Thomas,Thomas,82,64,81,Jemison,San Antonio
Marie,Marie,93,63,90,Jemison,San Antonio
Albert,Albert,92,62,87,Jemison,San Antonio
Richard,Richard,69,80,94,Jemison,San Antonio


In [33]:
#now we can call from the df via columns
#df.math >= 90 returns a series of booleans
df.math >= 90

Sally      False
Jane       False
Suzie       True
Billy       True
Ada        False
John       False
Thomas     False
Marie       True
Albert      True
Richard    False
Isaac       True
Alan        True
Name: math, dtype: bool

## Once you have your array/Series of booleans, you have the keys to the castle

-we can filter our results

-'df[df.column > 90]'

In [35]:
df[df.math > 90]

Unnamed: 0,name,math,english,reading,cohort,campus
Suzie,Suzie,94,74,95,Jemison,San Antonio
Billy,Billy,98,96,88,Jemison,San Antonio
Marie,Marie,93,63,90,Jemison,San Antonio
Albert,Albert,92,62,87,Jemison,San Antonio
Isaac,Isaac,92,99,93,Jemison,San Antonio
Alan,Alan,92,62,72,Jemison,San Antonio


In [40]:
# Let's try other filters...
## What if we need to store math >= 90 as its own column?

df['math_honors'] = df.math >= 90
df

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True
Billy,Billy,98,96,88,Jemison,San Antonio,True,True,True
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False
John,John,79,76,93,Jemison,San Antonio,False,False,False
Thomas,Thomas,82,64,81,Jemison,San Antonio,False,False,False
Marie,Marie,93,63,90,Jemison,San Antonio,True,False,True
Albert,Albert,92,62,87,Jemison,San Antonio,True,False,True
Richard,Richard,69,80,94,Jemison,San Antonio,False,False,False


In [41]:
df['english_honors'] = df.english >= 90
df

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True
Billy,Billy,98,96,88,Jemison,San Antonio,True,True,True
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False
John,John,79,76,93,Jemison,San Antonio,False,False,False
Thomas,Thomas,82,64,81,Jemison,San Antonio,False,False,False
Marie,Marie,93,63,90,Jemison,San Antonio,True,False,True
Albert,Albert,92,62,87,Jemison,San Antonio,True,False,True
Richard,Richard,69,80,94,Jemison,San Antonio,False,False,False


In [45]:
## What about creating an all_honors column?
# we can use past columns that already state boolean of True to add together as like an if, and statement
df['all_honors'] = df.english_honors & df.math_honors

#the above would be identitical to if we put (df.english >= 90) & (df.math >= 90)

df.head()

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False,False
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False
Billy,Billy,98,96,88,Jemison,San Antonio,True,True,True,True
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False,False


In [47]:
df['overall_average'] = (df.math + df.english) / 2

df.head()

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False,False,73.5
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,83.5
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False,84.0
Billy,Billy,98,96,88,Jemison,San Antonio,True,True,True,True,97.0
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False,False,84.5


In [49]:
## to view only a specific set of columns:
columns = ['name', 'math', 'reading', 'english']

df[columns].head()

Unnamed: 0,name,math,reading,english
Sally,Sally,62,80,85
Jane,Jane,88,67,79
Suzie,Suzie,94,95,74
Billy,Billy,98,88,96
Ada,Ada,77,98,92


In [50]:
## double brackets produce the same result as above w/o a variable
df[['name','math', 'reading', 'english']]

Unnamed: 0,name,math,reading,english
Sally,Sally,62,80,85
Jane,Jane,88,67,79
Suzie,Suzie,94,95,74
Billy,Billy,98,88,96
Ada,Ada,77,98,92
John,John,79,93,76
Thomas,Thomas,82,81,64
Marie,Marie,93,90,63
Albert,Albert,92,87,62
Richard,Richard,69,94,80


In [53]:
# How to drop columns
cols_to_drop = ['campus','cohort', 'math__honors']

df.drop(columns = cols_to_drop)
df.head()

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False,False,73.5
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,83.5
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False,84.0
Billy,Billy,98,96,88,Jemison,San Antonio,True,True,True,True,97.0
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False,False,84.5


In [54]:
# If you have a series/arrays of booleans, you can filter your results!!

df[df.name == 'Jane']

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,83.5


In [55]:
df[(df.name == 'Jane') | (df.name == 'Marie')] 

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,83.5
Marie,Marie,93,63,90,Jemison,San Antonio,True,False,True,False,78.0


In [56]:
## The | or & operators here return a series of booleans
(df.name == 'Jane') | (df.name == 'Marie')

Sally      False
Jane        True
Suzie      False
Billy      False
Ada        False
John       False
Thomas     False
Marie       True
Albert     False
Richard    False
Isaac      False
Alan       False
Name: name, dtype: bool

In [57]:
## The code that produces the series of boolean goes in the square braces
df[df.name.str.startswith('S')]

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False,False,73.5
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False,84.0


In [60]:
#Sorting
df.sort_values(by='english') #defaults are ascending order

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Albert,Albert,92,62,87,Jemison,San Antonio,True,False,True,False,77.0
Alan,Alan,92,62,72,Jemison,San Antonio,True,False,True,False,77.0
Marie,Marie,93,63,90,Jemison,San Antonio,True,False,True,False,78.0
Thomas,Thomas,82,64,81,Jemison,San Antonio,False,False,False,False,73.0
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False,84.0
John,John,79,76,93,Jemison,San Antonio,False,False,False,False,77.5
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,83.5
Richard,Richard,69,80,94,Jemison,San Antonio,False,False,False,False,74.5
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False,False,73.5
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False,False,84.5


In [61]:
df.sort_values(by='english', ascending = False)

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Isaac,Isaac,92,99,93,Jemison,San Antonio,True,True,True,True,95.5
Billy,Billy,98,96,88,Jemison,San Antonio,True,True,True,True,97.0
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False,False,84.5
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False,False,73.5
Richard,Richard,69,80,94,Jemison,San Antonio,False,False,False,False,74.5
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,83.5
John,John,79,76,93,Jemison,San Antonio,False,False,False,False,77.5
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False,84.0
Thomas,Thomas,82,64,81,Jemison,San Antonio,False,False,False,False,73.0
Marie,Marie,93,63,90,Jemison,San Antonio,True,False,True,False,78.0


In [63]:
df.sort_values(by=['english', 'math'])

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Albert,Albert,92,62,87,Jemison,San Antonio,True,False,True,False,77.0
Alan,Alan,92,62,72,Jemison,San Antonio,True,False,True,False,77.0
Marie,Marie,93,63,90,Jemison,San Antonio,True,False,True,False,78.0
Thomas,Thomas,82,64,81,Jemison,San Antonio,False,False,False,False,73.0
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False,84.0
John,John,79,76,93,Jemison,San Antonio,False,False,False,False,77.5
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,83.5
Richard,Richard,69,80,94,Jemison,San Antonio,False,False,False,False,74.5
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False,False,73.5
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False,False,84.5


In [65]:
#to reassign, just rename df with what looking at

df = df.sort_values(by=['english', 'math'])

## Chaining methods on a dataframe
-As long as your method returns a df, you can attach another method
-Think of string method chaining

In [68]:
"bANanA".swapcase().lower().swapcase()

'BANANA'

In [69]:
df.sort_values(by = 'all_honors').sort_values(by = 'name')

Unnamed: 0,name,math,english,reading,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False,False,84.5
Alan,Alan,92,62,72,Jemison,San Antonio,True,False,True,False,77.0
Albert,Albert,92,62,87,Jemison,San Antonio,True,False,True,False,77.0
Billy,Billy,98,96,88,Jemison,San Antonio,True,True,True,True,97.0
Isaac,Isaac,92,99,93,Jemison,San Antonio,True,True,True,True,95.5
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,83.5
John,John,79,76,93,Jemison,San Antonio,False,False,False,False,77.5
Marie,Marie,93,63,90,Jemison,San Antonio,True,False,True,False,78.0
Richard,Richard,69,80,94,Jemison,San Antonio,False,False,False,False,74.5
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False,False,73.5


In [70]:
##remember:
#Methods execute from left to right
#Functions execute from inside out (right to left)

In [72]:
# Renaming a column using a dictionary
cols_to_rename = {
    "math": "math_grade",
    "reading": "reading_grade",
    "english": "english_grade"
}
cols_to_rename

{'math': 'math_grade', 'reading': 'reading_grade', 'english': 'english_grade'}

In [73]:
df = df.rename(columns=cols_to_rename)

In [74]:
df

Unnamed: 0,name,math_grade,english_grade,reading_grade,cohort,campus,math__honors,english_honors,math_honors,all_honors,overall_average
Albert,Albert,92,62,87,Jemison,San Antonio,True,False,True,False,77.0
Alan,Alan,92,62,72,Jemison,San Antonio,True,False,True,False,77.0
Marie,Marie,93,63,90,Jemison,San Antonio,True,False,True,False,78.0
Thomas,Thomas,82,64,81,Jemison,San Antonio,False,False,False,False,73.0
Suzie,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False,84.0
John,John,79,76,93,Jemison,San Antonio,False,False,False,False,77.5
Jane,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,83.5
Richard,Richard,69,80,94,Jemison,San Antonio,False,False,False,False,74.5
Sally,Sally,62,85,80,Jemison,San Antonio,False,False,False,False,73.5
Ada,Ada,77,92,98,Jemison,San Antonio,False,True,False,False,84.5
