In [2]:
import pandas as pd
import numpy as np

In [3]:
# The Series is one of the building blocks of pandas. A pandas Series is a
# one-dimensional labeled array that can hold data of any type (integer,
# string, float, Python objects, etc.). The axis labels are collectively
# called the index.
pd.Series([1,2,3], index = ['a','b','c'])

a    1
b    2
c    3
dtype: int64

In [4]:
# Alternatively, the values can be passed as a NumPy array; the labels are
# still supplied through `index`.
pd.Series(np.array([1,2,3]), index = ['a','b','c'])

a    1
b    2
c    3
dtype: int64

In [8]:
# In data science, data is usually more than one-dimensional and mixes
# different data types, so a Series alone is not sufficient. DataFrames are
# 2-D arrays with both row and column labels. One way to create a DataFrame
# from scratch is to pass in a dict.
#
# Example: this week we sold 3 bottles of red wine to Adam, 6 to Bob, and 5 to
# Charles; we sold 5 bottles of white wine to Adam, 0 to Bob, and 10 to
# Charles. Each dict key becomes a column, and each list is ordered to match
# the customer names passed as `index` (adam, bob, charles).
wine_dict = {
    'red_wine': [3, 6, 5],
    'white_wine': [5, 0, 10],
}

sales = pd.DataFrame(wine_dict, index=['adam', 'bob', 'charles'])

# Only the last expression of a cell is rendered, so the cell ends with the
# column we want to inspect. Evaluate `sales` in its own cell to see the
# whole table.
sales['white_wine']

adam        5
bob         0
charles    10
Name: white_wine, dtype: int64

In [13]:
# A new DataFrame: besides the heights and ages of the presidents it also
# carries their order, names, and parties. It is read from a CSV file, and the
# 'name' column is used as the row index.
PRESIDENTS_CSV_URL = 'https://sololearn.com/uploads/files/president_heights_party.csv'

presidents_df = pd.read_csv(
    filepath_or_buffer=PRESIDENTS_CSV_URL,
    index_col='name',
)

In [12]:
# Similar to NumPy, use .shape to get the dimensions of a DataFrame as a
# (rows, columns) tuple.
presidents_df.shape

(45, 4)

In [7]:
# The first element of the shape tuple is the number of rows.
presidents_df.shape[0]

45

In [8]:
# .size also works on a DataFrame: it returns an integer, the total number of
# elements (rows x columns).
presidents_df.size

180

In [9]:
# Instead of looking at the entire dataset, we can take a peek. .head() shows
# the first few rows of a DataFrame; if n (the number of rows) is not
# specified, it displays the first 5. Here we display the first 3 rows.
presidents_df.head(n=3)

Unnamed: 0_level_0,order,age,height,party
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
George Washington,1,57,189,none
John Adams,2,61,170,federalist
Thomas Jefferson,3,57,189,democratic-republican


In [10]:
# Use .info() to get an overview of the DataFrame. Its output includes the
# index, column names, count of non-null values, dtypes, and memory usage.
# The dtype of order, age and height is integer, while party is an object.
# The count of non-null values in each column equals the number of rows,
# which indicates that there are no missing values.
presidents_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, George Washington to Donald J. Trump
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   order   45 non-null     int64 
 1   age     45 non-null     int64 
 2   height  45 non-null     int64 
 3   party   45 non-null     object
dtypes: int64(3), object(1)
memory usage: 1.8+ KB


In [11]:
# Instead of memorizing integer positions to locate the order, age, height,
# and party of Abraham Lincoln, we can look the row up by its index label
# with .loc.
presidents_df.loc['Abraham Lincoln']

order             16
age               52
height           193
party     republican
Name: Abraham Lincoln, dtype: object

In [12]:
# Selecting a single row label with .loc gives back a pandas Series with one
# entry per column, so its shape is (4,).
lincoln_row = presidents_df.loc['Abraham Lincoln']

# A bare `type(...)` in the middle of a cell is evaluated and thrown away
# (only the last expression is rendered), so check the type explicitly.
assert isinstance(lincoln_row, pd.Series)

lincoln_row.shape

(4,)

In [13]:
# We can also slice by index label. Say we are interested in gathering
# information on all of the presidents between Abraham Lincoln and
# Ulysses S. Grant. Note that a label slice with .loc includes both end
# points.
presidents_df.loc['Abraham Lincoln': 'Ulysses S. Grant']

Unnamed: 0_level_0,order,age,height,party
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abraham Lincoln,16,52,193,republican
Andrew Johnson,17,56,178,national union
Ulysses S. Grant,18,46,173,republican


In [14]:
# Alternatively, if we do know the integer position(s), we can use .iloc to
# access the row(s). Positions are zero-based, so position 15 is the 16th row.
presidents_df.iloc[15]

order             16
age               52
height           193
party     republican
Name: Abraham Lincoln, dtype: object

In [15]:
# To gather information on the 16th through 18th presidents, slice by
# position. Unlike a .loc label slice, the end position (18) is excluded, so
# this returns positions 15, 16 and 17.
presidents_df.iloc[15:18]

Unnamed: 0_level_0,order,age,height,party
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abraham Lincoln,16,52,193,republican
Andrew Johnson,17,56,178,national union
Ulysses S. Grant,18,46,173,republican


In [16]:
# We can retrieve an entire column from presidents_df by name. First, list all
# the column names ('name' is not among them because it is the index).
presidents_df.columns

Index(['order', 'age', 'height', 'party'], dtype='object')

In [17]:
# Select the 'height' column by name; the result is a Series indexed by
# president name.
presidents_df['height']

name
George Washington         189
John Adams                170
Thomas Jefferson          189
James Madison             163
James Monroe              183
John Quincy Adams         171
Andrew Jackson            185
Martin Van Buren          168
William Henry Harrison    173
John Tyler                183
James K. Polk             173
Zachary Taylor            173
Millard Fillmore          175
Franklin Pierce           178
James Buchanan            183
Abraham Lincoln           193
Andrew Johnson            178
Ulysses S. Grant          173
Rutherford B. Hayes       174
James A. Garfield         183
Chester A. Arthur         183
Grover Cleveland          180
Benjamin Harrison         168
Grover Cleveland          180
William McKinley          170
Theodore Roosevelt        178
William Howard Taft       182
Woodrow Wilson            180
Warren G. Harding         183
Calvin Coolidge           178
Herbert Hoover            182
Franklin D. Roosevelt     188
Harry S. Truman           175
Dwigh

In [18]:
# A single column is one-dimensional, so its shape is a 1-tuple.
presidents_df['height'].shape

(45,)

In [19]:
# Number of elements in the column.
presidents_df['height'].size

45

In [20]:
# Passing a list of column names selects several columns at once; the result
# is a DataFrame whose columns follow the order of the list.
presidents_df[['height','age']].head(n=3)

Unnamed: 0_level_0,height,age
name,Unnamed: 1_level_1,Unnamed: 2_level_1
George Washington,189,57
John Adams,170,61
Thomas Jefferson,189,57


In [21]:
# To access the columns order, age, and height we can also use .loc, which
# accepts a column selection as its second argument. For example, to access
# the columns from order through height for the first three presidents:
presidents_df.loc[:, 'order':'height'].head(n=3)

Unnamed: 0_level_0,order,age,height
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
George Washington,1,57,189
John Adams,2,61,170
Thomas Jefferson,3,57,189


In [22]:
# Minimum of every column. For the string column 'party' this is the value
# that sorts first alphabetically.
presidents_df.min()

order              1
age               42
height           163
party     democratic
dtype: object

In [23]:
# Maximum of every column. For the string column 'party' this is the value
# that sorts last alphabetically.
presidents_df.max()

order       45
age         70
height     193
party     whig
dtype: object

In [17]:
# Variance and standard deviation: in probability and statistics, variance
# measures the average squared deviation of each data point from the mean of
# the entire dataset. You can think of it as how far a set of numbers is
# spread out from its average value. Standard deviation is the square root of
# the variance. A high std implies a large spread; a low std indicates a
# small spread, i.e. most points are close to the mean.
#
# For [2, 3, 4] the squared deviations from the mean (3) are 1, 0, 1; the
# result below is 1.0, i.e. their sum divided by n - 1 (sample variance).
const = pd.Series(data=[2, 3, 4])
const.var()
# const.std() returns the standard deviation instead

1.0

In [19]:
# Standard deviation of the presidents' ages. This is the same `std` value
# that describe() reports for the column; .var() would return its square.
presidents_df['age'].std()

6.59545297913646

In [20]:
# describe() returns most of the summary statistics in one call: count, mean,
# std, min, the 25% / 50% / 75% quantiles, and max.
presidents_df['age'].describe()

count    45.000000
mean     55.000000
std       6.595453
min      42.000000
25%      51.000000
50%      55.000000
75%      58.000000
max      70.000000
Name: age, dtype: float64

In [21]:
# Categorical variable: the describe() call above only covered 'age'. Calling
# describe() on the whole DataFrame would, by default, summarize only the
# numeric columns and leave out the fourth column, 'party', because it is a
# categorical variable. A categorical variable is one that takes on a single
# value from a limited set of categories. It doesn't make sense to calculate
# the mean of democratic, republican, federalist, and other parties. Instead
# we can check the unique values and their frequencies with .value_counts():

presidents_df['party'].value_counts()

republican               19
democratic               15
democratic-republican     4
whig                      4
none                      1
federalist                1
national union            1
Name: party, dtype: int64

In [22]:
# We can also call describe() on the column to see that there are 45 non-null
# values and 7 unique parties, and that the most frequent party is republican,
# with a total of 19 presidents belonging to it.
presidents_df['party'].describe()

count             45
unique             7
top       republican
freq              19
Name: party, dtype: object

In [24]:
# Groupby: summary statistics on an entire dataset provide a good overall
# view, but often we're interested in a calculation conditional on a given
# label or category. For example, what is the average height conditional on
# the president's party?

# To compute a value per category we can use the groupby operation. Think of
# groupby as three steps: split, apply, and combine. The split step breaks
# the DataFrame into multiple DataFrames based on the value of the specified
# key; the apply step performs the operation inside each smaller DataFrame;
# the combine step merges the pieces back into a single DataFrame.

presidents_df.groupby('party').mean()

Unnamed: 0_level_0,order,age,height
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
democratic,26.066667,52.6,181.066667
democratic-republican,4.5,57.25,176.5
federalist,2.0,61.0,170.0
national union,17.0,56.0,178.0
none,1.0,57.0,189.0
republican,29.631579,55.263158,180.894737
whig,11.0,58.25,176.0


In [30]:
# Aggregation: we can also perform multiple operations on the groupby object
# using the .agg() method. It takes a string, a function, or a list thereof.
# For example, to obtain the min, median, and max heights grouped by party:
#
# The aggregations are given by name. String names are dispatched to pandas'
# own groupby implementations and produce the same column labels
# (min / median / max), whereas passing np.median or the builtin max is
# deprecated in recent pandas releases.

presidents_df.groupby('party')['height'].agg(['min', 'median', 'max'])

Unnamed: 0_level_0,min,median,max
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
democratic,168,180.0,193
democratic-republican,163,177.0,189
federalist,170,170.0,170
national union,178,178.0,178
none,189,189.0,189
republican,168,182.0,193
whig,173,174.0,183


In [31]:
# Often we are interested in different summary statistics for different
# columns. For instance, we would like the median and mean of heights but the
# minimum and maximum of ages, grouped by party. In this case we can pass a
# dict whose keys are column names and whose values are the aggregation(s) to
# apply to that column.
#
# Aggregations are given by name rather than as np.median / np.mean / builtin
# min / max: the resulting column labels are identical, and string names avoid
# the deprecated callable path in recent pandas releases.

presidents_df.groupby('party').agg({'height': ['median', 'mean'], 'age': ['min', 'max']})

Unnamed: 0_level_0,height,height,age,age
Unnamed: 0_level_1,median,mean,min,max
party,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
democratic,180.0,181.066667,43,65
democratic-republican,177.0,176.5,57,58
federalist,170.0,170.0,61,61
national union,178.0,178.0,56,56
none,189.0,189.0,57,57
republican,182.0,180.894737,42,70
whig,174.0,176.0,50,68


In [None]:
#project

