# Lab: Pandas Data Loading & Indexing

Read in the file `data/states.csv` (located at `../data/states.csv` relative to this notebook)

In [1]:
import pandas as pd
# Initial load: no index column is specified, so pandas assigns its default
# integer row index.
states_path = '../data/states.csv'
states = pd.read_csv(states_path)

Examine the file

In [2]:
# Preview the first five rows to confirm the columns parsed as expected.
states.head(n=5)

Unnamed: 0,Abbreviation,state,area,pop
0,AL,Alabama,135767,4874747.0
1,AK,Alaska,1723337,739795.0
2,AZ,Arizona,295234,7016270.0
3,AR,Arkansas,137732,3004279.0
4,CA,California,423967,39536653.0


In [3]:
# Column dtypes and non-null counts: a quick way to spot missing values.
states.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Abbreviation  52 non-null     object 
 1   state         52 non-null     object 
 2   area          52 non-null     int64  
 3   pop           51 non-null     float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.8+ KB


Re-read the file, using the Abbreviation as an index

In [4]:
# Reload the same file, this time promoting the Abbreviation column to the row
# index so rows can be selected by state code with .loc.
states = pd.read_csv(
    '../data/states.csv',
    index_col='Abbreviation',
)
states.head()

Unnamed: 0_level_0,state,area,pop
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL,Alabama,135767,4874747.0
AK,Alaska,1723337,739795.0
AZ,Arizona,295234,7016270.0
AR,Arkansas,137732,3004279.0
CA,California,423967,39536653.0


Find the largest state by land area

In [5]:
# Approach 1: order the rows largest-first by area; the top row of the
# preview is the largest state.
by_area_desc = states.sort_values(by='area', ascending=False)
by_area_desc.head()

Unnamed: 0_level_0,state,area,pop
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,Alaska,1723337,739795.0
TX,Texas,695662,28304596.0
CA,California,423967,39536653.0
MT,Montana,380831,1050493.0
NM,New Mexico,314917,2088070.0


In [6]:
# Approach 2: compute the maximum area, then keep every row whose area equals
# it (this would return all tied rows, if there were any).
maxarea = states['area'].max()
states.loc[states['area'] == maxarea]

Unnamed: 0_level_0,state,area,pop
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,Alaska,1723337,739795.0


In [7]:
# Approach 3: idxmax returns the index label (the state abbreviation) of the
# row holding the maximum area.
states['area'].idxmax()

'AK'

In [8]:
# Use the label returned by idxmax to pull the full row for the largest state.
largest_label = states['area'].idxmax()
states.loc[largest_label]

state     Alaska
area     1723337
pop       739795
Name: AK, dtype: object

Create a dataframe with just Florida, Georgia, Alabama, and South Carolina

In [10]:
# Build the list of full state names to keep by splitting a single
# comma-separated string.
names_csv = 'Florida, Georgia, Alabama, South Carolina'
subset = names_csv.split(', ')
subset

['Florida', 'Georgia', 'Alabama', 'South Carolina']

In [11]:
# Keep only the rows whose full state name appears in `subset`.
in_southeast = states['state'].isin(subset)
se_states = states.loc[in_southeast]
se_states

Unnamed: 0_level_0,state,area,pop
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL,Alabama,135767,4874747.0
FL,Florida,170312,20984400.0
GA,Georgia,153910,10429379.0
SC,South Carolina,82933,5024369.0


In [13]:
# Alternative: select the same four states directly by their index labels.
# .loc with a list returns the rows in the order the labels are given.
southeast_codes = ['AL', 'FL', 'GA', 'SC']
states.loc[southeast_codes]

Unnamed: 0_level_0,state,area,pop
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL,Alabama,135767,4874747.0
FL,Florida,170312,20984400.0
GA,Georgia,153910,10429379.0
SC,South Carolina,82933,5024369.0


Save the new dataframe to `southeast.csv` (hint: there is a `DataFrame.to_csv()` method)

In [14]:
# Write the subset to disk. index=True (the default, spelled out here) keeps
# the Abbreviation index as the first column of the file.
se_states.to_csv('../data/southeast.csv', index=True)

In [15]:
# Show the raw file to confirm what was written (header row plus the index as
# the first column). Plain Python is used instead of the `!cat` shell escape so
# the cell also runs where no `cat` command exists (e.g. a stock Windows kernel).
with open('../data/southeast.csv', encoding='utf-8') as f:
    # end='' avoids adding an extra blank line after the file's final newline.
    print(f.read(), end='')

Abbreviation,state,area,pop
AL,Alabama,135767,4874747.0
FL,Florida,170312,20984400.0
GA,Georgia,153910,10429379.0
SC,South Carolina,82933,5024369.0


Read in the sales data `data/sales.csv` (located at `../data/sales.csv` relative to this notebook)

In [16]:
# Load the order lines; parse_dates converts the `date` column to datetimes
# at read time so it can be filtered by date range later on.
sales = pd.read_csv(
    '../data/sales.csv',
    parse_dates=['date'],
)
sales.head()

Unnamed: 0,order_num,line_num,date,sku,qty
0,0,0,2011-01-01,sku4333,6
1,0,1,2011-01-01,sku76536,7
2,1,0,2011-01-02,sku75108,3
3,1,1,2011-01-02,sku78838,9
4,1,2,2011-01-02,sku77480,9


In [17]:
# Confirm the dtypes (in particular that `date` was parsed as a datetime) and
# check the non-null counts.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2450 entries, 0 to 2449
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   order_num  2450 non-null   int64         
 1   line_num   2450 non-null   int64         
 2   date       2450 non-null   datetime64[ns]
 3   sku        2450 non-null   object        
 4   qty        2450 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 95.8+ KB


Find the average line quantity during the month of June, 2011

In [18]:
# Select June 2011 with a half-open interval [June 1, July 1): the strict
# upper bound keeps any June 30 rows that carry a time-of-day component.
on_or_after_june_1 = sales['date'] >= '2011-06-01'
before_july_1 = sales['date'] < '2011-07-01'
june = sales[on_or_after_june_1 & before_july_1]

# Average quantity per order line in that month.
june['qty'].mean()

4.861111111111111

In [19]:
# Cross-check of the mean computed by hand: total quantity divided by the
# number of non-null quantities. The vectorized Series methods are used
# instead of Python's built-in sum(), which iterates the Series one element
# at a time. count() (rather than len) keeps the result consistent with
# .mean() if `qty` ever contains missing values.
june['qty'].sum() / june['qty'].count()

4.861111111111111

Read the stock data `data/closing-prices.csv` (located at `../data/closing-prices.csv` relative to this notebook)

In [20]:
# Use the first column (the dates) as the row index and parse it into
# datetimes, so rows can later be sliced by date strings with .loc.
stock = pd.read_csv(
    '../data/closing-prices.csv',
    index_col=0,
    parse_dates=[0],
)

Examine its structure

In [21]:
# Preview the first five trading days: one column of closing prices per ticker.
stock.head(n=5)

Unnamed: 0,F,TSLA,GOOG,IBM,AAPL
2014-01-02,12.089,150.1,,157.6001,72.7741
2014-01-03,12.1438,149.56,,158.543,71.1756
2014-01-06,12.1986,147.0,,157.9993,71.5637
2014-01-07,12.042,149.36,,161.1508,71.0516
2014-01-08,12.1673,151.28,,159.6728,71.5019


Find the min, max, mean, and median price of AAPL during June of 2015

In [22]:
# Slice AAPL closing prices for June 2015 (label slices on the date index
# include both endpoints), then summarize. describe() reports min, max and
# mean directly; the median is the 50% row.
aapl_june_2015 = stock.loc['2015-06-01':'2015-06-30', 'AAPL']
aapl_june_2015.describe()

count     22.000000
mean     120.584491
std        1.362229
min      117.493300
25%      119.885025
50%      120.394500
75%      121.366325
max      123.158900
Name: AAPL, dtype: float64

In [23]:
# Alternative: request exactly the four statistics asked for, by name.
summary_stats = 'min max mean median'.split()
stock.loc['2015-06-01':'2015-06-30', 'AAPL'].agg(summary_stats)

min       117.493300
max       123.158900
mean      120.584491
median    120.394500
Name: AAPL, dtype: float64