# DataFrames

A pandas DataFrame is a two-dimensional data structure, like a two-dimensional array or a table with rows and columns.

Each column of a DataFrame is a pandas `Series`.

In [None]:
import pandas as pd
import numpy as np

## Create a DataFrame from a dictionary of lists

In [None]:
# each key becomes a column name and each list holds that column's values
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}
df = pd.DataFrame(data=data)
df

In [None]:
# same data, but with explicit row labels instead of the default
# integer index
df = pd.DataFrame(
    data=data,
    index=["day1", "day2", "day3"],
)
df

## Creating a DataFrame from a CSV file

In [None]:
# read in the data and display it:
# use the Symbol column as the index, and
# only read in columns in positions 0, 2, 3, 7
sp500 = pd.read_csv(
    "data/sp500.csv",
    index_col='Symbol',
    usecols=[0, 2, 3, 7],
)
sp500

## Create a DataFrame from Series

In [None]:
# six consecutive days, used as the shared index of both temperature Series
dates = pd.date_range(start='2016-04-01', end='2016-04-06')
temps1 = pd.Series(data=[80, 82, 85, 90, 83, 87], index=dates)
temps2 = pd.Series(data=[70, 75, 69, 83, 79, 77], index=dates)
# each dictionary key becomes the name of the column built from its Series
temps_df = pd.DataFrame({'Missoula': temps1, 'Philadelphia': temps2})
temps_df

In [None]:
# retrieve the columns index, i.e. the column labels of temps_df
temps_df.columns

In [None]:
# how many rows? len() of a DataFrame counts its rows
len(temps_df)

In [None]:
# what is the dimensionality? shape is a (rows, columns) tuple
temps_df.shape

## Accessing rows

In [None]:
# get the row whose index label is '2016-04-01'
# (.loc looks rows up by label)
temps_df.loc['2016-04-01']

In [None]:
# get the row at array position 1, i.e. the second row
# (.iloc uses 0-based integer positions)
temps_df.iloc[1]

In [None]:
# select the rows at positions 1 and 2; the stop value of a
# positional slice (3 here) is excluded
temps_df.iloc[1:3]

In [None]:
# select rows with specific indexes by passing a list of labels
temps_df.loc[['2016-04-01', '2016-04-03']]

In [None]:
# select rows with ranged indexes; unlike a positional slice,
# a label slice includes both endpoints
temps_df.loc['2016-04-01':'2016-04-03']

In [None]:
# look up the integer positions of 2016-04-01 and 2016-04-03 in the index
i1, i2 = [
    temps_df.index.get_loc(day)
    for day in ('2016-04-01', '2016-04-03')
]
(i1, i2)

In [None]:
# and get the rows at the positions stored in i1 and i2
temps_df.iloc[[i1, i2]]

## Accessing columns

In [None]:
# get the column with the name Missoula
# (a single column label inside [] selects that column)
temps_df['Missoula']

In [None]:
# return both columns in a different order: a list of column names
# inside [] returns those columns in the listed order
temps_df[['Philadelphia', 'Missoula']]

In [None]:
# retrieve the Missoula column through property (attribute) syntax
# and show its first rows with head()
temps_df.Missoula.head()

## Scalar lookup

In [None]:
# by label in both the index and column:
# row '2016-04-03', column 'Missoula'
temps_df.at['2016-04-03', 'Missoula']

In [None]:
# by location.  Row 2, column 0 (0-based integer positions)
temps_df.iat[2, 0]

## Slicing data

In [None]:
# first five rows (a slice inside [] selects rows, not columns)
sp500[:5]

In [None]:
# ABT through ACN labels (a label slice inside [] selects rows
# and includes both endpoints)
sp500['ABT':'ACN']

In [None]:
# slice the Price column for the rows at
# positions 1, 2 and 3 (as though it is an array; the stop value 4 is excluded)
sp500.Price[1:4]

In [None]:
# get the values in the Price column in rows 1, 3 and 5
# using 0-based location: pick the rows with iloc, then take Price
sp500.iloc[[1, 3, 5]].Price

## Selecting rows using Boolean selection

In [None]:
# what rows have a price < 100?
# the comparison yields one True/False value per row
sp500.Price < 100

In [None]:
# now get the rows with Price < 100: a Boolean result inside []
# keeps only the rows where it is True
sp500[sp500.Price < 100]

In [None]:
# get the Price and Sector columns for the rows where
# Price is < 40 and Sector = Health Care
r = sp500.loc[
    (sp500.Price < 40) & (sp500.Sector == 'Health Care'),
    ['Price', 'Sector'],
]
r

In [None]:
# select the Sector and Price columns for ABT and ZTS
# in one .loc call: row labels first, then column labels
sp500.loc[['ABT', 'ZTS'], ['Sector', 'Price']]

## Viewing the Data

In [None]:
# show the first 5 rows of the DataFrame (head() defaults to 5 rows)
sp500.head()

In [None]:
# show the last 5 rows of the DataFrame (tail() defaults to 5 rows)
sp500.tail()

In [None]:
# Print a summary of the DataFrame: index, columns and their dtypes,
# non-null counts and memory usage
sp500.info()

## Visualization

In [None]:
# plot the values in the Price column of sp500 as a line chart.
# NOTE: `df` only holds the calories/duration columns, so the
# former `df.Close` lookup raised AttributeError; Price is the
# price column available in sp500.
sp500.Price.plot()