# DataFrames

A pandas DataFrame is a two-dimensional data structure, like a two-dimensional array or a table with rows and columns.

Each column of a DataFrame is a pandas `Series`.

In [None]:
import pandas as pd
import numpy as np

## Create a DataFrame from a dict of lists

In [None]:
# Column-oriented input: each key is a column label and each list holds
# that column's values, one entry per row.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)
df = pd.DataFrame(data)
df

In [None]:
# Pass index= to label the rows by name instead of the default 0, 1, 2.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)
df = pd.DataFrame(data, index=["day1", "day2", "day3"])
df

## Creating a DataFrame from a CSV file

In [None]:
# Read the CSV keeping three columns chosen by position, parse the Date
# column as datetimes and use it as the row index.
# NOTE(review): positions 0, 1 and 4 depend on the CSV's column order —
# confirm against the file header.
df = pd.read_csv(
    'data/goog.csv',
    usecols=[0, 1, 4],
    index_col='Date',
    parse_dates=['Date'],
)
df

In [None]:
# verify that the Date column is now the index of the frame
df.index

## Creating a DataFrame from a JSON file

In [None]:
# Load the JSON file under its own name. Reusing `df` here would overwrite
# the frame read from data/goog.csv, which the later head()/tail()/info()
# cell and the Close plot still refer to as `df`.
json_df = pd.read_json('data.json')
json_df

## Create a DataFrame from Series

In [None]:
# Six consecutive daily timestamps, 2016-04-01 .. 2016-04-06, shared by both cities.
dates = pd.date_range('2016-04-01', periods=6)
temps1 = pd.Series([80, 82, 85, 90, 83, 87], index=dates)
temps2 = pd.Series([70, 75, 69, 83, 79, 77], index=dates)
# Each dict key becomes a column name; the two Series line up on their common index.
temps_df = pd.DataFrame({'Missoula': temps1, 'Philadelphia': temps2})
temps_df

In [None]:
# retrieve the columns index, i.e. the labels of the frame's columns
temps_df.columns

In [None]:
# how many rows? len() of a DataFrame counts its rows, not its columns
len(temps_df)

In [None]:
# what is the dimensionality? shape is a (number of rows, number of columns) tuple
temps_df.shape

## Accessing rows

In [None]:
# get the row with index label '2016-04-01' (.loc looks rows up by label);
# the result is a Series keyed by column name
temps_df.loc['2016-04-01']

In [None]:
# get the row at array position 1 (.iloc is position-based and 0-based,
# so this is the second row)
temps_df.iloc[1]

In [None]:
# use a list of index labels to fetch several rows at once;
# passing a list returns a DataFrame rather than a single-row Series
temps_df.loc[['2016-04-01', '2016-04-03']]

In [None]:
# translate the labels 2016-04-01 and 2016-04-03 into their integer
# positions within the index
i1, i2 = (
    temps_df.index.get_loc(label)
    for label in ('2016-04-01', '2016-04-03')
)
(i1, i2)

In [None]:
# and get the rows at integer positions i1 and i2
temps_df.iloc[[i1, i2]]

## Accessing columns

In [None]:
# get the column with the name Missoula (a single label returns a Series)
temps_df['Missoula']

In [None]:
# return both columns in a different order: a list of labels returns a
# DataFrame whose columns follow the order of the list
temps_df[['Philadelphia', 'Missoula']]

In [None]:
# retrieve the Missoula column through property (attribute) syntax
# and show only its first 5 values with head()
temps_df.Missoula.head()

## Adding a new column

In [None]:
# add a column to temps_df holding the difference in temps
# (Missoula minus Philadelphia); assigning to a label that does not
# exist yet creates the column
temps_df['Difference'] = temps_df['Missoula'] - temps_df['Philadelphia']
temps_df

## Scalar lookup

In [None]:
# by label in both the index and column: .at takes (row label, column label)
# and returns a single scalar value
temps_df.at['2016-04-03', 'Missoula']

In [None]:
# by location.  Row 2, column 0 (.iat takes 0-based integer positions)
temps_df.iat[2, 0]

## Slicing data

In [None]:
# slice the temp differences column by position, as though it were an
# array: positions 1, 2 and 3 (the stop value 4 is excluded).
# .iloc makes the positional intent explicit instead of relying on how
# a bare [] slice is interpreted.
temps_df.Difference.iloc[1:4]

In [None]:
# get the values of the Difference column in rows 1, 3 and 5
# (0-based positions)
temps_df.Difference.iloc[[1, 3, 5]]

In [None]:
# which values in the Missoula column are > 82?
# the comparison is element-wise and yields a boolean Series
temps_df.Missoula > 82

In [None]:
# return the rows where the temps for Missoula > 82,
# using the boolean Series as a row filter
temps_df[temps_df.Missoula > 82]

In [50]:
# return the rows where Missoula > 82 AND Philadelphia > 75.
# each comparison is parenthesised because & binds more tightly than >
temps_df[(temps_df.Missoula > 82) & (temps_df.Philadelphia > 75)]

Unnamed: 0,Missoula,Philadelphia
2016-04-04,90,83
2016-04-05,83,79
2016-04-06,87,77


In [51]:
# select the Missoula column for '2016-04-01' and '2016-04-03'.
# one .loc[rows, columns] call replaces two chained [] lookups; the list
# around 'Missoula' keeps the result a one-column DataFrame
temps_df.loc[['2016-04-01', '2016-04-03'], ['Missoula']]

Unnamed: 0,Missoula
2016-04-01,80
2016-04-03,85


## Viewing the Data

In [None]:
# Print the first 5 rows of the DataFrame
print(df.head())

# Print the last 5 rows of the DataFrame
print(df.tail())

# Print a summary of the DataFrame.
# info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()

## Visualization

In [None]:
# plots the values in the Close column against the frame's index
# NOTE(review): this needs `df` to be a frame that has a Close column —
# presumably the one read from data/goog.csv; confirm that no cell in
# between has rebound `df` to a different frame.
df.Close.plot()