# Configuring pandas

In [None]:
# import numpy and pandas
import numpy as np
import pandas as pd

# Load the S&P 500 data: keep only the columns at positions 0, 2, 3
# and 7, and label the rows with the Symbol column.
sp500 = pd.read_csv(
    "data/sp500.csv",
    usecols=[0, 2, 3, 7],
    index_col='Symbol',
)

# Load one month of stock history data.
omh = pd.read_csv("data/omh.csv")

# Performing arithmetic on a DataFrame or Series

In [None]:
# Fix the random seed so the generated numbers are reproducible.
np.random.seed(123456)
# Build a 5x4 DataFrame of standard-normal draws with columns A-D.
df = pd.DataFrame(data=np.random.randn(5, 4), columns=list('ABCD'))
df

In [None]:
# Double every value in the DataFrame.
df.mul(2)

In [None]:
# Take the first row as a Series.
s = df.iloc[0]
# Subtract that row from each row of the DataFrame; the Series labels
# are matched against the column labels.
diff = df.sub(s, axis='columns')
diff

In [None]:
# Reverse the operands: compute (Series - DataFrame) instead of
# (DataFrame - Series).
diff2 = df.rsub(s)
diff2

In [None]:
# Pull out column A as a Series.
a_col = df.loc[:, 'A']
# Subtract column A from every column, matching values on the row index.
df.sub(a_col, axis='index')

# Counts of values

In [None]:
# Sample Series with one missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
s = pd.Series(['a', 'a', 'b', 'c', np.nan])
# number of non-NaN values in the Series (NaN is not counted)
s.count()

# Unique and number of unique values

In [None]:
# Return the distinct values of the Series as an array.
pd.unique(s)

In [None]:
# Count the distinct values; dropna=True (the default) leaves NaN out.
s.nunique(dropna=True)

In [None]:
# Count the distinct values again, this time passing dropna=False so
# that NaN is included in the count.
s.nunique(dropna=False)

In [None]:
# Tally how often each distinct value occurs; dropna=False keeps an
# entry for NaN in the result as well.
s.value_counts(dropna=False)

# Minimum and maximum

In [None]:
# Smallest value in each of the MSFT and AAPL columns.
omh.loc[:, ['MSFT', 'AAPL']].min()

In [None]:
# Largest value in each of the MSFT and AAPL columns.
omh.loc[:, ['MSFT', 'AAPL']].max()

In [None]:
# Index label at which each of the two columns reaches its minimum.
omh.loc[:, ['MSFT', 'AAPL']].idxmin(axis=0)

In [None]:
# Index label at which each of the two columns reaches its maximum.
omh.loc[:, ['MSFT', 'AAPL']].idxmax(axis=0)

# Smallest and Largest Values

In [None]:
# Rows holding the 4 smallest MSFT values, showing only the MSFT column.
omh.nsmallest(n=4, columns='MSFT')['MSFT']

In [None]:
# Rows holding the 4 largest MSFT values, showing only the MSFT column.
omh.nlargest(n=4, columns='MSFT')['MSFT']

In [None]:
# The same selection made directly on the MSFT Series.
omh['MSFT'].nsmallest(n=4)

# Accumulations

Accumulations are statistical methods that compute a running result by repeatedly combining the next value in a `Series` with the result accumulated so far.

In [None]:
# Cumulative product: the result is another Series whose value at each
# position is the product of every value up to and including it.
pd.Series(data=[1, 2, 3, 4]).cumprod(skipna=True)

In [None]:
# Cumulative sum: each position holds the total of every value up to
# and including it.
pd.Series(data=[1, 2, 3, 4]).cumsum(skipna=True)

# Summary descriptive statistics

In [None]:
# Summary statistics per column, with the default quartiles
# (25%, 50%, 75%) written out explicitly.
omh.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Summary statistics for the MSFT column alone.
omh['MSFT'].describe()

In [None]:
# Pick just the 'mean' entry out of the MSFT summary.
omh['MSFT'].describe().loc['mean']

In [None]:
# Summary stats on non-numeric data: describe() reports count, unique,
# top and freq instead of numeric statistics.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
s = pd.Series(['a', 'a', 'b', 'c', np.nan])
s.describe()

# Mean

In [None]:
# The mean of each numeric column in omh.
# omh is read without an index column, so any text column (for example
# a date stored as strings) is still part of the frame. Since pandas 2.0
# mean() raises TypeError on such columns instead of silently skipping
# them, so restrict the calculation to numeric columns explicitly.
omh.mean(numeric_only=True)

In [None]:
# Mean of the numeric values in each row, showing the first five rows.
# numeric_only=True keeps any text column in omh out of the row-wise
# calculation; pandas 2.0+ raises TypeError on such columns otherwise.
omh.mean(axis=1, numeric_only=True)[:5]

# Median

In [None]:
# Median of the values in each numeric column.
# numeric_only=True skips any text column in omh; pandas 2.0+ raises
# TypeError on such columns instead of dropping them silently.
omh.median(numeric_only=True)

# Mode

In [None]:
# The mode is the most frequently occurring value in the Series.
s = pd.Series(data=[1, 2, 3, 3, 5])
s.mode(dropna=True)

In [None]:
# When several values tie for most frequent, mode() returns all of them.
s = pd.Series(data=[1, 2, 3, 3, 5, 1])
s.mode(dropna=True)

# Variance

In [None]:
# Variance of the values in each numeric column.
# numeric_only=True skips any text column in omh; pandas 2.0+ raises
# TypeError on such columns instead of dropping them silently.
omh.var(numeric_only=True)

# Standard Deviation

In [None]:
# Standard deviation of the values in each numeric column.
# numeric_only=True skips any text column in omh; pandas 2.0+ raises
# TypeError on such columns instead of dropping them silently.
omh.std(numeric_only=True)

# Covariance

In [None]:
# Covariance between the MSFT and AAPL columns.
omh['MSFT'].cov(omh['AAPL'])

# Correlation

In [None]:
# Correlation between the MSFT and AAPL columns (Pearson, the default).
omh['MSFT'].corr(omh['AAPL'], method='pearson')

# Discretization and quantiling

In [None]:
# Draw 50 random integer ages from 6 up to, but not including, 45,
# using a fixed seed so the draw is reproducible.
np.random.seed(123456)
ages = np.random.randint(low=6, high=45, size=50)
ages

In [None]:
# Cut the ages into ranges and then get descriptive stats.
# The result is a pandas Categorical: a set of labels plus an index
# that describes how the data has been split.
# Bins are closed on the right by default, so the first bin would be
# (6, 12] and an age of exactly 6 would fall outside every bin and come
# back as NaN. include_lowest=True closes the first bin on the left.
ranges = [6, 12, 18, 35, 50]
agebins = pd.cut(ages, ranges, include_lowest=True)
agebins.describe()

In [None]:
# Give each bin a descriptive name instead of the interval notation.
# There is one label per bin, i.e. one fewer than the number of edges.
# include_lowest=True makes the first bin include its left edge, so an
# age of exactly 6 is labelled 'Youth' instead of becoming NaN.
ranges = [6, 12, 18, 35, 50]
labels = ['Youth', 'Young Adult', 'Adult', 'Middle Aged']
agebins = pd.cut(ages, ranges, labels=labels, include_lowest=True)
agebins.describe()