pandas: https://pandas.pydata.org/docs/user_guide/index.html#user-guide

pandas DF: https://pandas.pydata.org/docs/reference/frame.html

In [None]:
import pandas as pd
import numpy as np

<a id="0"></a> <br>
 # Table of Contents  
1. [Options](#options)
1. [Data Types](#datatypes)
    1. [Category](#category)
1. [String](#string)
    1. [Series.str](#series.str)
1. [Dates](#dates)
    1. [pd.to_datetime()](#pd.to_datetime())
    1. [pd.date_range()](#pd.date_range())
    1. [Series.dt](#series.dt)
    1. [Time Delta](#timedelta)
1. [Input/Output](#input/output)
1. [Create DataFrame](#createdataframe)
1. [Index / Columns](#indexcolumns)
    1. [MultiIndex](#multiindex)
1. [Mutability](#mutability)
1. [Data Overview](#dataoverview)
1. [Math/Stats](#mathstats)
1. [Sorting](#sorting)
1. [Merging](#merging)
1. [Slicing / Filtering](#slicingfiltering)
    1. [General](#slicinggeneral)
    1. [query()](#query)
1. [Transformation](#transform)
    1. [Basic](#basic)
    1. [Handling missing values](#missingvalues)
    1. [apply()](#apply)
    1. [pipe()](#pipe)
    1. [transform()](#ttransform)
    1. [assign()](#assign)
1. [Reshaping / Aggregating](#reshaping)
    1. [Pivot / Crosstab](#pivotcrosstab)
    1. [groupby](#groupby)
    1. [Advanced groupby](#advancedgroupby)

<a id="options"></a> 
# ---------- Options ----------

[Table of Contents](#0)

In [None]:
## describe all options
pd.describe_option()

In [None]:
## show the current value of an option
pd.options.display.max_rows
# .max_rows(None) .max_columns(None) .precision(6)

In [None]:
## set an option
pd.set_option("display.max_rows", 999)

<a id="datatypes"></a>
# ---------- Data Types ----------

[Table of Contents](#0)

In [None]:
## change type for entire dataframe
df.astype(int)

## change type for particular columns
df.astype({'colname1': 'float32', 'colname2': 'str'})

## convert to numeric
pd.to_numeric(series, errors='coerce')
# errors: 'ignore', 'raise', 'coerce'

## convert to numpy array / dataframe
series.to_numpy() 
series.to_frame(name)

In [None]:
## .astype('string') changes np.nan to pd.NA
pd.DataFrame([[1.0, 1, pd.NA, np.nan]]).astype('string')

<a id="category"></a>
## Category

In [None]:
## convert values to 'category' type
series.astype('category')

In [None]:
## rename categories
## a list, or a dict, or a callable
series.cat.rename_categories(['excellent', 'good', 'bad'])

## set order of categories
series.cat.set_categories(['bad', 'good', 'great'], ordered=True)

## reorder
series.cat.reorder_categories(['great', 'good', 'bad'], ordered=True)

In [None]:
# .remove_unused_categories()
# .remove_categories(['cat1', 'cat2'])
# .as_ordered()
# .as_unordered()

<a id="string"></a> 
# ---------- String ----------
[Table of Contents](#0)

<a id="series.str"></a> 
## Series.str

In [None]:
## retrieve i-th character
series.str[i]

## contains
series.str.contains("abc")

## replace pattern
series.str.replace(pat=' ', repl='-', regex=False)

## split string by pattern, new column for each item (up to n columns)
series.str.split(pat='', expand=True, n=-1)

## extract named groups, creating one column per group
string_series.str.extract(r"(?P<letter>[ab])(?P<digit>\d)")

## concatenate series into one string value
series.str.cat(sep=',', na_rep='-')

## concatenate with another equal-length series
series.str.cat(series2, sep='_', na_rep='')

## concatenate with an equal-length dataframe, joining using index
series.str.cat(df, sep='_', na_rep='', join='outer')

# .lower() .upper() .len() .strip() .lstrip() .rstrip() 
# .repeat() .pad()
# .isalpha() .isdigit() .islower()

<a id="dates"></a>
# ---------- Dates ----------

[Table of Contents](#0)

<a id="pd.to_datetime()"></a> 
## pd.to_datetime()

In [None]:
## returns Timestamp object for 'now'
pd.to_datetime('today')

In [None]:
## format list of string dates as DatetimeIndex, each item a Timestamp object
pd.to_datetime(['26-03-1997', '20-04-1998'], format="%d-%m-%Y")

# %a - abbreviated weekday name | %A - full weekday name
# %d - day of month zero padded
# %b - abbreviated month name | %B - full month name
# %m - month zero padded
# %y - year without century zero padded | %Y - year with century

<a id="pd.date_range()"></a> 
## pd.date_range()

In [None]:
## create a range of dates, at fixed intervals
## specify start, period, freq
## each item a Timestamp object
pd.date_range(start='2020-01-01', periods=4, freq='M')

## specify start, end, freq
pd.date_range(start='2020-01-01', end='2020-12-01', freq='MS')
# freq: "4H", "6H", "D", "W-MON", "W-SUN", "MS", "M", "QS", "Q", "A-JAN", "A-DEC"

<a id="series.dt"></a> 
## Series.dt

In [None]:
## get component of datetime
series.dt.month_name()
# second, minute, hour,
# day, dayofweek, dayofyear, day_name(), week, weekofyear, month, month_name(), quarter, year

## maps to period
series.dt.to_period('Q')
# freq: "4H", "6H", "D", "W-MON", "W-SUN", "MS", "M", "QS", "Q", "A-JAN", "A-DEC"

## change date format
series.dt.strftime('%d/%m/%Y')

## make series time zone aware
series.dt.tz_localize('EST')

## change time zone aware series to another time zone
series.dt.tz_convert('US/Pacific')

<a id="timedelta"></a>
## Time Delta

In [None]:
## difference between two Timestamps
## returns Timedelta object
time_delta = pd.Timestamp("2020/03/24 00:18:48") - pd.to_datetime("2019/03/04 00:12:23")

In [None]:
## extract time component from Timedelta
time_delta.days 
# microseconds, seconds, total_seconds()

In [None]:
## subtract Timestamp by Timedelta
## absolute time arithmetic
pd.to_datetime("2020/03/24 00:18:48") - pd.Timedelta(days=9, weeks=1)
# kwargs: weeks, days, hours, minutes,
#         seconds, milliseconds, microseconds, nanoseconds
# (no years/months: their length varies, use pd.DateOffset for those)

In [None]:
## subtract Timestamp by DateOffset
## respects calendar arithmetic
pd.to_datetime("2020/03/24 00:18:48") - pd.DateOffset(days=9, weeks=1)
# kwargs: years, months, weeks, days, hours, minutes
#         seconds, milliseconds, microseconds, nanoseconds

<a id="input/output"></a>
# ---------- Input / Output ----------

[Table of Contents](#0)

In [None]:
path = "C:/Users/mushj/Desktop/WORK/"

## read table (general delimited text; sep defaults to tab)
data = pd.read_table(path + "filename.csv",
                     header=3, sep=',', nrows=None)
# nrows=None (read all rows); otherwise an integer >= 0, a negative value raises ValueError

## .xlsx files
data = pd.read_excel(path + "filename.xlsx",
                     header=3, sheet_name='sheet_name or #')
# sheet_name=None (read all sheets)

## .csv files
data = pd.read_csv(path + "filename.csv",
                   sep=';')
# for whitespace-delimited files use sep=r'\s+'

## writing
data.to_csv("filename.csv", index=False)
data.to_excel("filename.xlsx")

## writing to a SQL table: needs a table name AND a connection
## (sqlite3 connection or SQLAlchemy engine/connection)
import sqlite3
con = sqlite3.connect("filename.db")
data.to_sql("table_name", con=con)
con.close()

In [None]:
## writing to multiple sheets in Excel workbook
with pd.ExcelWriter("filename.xlsx",
                    date_format="YYYY-MM-DD",
                    mode="a", # 'w', 'a'
                    engine="openpyxl",
                    if_sheet_exists="replace" # ‘error’, ‘new’, ‘replace’, ‘overlay’
                   ) as writer:
    df1.to_excel(writer, sheet_name="Sheet1")
    df2.to_excel(writer, sheet_name="Sheet2") 

<a id="createdataframe"></a>
# ---------- Create DataFrame ----------
    
[Table of Contents](#0)

In [None]:
## from dictionary
pd.DataFrame({'col1': [10,20], 'col2': ['a','b']}, index=[1,2])

## from nested list
pd.DataFrame([[1,2,3], ['a','b','c']], index=[1,2], columns=['col1', 'col2', 'col3'])

## from list of dictionaries
pd.DataFrame([{'col1': 1, 'col2': 2}, {'col1': 1, 'col3': 3}], index=[1,2])

<a id="indexcolumns"></a>
# ---------- Index / Columns ----------
[Table of Contents](#0)

In [None]:
## rename indexes/columns
df.columns = ['newcolname1', 'newcolname2', 'newcolname3']
df.index = [1,2,3,4,5,6,7]

## rename selected indexes/columns
df.rename(columns={'oldcolname1': 'newcolname1'},
         index={'oldrowname': 'newrowname'})

## union indexes: takes the union, then sorts
pd.Index([3,2,1,0]).union(pd.Index([2,3,4,5]))

## reset index
df.reset_index(drop=True)
# drop=True -> don't keep index as column

## set a certain variable as index
df.set_index(keys='varname', drop=True)

## label axis names with list object
df.set_axis(labels=['a', 'b', 'c'], axis=1)
# axis=0 (row names), axis=1 (colnames)

## name series before concatenating to dataframe
pd.Series(array, name='new_colname')

In [None]:
## convert single/multi index to series
index.to_series(name="name", index=[1,2,3])

## repeat index in order
index.repeat(repeats=10)

<a id="multiindex"></a>
## MultiIndex

In [None]:
## the following 3 return the same MultiIndex

## from arrays
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], ['1', '2', '1', '2']],
                          names=["level1", "level2"])

## from tuples
pd.MultiIndex.from_tuples([('a', '1'), ('a', '2'), ('b', '1'), ('b', '2')],
                          names=["level1", "level2"])

## cartesian product
pd.MultiIndex.from_product([['a', 'b'], ['1', '2']],
                           names=["level1", "level2"])

In [None]:
## slice multi-index (must specify value at all levels)
df.loc[('a', '1')]

## slicing based on a subset of levels
df.xs(('value1', 'value2'), level=['level1', 'level3'])

In [None]:
## get level values
## specify integer or level name
index.get_level_values(level=0)

## flatten index
index.to_flat_index()

## drop a level of index from dataframe
## specify integer or level name
df.droplevel(level)

<a id="mutability"></a>
# ---------- Mutability ----------

[Table of Contents](#0)

In [None]:
## changing cell (0,2) will not change all values in column 2
## although list multiplication created 10 references to the same list
df1 = pd.DataFrame([[0,1,2]]*10, columns=['a','b','c'])
df1.iloc[0,2] = 2000

## changes the original df, unless copied
def func(df):
    """Write 1000 at label (5, 0) of a private deep copy of ``df``.

    The assignment targets the copy, so the caller's frame is left
    untouched. Nothing is returned.
    """
    working_copy = df.copy(deep=True)
    working_copy.loc[5, 0] = 1000
    
## df can be modified through df2
## i.e. a view is created using .iloc, .loc, or df["col"]
df2 = df1.iloc[4:8,0:2].T
df2.iloc[1,2] = 1000

## a copy is created
df2 = df1.loc[df1.index.repeat(3)]

<a id="dataoverview"></a>
# ---------- Data Overview ----------
[Table of Contents](#0)

In [None]:
df.shape
df.columns.values
df.index.values
df.dtypes
df.head(n=5)
df.tail(n=5)
df.info()

## use .equals to compare df with np.nan, because np.nan != np.nan
(df * 2).equals(df + df)

## descriptive overview with custom percentiles
df.describe(percentiles=[0.05, 0.1, 0.25], include=['float64'])

## get unique values / counts of unique values
df|series.nunique()
series.unique()

## check duplicates
df.duplicated(subset=['colname1', 'colname2'], keep=False) 
# keep: first, last, False

## count rows for each unique combination of values in the given columns
df.value_counts(["colname1", "colname2", "colname3"],
                normalize=False, sort=True, ascending=False)
# normalize=True returns proportions instead of counts

## print value counts for all object variables
for col in df.columns.values:
    if df[col].dtypes == 'object':
        print(df.value_counts(col))
        print('\n')

In [None]:
## check NaNs
df.isnull().sum(axis=0)
df.notnull().sum(axis=0)
df.isna().sum(axis=0)
df.notna().sum(axis=0)

<a id="mathstats"></a>
# ---------- Math/Stat Operations ----------
[Table of Contents](#0)

In [None]:
## get index of min/max
df.idxmax(axis=0)
# .idxmin()

## --- descriptive statistics
# Dataframe & Series:
# .count() .sum() .mean() .mad() .median() .min() .max() .mode()
# .abs() .prod() .std() .var() .sem() .skew() .kurt() .quantile(0.75)
# .cumsum() .cumprod() .cummin() .cummax()
# .pct_change(periods=1)

## covariance / correlation
series1.cov(series2)
df.cov()
df.corr(method='pearson')
# method: 'pearson' (default), 'kendall', 'spearman', or a callable

## standard error of mean
df.sem(axis=0, numeric_only=True, skipna=True)

## autocorrelation
series.autocorr(lag=5)

## --- arithmetic
# .add() .sub() .mul() .div() .floordiv() .mod() .pow()
df.sub([1, 2, 3], axis=1, fill_value=None) # broadcast rowwise

## rolling window
series.rolling(window=4, center=False, closed=None).mean()
# closed: left, right, both, neither

## rolling with custom function: e.g. Mean Absolute Deviation from mean within window
series.rolling(window=4).apply(lambda x: np.abs(x - x.mean()).mean())

In [None]:
## cut by specifying bins: [0,18], (18,40],...
## np.inf leaves the last bin open-ended (the np.infty alias was removed in NumPy 2.0)
pd.cut(x=series,
       bins=[0, 18, 40, 60, np.inf],
       include_lowest=True,
       right=True,
       labels=["A", "B", "C", "D"])

## cut by specifying percentile bounds
## or number of percentiles: q=10 -> deciles
pd.qcut(x=series, q=[0.25, 0.5, 0.75, 1], labels=['A', 'B', 'C'])

<a id="sorting"></a>
# ---------- Sorting ----------
[Table of Contents](#0)

In [None]:
## reorder dataframe based on new index/column order
df.reindex([4, 2, 1, 3], axis=0)
df.reindex(index=[4, 2, 1, 3], columns=['C', 'A', 'B'])

## if there are new indexes, fill with
## ffill (forward), bfill (backward), nearest
df.reindex(index=[1,2,3,4,5], method='ffill')

## sort data by values of a column(s)
df.sort_values(['colname1', 'colname2'], ascending=False, axis=0)

## sort data by a level of multiindex
df.sort_index(level=[1,0], ascending=False)


## create a column ranking values
df.rank(method="dense", ascending=True, na_option="keep", pct=False, axis=0)
# method: 'average', 'min', 'max', 'first', 'dense'
# na_option: 'keep', 'top', 'bottom'

<a id="merging"></a>
# ---------- Merging ----------
[Table of Contents](#0)

In [None]:
## concatenate two dataframes / series row-wise or column-wise
## based on matching index
pd.concat(objs=[df1, df2, df3], axis=1, ignore_index=True, join='outer')

## joining
pd.merge(left=left_df, right=right_df, 
         left_index=True, right_on='join_column', 
         how='left/right/outer/inner/cross',
         suffixes=['_left', '_right'])

## joining from map
dct = {'Male': 1, 'Female': 2}
df['Gender'].map(dct)

## patch NaNs in df1 with values in df2 (df1, df2 like-indexed)
df1.combine_first(df2)

## overwrite values in df1 with the non-NaN values of df2, in place; returns None (df1, df2 like-indexed)
df1.update(df2)

<a id="slicingfiltering"></a>
# ---------- Slicing / Filtering ----------
[Table of Contents](#0)

<a id="slicinggeneral"></a>
## General

In [None]:
## select columns of certain dtypes
df.select_dtypes(include=['int64'], exclude=['object'])
# dtypes: int64, float64, bool, datetime64, timedelta64, object, category

## return the n rows with the largest values in the given columns
df.nlargest(n, ["col1", "col2"])
# .nsmallest

## label-based: search by row/col labels
df.loc[['2017-01-03', '2017-01-05'], ['colname1', 'colname2']]

## search by index
df.iloc[0:10, 0:2]

## get position of column
df.columns.get_loc('colname')

## random sample: specify <n> or <frac>
df.sample(n=3, weights='weights_column', replace=False, axis=0, random_state=42)

## randomize row order
## (no inplace=True here: on the temporary sample it returns None and the shuffle is lost)
df.sample(frac=1).reset_index(drop=True)

## filter data where values of colname is in a list
df[df['colname'].isin(['A', 'B', 'C'])]

## filter by multiple conditions on rows and columns
## add '~' to condition as negation, '|' for or
df.loc[(df['colname1'] > 0) & ~(df['colname2'] == 100), ['colname3']]

In [None]:
## iterate through rows
for i,row in df.iterrows():
    print(i, row["col1"], row["col2"])

<a id="query"></a>
## query()

In [None]:
## reference a variable
df.query("col_name >= @var")

## multiple filters: '&' / 'and' to combine, '|' / 'or' for alternatives
df.query("col_name1 == 'abc' & col_name2 == 'xyz'")

## column name with space
df.query("`col name` < 1")

## operations
df.query("col_name != 1 + 2")

## reference other columns
df.query("col_name1 > (col_name2 + col_name3) / 2")

## .isin()
df.query("`col name`.isin([1,2,3])")

<a id="transform"></a>
# ---------- Transform ----------
[Table of Contents](#0)

<a id="basic"></a>
## Basic

In [None]:
## create new column based on existing columns
df["new colname"] = df["colname1"] + df["colname2"]

## replace multiple columns at once
df[["A_new", "B_new", "C_new"]] = df[["A", "B", "C"]].values * df["D"].values.reshape(-1,1)

## get dummies
pd.get_dummies(df|series, prefix="prefix", prefix_sep="_", drop_first=False)

## shifting
df|series.shift(-1) # shift backward [a,b,c] -> [b,c,nan]
df|series.shift(1) # shift forward [a,b,c] -> [nan,a,b]

## difference between each element and a shifted neighbour
df|series.diff(-1) # e_i = x_i - x_(i+1); last element is NaN
df|series.diff(1) # e_i = x_i - x_(i-1); first element is NaN

## drop columns based on name
df.drop(columns=["colname1", "colname2"], axis=1)

## drop duplicated rows
df.drop_duplicates(subset=["colname1", "colname2"], keep="first")
# keep = "first", "last", False

## replace values
df.replace(to_replace=["yes", "no"], value=[1, 0])

## replace with dictionary
df.replace({'oldvalue1': 'newvalue1', 'oldvalue2': 'newvalue2'})

## replace values with regex
df.column.replace(to_replace='^(.)|(.)$', value='_', regex=True)

## returns dataframe of same shape as original, values outside of condition become NaN
df.where(df > 0)

## replace those values with 'other'
df.where(df > 0, other=-df)

<a id="missingvalues"></a>
## Handling missing values

In [None]:
## drop rows with NaN in the given columns; subset=None checks all columns
df.dropna(subset=["col1", "col2"], axis=0)

## fill na
df.fillna(value=0) # fill all NaN with value
df.fillna(value={'col1': 0, 'col2': 100}) # fill with dictionary (keys must match column names)
df.fillna(value=df.mean()[['col1', 'col2']]) # fill with series (index = column names, mapping to values)

## propagate neighbouring values
## (fillna(method=...) is deprecated; calling fillna with neither value nor method raises ValueError)
df.bfill() # backward fill: use the next valid value
df.ffill() # forward fill: use the last valid value

## interpolation of NaN values
df.interpolate(method='linear', axis=0)
# 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline', 'polynomial'
# axis=0: across rows

<a id="apply"></a>
## apply()

In [None]:
## apply function to each element in data frame
df.applymap(func=lambda x: x**2)

## apply function
## reference two columns by index
df[["col1", "col2"]].apply(lambda x: x.iloc[1] if pd.isna(x.iloc[0]) else x.iloc[1]/2, axis=1)
# axis=1: apply function to each row (across columns)

## reference columns by name
df.apply(lambda x: x["col1"] if pd.isna(x["col2"]) else x["col1"]/2, axis=1)

<a id="pipe"></a>
## pipe()

In [None]:
## chaining functions with arguments
## each function must take in and return a dataframe
def step_1(df, na_subset):
    """Return ``df`` without the rows whose ``na_subset`` column is missing.

    ``na_subset`` is a single column label; the input frame is not modified.
    """
    required_columns = [na_subset]
    cleaned = df.dropna(subset=required_columns, axis=0)
    return cleaned

def step_2(df, operation='add', constants=None):
    """Apply a column-wise arithmetic operation and return the new frame.

    Parameters
    ----------
    df : DataFrame to transform (not modified).
    operation : one of 'add', 'sub', 'mul', 'div'.
    constants : one value per column, matched by position;
        defaults to [0, 0, 0].

    Returns
    -------
    A new DataFrame, so the function can always be chained with .pipe().

    Raises
    ------
    ValueError if ``operation`` is not supported (previously the function
    silently returned None, which broke the rest of the chain).
    """
    if constants is None:
        # built per call so the default is never a shared mutable list
        constants = [0, 0, 0]

    supported = ('add', 'sub', 'mul', 'div')
    if operation not in supported:
        raise ValueError(
            f"unsupported operation {operation!r}; expected one of {supported}"
        )

    # DataFrame.add / .sub / .mul / .div share the same signature
    return getattr(df, operation)(constants, axis=1)

(df.pipe(step_1, na_subset='region A')
   .pipe(step_2, operation='add', constants=[10, 20, 30])
)

<a id="ttransform"></a>
## transform()

In [None]:
## selective transformation
## returns a df with a column for each transformation
df.transform({'colname1': np.abs,
              'colname2': lambda x: x + 1})

## apply multiple functions to one column
## returns a df with a column for each transformation
df['colname'].transform([np.abs, lambda x: x + 1])

## apply multiple functions to all columns
df.transform([np.abs, lambda x: x + 1])

In [None]:
## broadcast a per-group result back onto the original rows (same index as df)
## groupby() needs grouping keys, and transform needs a function or function name
df.groupby("group_var")["colname"].transform("mean")

<a id="assign"></a>
## assign()

In [None]:
## create two new columns (column names as arguments)
df.assign(total=lambda x: x.sum(axis=1),
          total_times_10=lambda x: x['total'] * 10)

## create two new columns with dictionary
df.assign(**{"var": lambda x: x, "var2": lambda x: x})

## Timeseries

In [None]:
## resample a time series (df needs a DatetimeIndex, or pass on="date_col")
## a rule (offset alias such as "D", "W") is required, followed by an aggregation
df.resample("W").mean()

<a id="reshaping"></a>
# ---------- Reshaping / Aggregating ----------
[Table of Contents](#0)

In [None]:
## unpivot all columns, creating a new level of index
df.stack(level=-1)

## reverses stack: pivots a level of index as columns
df.unstack(level=-1)

## explodes a column of arrays into rows (each item taking a row),
## repeating values of other columns
df.explode(column='colname')

## unpivot
df.melt(id_vars=["col1", "col2"], value_vars=["col3", "col4"],
        var_name="variable", value_name="value")

<a id="pivotcrosstab"></a>
## pivot / crosstab

In [None]:
## pivot: without aggregation
## creates a cartesian product space, with NaNs if no value
df.pivot(index=["col1", "col2"], columns=["col3", "col4"], values=["col5", "col6"])

## pivot: with aggregation
df.pivot_table(index="colname1", columns="colname2", values="colname3",
               aggfunc='mean', margins=True)

## return cross-tabulated counts/percentages
## creates a cartesian product space
pd.crosstab(series1, series2, normalize=True)

## with aggfunc
pd.crosstab(df["colname1"], df["colname2"],
            margins=True, values=df["colname3"], 
            aggfunc=lambda x: np.mean(x))

<a id="groupby"></a>
## group by

In [None]:
## group
grouped = df.groupby(['colname1', 'colname2'], as_index=False)
# as_index=True makes groups into indexes

## retrieve groups
grouped.groups.keys() # get all group names (keys)
grouped.get_group(('value1', 'value2')) # grouped by two columns -> key is a tuple, one value per column
[(name, group_data) for (name, group_data) in grouped] # unpack names and groups

## compute aggregations
df.groupby("group_var")['colname'].mean()
df.groupby("group_var")['colname'].agg(['mean', 'std'])
df.groupby("group_var")["colname"].apply(list) # collect list
df.groupby("group_var")["colname"].apply(set) # collect set

## group-by with .agg()
(df.groupby("group_var")
   .agg({'colname1': ['first', 'last'],
         'colname2': ['sum', 'mean', 'count'],
         'colname3': lambda x: x.max() - x.min()  # x refers to all rows in each group
        })
)

## with names
(df.groupby("group_var")
   .agg(col_sum = ('colname1', 'sum'),
        col_range = ('colname2', lambda x: x.max() - x.min())
       )
)

## custom named function
custom_function = lambda x: x
custom_function.__name__ = 'func_name'
df.agg([custom_function])


## group by custom index values
df.groupby(lambda x: x.year).sum()

## group by other values
df.groupby(df2["colname"])

<a id="advancedgroupby"></a>
## Advanced group by

In [None]:
## rank within groups
df.groupby("group_var").rank()

## apply transformation within each group
df.groupby("group_var").transform(lambda x: x - x.mean())

## rolling window
df.groupby("group_var").rolling(window=3)['colname'].mean()