pandas: https://pandas.pydata.org/docs/user_guide/index.html#user-guide

pandas DF: https://pandas.pydata.org/docs/reference/frame.html

In [None]:
import pandas as pd
import numpy as np

<a id="0"></a> <br>
 # Table of Contents  
1. [Options](#options)
    1. A
    1. B
1. [String](#string)
1. [Dates](#dates)
1.

<a id="options"></a> 
# ---------- Options ----------

[Table of Contents](#0)

In [None]:
## describe all options
# called without an argument, so nothing is filtered out
pd.describe_option()

In [None]:
## show the current value of an option
# other display options: display.max_columns, display.precision (default 6)
# max_rows / max_columns may be set to None to remove the limit
pd.get_option("display.max_rows")

In [None]:
## set an option
# attribute form of the same setting
pd.options.display.max_rows = 999

<a id="string"></a> 
# ---------- String ----------

[Table of Contents](#0)

## Series.str

In [None]:
# basic element-wise string methods, reached through the .str accessor
# .lower() .upper() .len() .strip() .lstrip() .rstrip() .repeat() .pad()
# .isalpha() .isdigit() .islower() .replace()
# NOTE(review): string_series is not defined in the visible cells -- supply a Series of strings
string_series.str.strip()
# regex=False -> pat is matched as a literal string, not as a regular expression
string_series.str.replace(pat=' ', repl='-', regex=False)

In [None]:
# .split(' ') .split().str[1] .split('', expand=True, n=2)
# expand=True -> one column per piece; n caps the number of splits
# NOTE(review): the separator here is the empty string '' -- confirm ' ' was not intended
string_series.str.split('', expand=True, n=3)

In [None]:
# extract named groups -> one column per group
# (?P<name>...) names a capture group; the group name is used as the column label
string_series.str.extract(r"(?P<letter>[ab])(?P<digit>\d)")

# concatenate series into one string
# sep goes between elements; na_rep stands in for missing values
string_series.str.cat(sep=',', na_rep='-')

In [None]:
# concatenate element-wise with a list-like
# assumes string_series has 4 elements to match the list -- TODO confirm
string_series.str.cat(['a','b','c','d'], sep='_', na_rep='')

In [None]:
# concatenate with a dataframe of strings
# rows are matched on index labels, not on position
new = pd.DataFrame([['a','b','c','d'], [1,2,3,4]], dtype='string').T
# set_axis returns a new object; its inplace= argument was removed in pandas 2.0,
# so the result is assigned back
new = new.set_axis([1,2,3,4], axis=0)
# join='outer' keeps the union of both indexes; na_rep stands in for missing values
string_series.str.cat(new, sep='_', na_rep='', join='outer')

<a id="dates"></a>
# ---------- Dates ----------

## pd.to_datetime()

In [None]:
# the special string 'today' yields a Timestamp for the current moment
pd.to_datetime('today')

In [None]:
# directives understood by `format`:
#   %a  abbreviated weekday name     %A  full weekday name
#   %d  day of month, zero padded
#   %b  abbreviated month name       %B  full month name
#   %m  month, zero padded
#   %y  year without century, zero padded     %Y  year with century
datetime_index = pd.to_datetime(
    ['26-03-1997', '20-04-1998'],
    format="%d-%m-%Y",
)
datetime_index

## pd.date_range()

In [None]:
# freq takes an alias string or an offset object: 6-hourly, 'D' (daily), month end, ...
# The month-end alias is spelled 'M' before pandas 2.2 and 'ME' from 2.2 on
# (likewise '6H' -> '6h'); the offset object means the same on every version.
datetime_index = pd.date_range(start='2020-01-01', periods=4, freq=pd.offsets.MonthEnd())
datetime_index

## Series.dt

In [None]:
# turn the DatetimeIndex from the previous cell into a Series with a fresh integer index
dt_series = datetime_index.to_series().reset_index(drop=True)

# components available through the .dt accessor:
# second, minute, hour,
# day, dayofweek, dayofyear, day_name(), week, weekofyear, month, month_name(), quarter, year
# NOTE(review): .dt.week / .dt.weekofyear were removed in pandas 2.0 -- use .dt.isocalendar().week
# each result is renamed so the concat below gets a readable column header
A = dt_series.dt.month_name().rename("month_name()")

# period frequencies: D, M, Q, Y
B = dt_series.dt.to_period('Q').rename("to_period('Q')")

# format each timestamp as a string
C = dt_series.dt.strftime('%d/%m/%Y').rename("strftime('%d/%m/%Y')")

# attach a timezone to naive timestamps (wall-clock time is kept)
D = dt_series.dt.tz_localize('EST').rename("tz_localize('EST')")

# tz_convert needs tz-aware values, hence the tz_localize first
E = dt_series.dt.tz_localize('EST').dt.tz_convert('US/Pacific').rename("tz_convert('US/Pacific')")

# side-by-side comparison, one column per transformation
pd.concat([dt_series, A, B, C, D, E], axis=1)

## Time Delta

In [None]:
# elapsed time between two timestamps: later minus earlier gives a Timedelta
time_delta = pd.Timestamp("2020/03/24 00:18:48") - pd.Timestamp("2019/03/04 00:12:23")
time_delta

In [None]:
# extract time component from difference
# also: .seconds, .microseconds, .total_seconds()
time_delta.days

In [None]:
# subtract a duration from a date
# Timedelta keywords: weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds
# keywords add up: days=9 plus weeks=1 is 16 days
pd.to_datetime("2020/03/24 00:18:48") - pd.Timedelta(days=9, weeks=1)

# ---------- Create Data

## DataFrame

In [None]:
# from dictionary: keys become column names, values become the column data
pd.DataFrame({'col1': [10,20], 'col2': ['a','b']}, index=[1,2])

# from nested list: each inner list is one row
pd.DataFrame([[1,2,3], ['a','b','c']], index=[1,2], columns=['col1', 'col2', 'col3'])

# from list of dictionaries: each dict is one row; keys a row lacks are filled with NaN
pd.DataFrame([{'col1': 1, 'col2': 2}, {'col1': 1, 'col3': 3}], index=[1,2])

## MultiIndex

In [None]:
# tuples: one tuple per index entry, one element per level
pd.MultiIndex.from_tuples([('a', '1'), ('a', '2'), ('b', '1'), ('b', '2')],
                          names=["letter", "number"])

# cartesian product: every combination of the level values (3 x 2 = 6 entries)
pd.MultiIndex.from_product([['a', 'b', 'c'], ['1', '2']],
                           names=["letter", "number"])

# slice multi-index: a tuple gives one label per level
# NOTE(review): df is not defined at this point in the visible cells -- presumably a frame indexed as above
df.loc[('a', '1')]

## Categorical

### Series.cat

In [None]:
# build a one-column frame, then convert the column in place to categorical dtype
df = pd.DataFrame({'level': ['b', 'c', 'a']}) # column as string
df['level'] = df['level'].astype('category') # convert column to category
df['level']

In [None]:
# Categories inferred by astype('category') are sorted (a, b, c), and a list passed to
# rename_categories is applied positionally: a -> excellent, b -> good, c -> bad.
# Assigning to .cat.categories and the inplace= argument were both removed in pandas 2.0;
# every .cat method returns a new Series, so each result is assigned back.
df['level'] = df['level'].cat.rename_categories(['excellent', 'good', 'bad']) # reassign categories
df['level'] = df['level'].cat.rename_categories({'excellent': 'great'}) # selective rename
df['level'] = df['level'].cat.set_categories(['bad', 'good', 'great'], ordered=True) # specify category order
df['level']

In [None]:
# reorder existing categories; the list must contain exactly the current categories
# returns a new Series -- assign it back to keep the new order
df['level'].cat.reorder_categories(['great', 'good', 'bad'], ordered=True)

In [None]:
# .remove_unused_categories()
# .remove_categories(['cat1', 'cat2'])
# .as_ordered()
# .as_unordered()

# ---------- Input / Output

In [None]:
import sqlite3

path = ("C:/Users/mushj/Desktop/WORK/"
        "Master of Management Analytics/"
        "RSM8502H - Data-Based Management Decisions/"
        "Group Assignment/")

# read table (delimited text file)
# header=3 -> the 4th line holds the column names
# nrows=None -> read every row; nrows must be None or a non-negative int (-1 is rejected)
data = pd.read_table(path + "bank-additional-full-1.csv",
                     header=3, sep=',', nrows=None)

# .xlsx files
# sheet_name: a sheet label (str) or a 0-based sheet position (int)
data = pd.read_excel(path + "bank-additional-full-1.xlsx",
                     header=3, sheet_name='sheet_name or #')

# csv with a non-default delimiter
# (delim_whitespace=False was the default and is deprecated since pandas 2.2, so it is omitted)
data = pd.read_csv(path + "bank-additional-full-1.csv",
                   sep=';')

# writing
data.to_csv("filename.csv", index=False)
data.to_excel("filename.xlsx")

# to_sql writes a table through a database connection (required `con` argument),
# not to a file path; the first argument is the table name
con = sqlite3.connect("filename.db")
try:
    data.to_sql("filename", con=con)
finally:
    con.close()

# ---------- Data Overview

In [None]:
# structure of the frame
data.shape
data.columns.values
data.index.values
data.dtypes
data.head(n=5)
data.tail(n=5)
data.info()

# use .equals to compare df with np.nan, as np.nan != np.nan
(data * 2).equals(data + data)

# descriptive overview with custom percentiles
data.describe(percentiles=[0.05, 0.1, 0.25], include=['float64'])

# number of distinct values / the distinct values of a column
data['colname'].nunique()
data['colname'].unique()

# flag duplicated rows
# keep: 'first', 'last', or False (False flags every member of a duplicate set)
data.duplicated(subset=['colname1', 'colname2'], keep=False)

# count value by group
# add .to_frame(name='colname') to format as DataFrame
data.value_counts(["colname1", "colname2", "colname3"],
                  normalize=False, sort=True, ascending=False)

# print value counts for all object variables
for col in data.columns.values:
    if data[col].dtypes == 'object':
        print(data.value_counts(col))
        print('\n')

# check NaNs
# axis=0 -> sum across rows, for each column
# isnull/isna are aliases, as are notnull/notna
data.isnull().sum(axis=0)
data.notnull().sum(axis=0)
data.isna().sum(axis=0)
data.notna().sum(axis=0)

# ---------- Math/Stat Operations

In [None]:
# --- descriptive statistics
# Dataframe & Series:
# .count() .sum() .mean() .mad() .median() .min() .max() .mode()
# .abs() .prod() .std() .var() .sem() .skew() .kurt() .quantile(0.75)
# .cumsum() .cumprod() .cummin() .cummax()
# .pct_change(periods=1)
# data['colname1'].cov(data['colname2'])  data.cov()  | .corr(method='')
# NOTE(review): .mad() was removed in pandas 2.0 -- the rolling example below shows the manual form

# standard error of mean
# axis=0 -> one value per column; numeric_only skips non-numeric columns; skipna ignores NaN
data.sem(axis=0, numeric_only=True, skipna=True)

# Series:
# .autocorr(lag=1)
# correlation of the series with itself shifted by `lag` rows
data['colname'].autocorr(lag=5)

# --- arithmetic
# .add() .sub() .mul() .div() .floordiv() .mod() .pow()
# axis=1 -> the list is matched against the columns, so every row gets the same offsets
data.sub([1, 2, 3], axis=1, fill_value=None) # broadcast rowwise

# Dataframe & Series
# closed: left, right, both, neither
# window=4 -> each result uses the current row and the 3 before it (center=False)
data.rolling(window=4, center=False, closed=None).mean()
# rolling with custom function: Mean Absolute Deviation from mean within window
data.rolling(window=4).apply(lambda x: np.abs(x - x.mean()).mean())

# get index of min/max
# .idxmin()
# axis=0 -> for each column, the row label where the maximum occurs
data.idxmax(axis=0)

In [None]:
# correlation matrix: get corrs > 0.7
# restrict to numeric/bool columns first: corr() raises on non-numeric data in pandas >= 2.0
cor_mat = data.select_dtypes(include=['number', 'bool']).corr()
# keep only the strict upper triangle: this removes the diagonal (self-correlation)
# and the mirrored lower half by position, so two different pairs that happen to
# share the same correlation value are both kept (value-based de-duplication lost one)
upper_triangle = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
pairs = cor_mat.where(upper_triangle).stack().dropna().sort_values(ascending=False)
pairs_cleaned = pairs
pairs_cleaned[pairs_cleaned > 0.7]

# ---------- Label

In [None]:
# rename columns: the list must have one name per existing column
data.columns = ['newcolname1', 'newcolname2', 'newcolname3']

# rename selected columns (and rows); labels not in the mapping are left alone
# returns a new frame -- assign the result to keep it
data.rename(columns={'oldcolname1': 'newcolname1', 'oldcolname2': 'newcolname2'},
            index={'oldrowname': 'newrowname'})

# add new index: union of current labels and the new ones; new rows are filled with NaN
new_ind = data.index.union(pd.Index([3,4,5,6]))
data.reindex(new_ind)

# reset index; drop=True -> don't keep index as column
data.reset_index(inplace=True, drop=True)

# set a certain variable as index; drop=True removes it from the columns
data.set_index(keys='varname', drop=True)

# label axis names with list object, axis=0 (row names), axis=1 (colnames)
data.set_axis(labels=['a', 'b', 'c'], axis=1)

# name series before concatenating to dataframe
# new_col is a placeholder for the values
pd.Series(new_col, name='new_colname')

# ---------- Type / Format

In [None]:
# cast every column to one dtype
data.astype(int) # change entire dataset: int, string
# cast selected columns: {column name: dtype}
data.astype({'colname1': 'float32', 'colname2': 'str'})

# x is a placeholder; errors='coerce' turns unparseable values into NaN instead of raising
pd.to_numeric(x, errors='coerce')

# convert a column to another container
data['colname1'].to_numpy() # to_series(), to_frame()

In [None]:
# .astype('string') changes np.nan to pd.NA
# one row holding a float, an int, pd.NA and np.nan, to compare how each is converted
df = pd.DataFrame([[1.0, 1, pd.NA, np.nan]])
df.astype('string')

In [None]:
# print the Python type of the single value in each of the 4 columns after conversion
# the conversion does not depend on the loop variable, so it is done once up front
as_string = df.astype('string')
for i in range(4):
    print(type(as_string.loc[:,i].values[0]))

# ---------- Sort

In [None]:
# reorder dataframe based on new index order
# labels not present in the frame produce NaN rows/columns
data.reindex([4, 2, 1, 3], axis=0)
data.reindex(index=[4, 2, 1, 3], columns=['A', 'B', 'C'])

# if there are new indexes, fill with
# ffill (forward), bfill (backward), nearest
data.reindex(index=[1,2,3,4,5], method='ffill')

# sort data by values of a column(s)
# ascending also accepts one bool per column
data.sort_values(['colname1', 'colname2'], ascending=False, axis=0)

# ---------- Merge

In [None]:
# column join two dataframes / series | axis=0 row join
# ignore_index=True relabels the concatenation axis 0..n-1
pd.concat(objs=[df1, df2, df3], axis=1, ignore_index=True, join='outer')

# conventional merging
# how takes exactly one of: 'left', 'right', 'outer', 'inner', 'cross'
# suffixes are appended to overlapping non-key column names
pd.merge(left=left_df, right=right_df, on='join_column', how='left',
         suffixes=('_left', '_right'))

# joining from map: look each value up in the dict; values missing from it become NaN
dct = {'Male': 1, 'Female': 2}
df['tip_means_by_sex'] = df['sex'].map(dct)

# patch NaNs in df1 with values in df2 (df1, df2 like-indexed); returns a new frame
df1.combine_first(df2)

# replace non NaNs in df1 with values in df2 (df1, df2 like-indexed); modifies df1 in place
df1.update(df2)

In [None]:
# left join example:
# df1 holds one row per key, df2 holds several rows per key
df1 = pd.DataFrame([('a', 1), ('b', 2)], columns=['col1', 'col2'])
df2 = pd.DataFrame(
    list(zip(['a', 'a', 'b', 'b', 'b'], [101, 102, 901, 902, 903])),
    columns=['col1', 'col3'],
)
df1

In [None]:
# display the right-hand table of the join
df2

In [None]:
# every df1 row is repeated once per matching df2 row: 'a' x2 + 'b' x3 = 5 rows
df1.merge(df2, on='col1', how='left')

# ---------- Slice / Filter

In [None]:
# dtypes: int64, float64, bool, datetime64, timedelta64, object, category
data.select_dtypes(include=['int64'], exclude=['object'])

# rows with the 4 largest values of a column
# .nsmallest
data.nlargest(4, 'colname')

# label-based: search by row/col labels
data.loc[['2017-01-03', '2017-01-05'], ['colname1', 'colname2']]

# position-based: search by integer position (end of slice excluded)
data.iloc[0:10, 0:2]

# get iloc index of column
data.columns.get_loc('colname')

# random sample: specify <n> or <frac>
data.sample(n=3, weights='weights_column', replace=False, axis=0, random_state=42)
# randomize row order
# reset_index(inplace=True) on the temporary returned None and discarded the shuffle,
# so the result is assigned back instead
data = data.sample(frac=1).reset_index(drop=True)

# filter data where values of colname is in a list
data[data['colname'].isin(['A', 'B', 'C'])]

# filter data where the values of a column contain a string
data[data['colname'].str.contains('hello')]

# filter by multiple conditions on rows and columns
# add '~' to condition as negation, '|' for or
# both conditions must come from the same frame so the boolean masks align
data.loc[(data['colname1'] > 0) & ~(data['colname2'] == 100), ['colname3']]


# cut by specifying bins: [0,18], (18,40],...
# np.inf is the supported spelling (the np.infty alias was removed in NumPy 2.0)
pd.cut(x=data.age,
       bins=[0, 18, 40, 60, np.inf],
       include_lowest=True,
       right=True,
       labels=['young', 'adult', 'old', 'retired'])

# cut by specifying percentile bounds
# or number of percentiles: q=10 -> deciles
# NOTE: the bounds start at 0.25, so values below the 25th percentile fall
# outside every bin and come back as NaN
pd.qcut(x=data.age, q=[0.25, 0.5, 0.75, 1], labels=['A', 'B', 'C'])

# ---------- Reshape

In [None]:
# level=-1 -> operate on the innermost level
data.stack(level=-1) # compresses columns as inner-most index
data.unstack(level=-1) # reverses stack

# explodes list-like values: one row per list element, other columns repeated
data.explode(column='colname')

## pivot / crosstab

In [None]:
# pivot
# expand multiple rows under 'colname1' as columns; columns times values set of columns
# pure reshape, no aggregation
data.pivot(index='colname1', columns=['colname2'], values=['colname3', 'colname4'])

# pivot table
# values: the column to be aggregated
# margins=True adds row/column totals
data.pivot_table(index="colname1", columns="colname2", values="colname3",
                 aggfunc='mean', margins=True)

# get proportion by each row
# pivot_table and row*_sum are placeholders: divide each row by its own total
pivot_table.div([row1_sum, row2_sum, row3_sum], axis=0)

# return cross-tabulated counts/percentages
# normalize=True -> proportions of the grand total
pd.crosstab(data["colname1"], data["colname2"], normalize=True)
# with aggfunc
# values + aggfunc aggregate a third column instead of counting rows
pd.crosstab(data["colname1"], data["colname2"],
            margins=True, values=data["colname3"], aggfunc=lambda x: np.mean(x))

## group by

In [None]:
# as_index=False keeps the group keys as ordinary columns in aggregated output
# (as_index=True would make them the index)
grouped = data.groupby(['colname1', 'colname2'], as_index=False)

# with two grouping columns a group is addressed by a tuple, one value per column
grouped.get_group(('colname1_value', 'colname2_value')) # get the dataframe of that group
grouped.groups.keys() # get all group names (keys) | values are indexes
list(grouped) # unpack into (name, group_data) pairs

# compute aggregations
grouped['colname3'].mean()
grouped['colname3'].agg(['mean', 'std'])


# group-by with .agg(): {column: function or list of functions}
(data.groupby('colname')
     .agg({'colname1': ['first', 'last'],
           'colname2': ['sum', 'mean', 'count'],
           'colname3': lambda x: x.max() - x.min()  # x refers to all rows in each group
          })
)
# with names: output_column=(input_column, function)
(data.groupby('colname')
     .agg(agg_name1 = ('colname1', 'sum'),
          range = ('colname2', lambda x: x.max() - x.min())
         )
)


# custom named function: agg labels the output with the function's __name__
def custom_function(x):
    return x
custom_function.__name__ = 'func_name'
data.agg([custom_function])


# group by custom index values: the function is called on each index label
# NOTE(review): assumes df has a datetime-like index (labels with .year) -- confirm
df.groupby(lambda x: x.year).sum()

# apply transformation within each group; the result keeps the original shape
data.groupby('colname1').transform(lambda x: x - x.mean())
# rolling window computed separately inside each group
data.groupby('colname1').rolling(window=3)['colname2'].mean()

# ---------- Transform

## Basic

In [None]:
# Dataframe & Series:
data.shift(-1) # lead: row i takes the value of row i+1 (values move up, last row becomes NaN)
data.shift(1) # lag: row i takes the value of row i-1 (values move down, first row becomes NaN)
data.diff(1) # difference with the previous element: x[i] - x[i-1]
data.diff(-1) # difference with the next element: x[i] - x[i+1]

# create new column based on existing columns
data["new colname"] = data["colname1"] + data["colname2"]


# --- handling NaN
# drop rows that have NaN in the listed columns; subset=None checks every column
data.dropna(subset=["colname"], axis=0, inplace=True)

# fill na
data.fillna(value=0) # fill all NaN with value
data.fillna(value={'colname1': 0, 'colname2': 100}) # fill with dictionary (keys must match colnames)
# fill with series (index must match colnames); the mean is taken on the selected
# columns only, so non-numeric columns elsewhere in the frame cannot break it
data.fillna(value=data[['colname1', 'colname2']].mean())
# propagate neighbouring values; fillna() with neither value nor method raises
data.ffill() # forward fill; data.bfill() fills backward

# interpolation
# linear, quadratic, cubic
data.interpolate(method='linear')

# drop columns based on name
data.drop(columns=["colname1", "colname2"], axis=1, inplace=True)

# drop columns based on index (i is a placeholder position)
data.drop(columns=data.columns[i], axis=1)

# drop duplicated rows
# keep takes exactly one of: 'first', 'last', False (False drops every duplicate)
data.drop_duplicates(subset=["colname1", "colname2"], keep='first')


# replace values (lists are matched pairwise)
data.replace(to_replace=["yes", "no"], value=[1, 0], inplace=False)

# replace with dictionary
data.replace({'oldvalue1': 'newvalue1', 'oldvalue2': 'newvalue2'})

# replace values with regex (here: the first or the last character)
data.column.replace(to_replace='^(.)|(.)$', value='_', regex=True)


# get dummies
pd.get_dummies(data['colname'], prefix="table1", prefix_sep="_", drop_first=False)

# returns dataframe of same shape as original, values outside of condition become NaN
data.where(data > 0)
# or replace those values with 'other'
data.where(data > 0, other=-data, inplace=False)

## apply()

In [None]:
# apply a function along an axis; it receives one column (axis=0) or one row (axis=1)
# if axis=0, average of top and bottom values for each column
# if axis=1, average of leftmost and rightmost values for each row
data.apply(lambda x: (x.iloc[0] + x.iloc[-1]) / 2, axis=0)

# apply function to each element in data frame
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# mapping column by column gives the same result on every version
data.apply(lambda col: col.map(lambda x: x**2))

# creating a new column based on two columns
# axis=1: apply function to each row
# fields are read by label: integer keys on a labelled row are deprecated as positions
data[['Double_Header', 'Tickets_Sold']].apply(
    lambda x: x['Tickets_Sold'] if pd.isna(x['Double_Header']) else x['Tickets_Sold']/2,
    axis=1)

## pipe()

In [None]:
def step_1(df, na_subset):
    """Drop the rows of *df* that have a missing value in the given column(s).

    Parameters
    ----------
    df : DataFrame
        Input frame; it is not modified.
    na_subset : column label or list of column labels
        Column(s) to inspect for missing values. A single label is wrapped
        in a list; a list is used as given.

    Returns
    -------
    DataFrame
        A new frame without the offending rows.
    """
    # Only a list is treated as "several columns": tuples stay single labels
    # because they are valid labels for MultiIndex columns.
    columns = na_subset if isinstance(na_subset, list) else [na_subset]
    return df.dropna(axis=0, subset=columns)

def step_2(df, operation='add', constants=None):
    """Combine every row of *df* with per-column constants.

    Parameters
    ----------
    df : DataFrame
        Input frame; it is not modified.
    operation : {'add', 'sub', 'mul', 'div'}, default 'add'
        Arithmetic method applied column-wise.
    constants : list-like or scalar, optional
        One value per column (matched against the columns), or a scalar.
        When omitted, 0 is used, which leaves a frame of any width unchanged
        under 'add' and 'sub'.

    Returns
    -------
    DataFrame
        Result of ``df.<operation>(constants, axis=1)``.

    Raises
    ------
    ValueError
        If *operation* is not one of the supported names.
    """
    supported = ('add', 'sub', 'mul', 'div')
    if operation not in supported:
        # fail loudly: a silent None would only break the next step of a .pipe chain
        raise ValueError(
            f"unsupported operation {operation!r}; expected one of {supported}")
    if constants is None:
        # avoid a shared mutable default and a hard-coded column count
        constants = 0
    return getattr(df, operation)(constants, axis=1)

# chain the steps: each .pipe hands the previous result to the function as its first argument,
# the remaining keywords are forwarded unchanged
(data.pipe(step_1, na_subset='region A')
     .pipe(step_2, operation='add', constants=[10, 20, 30])
)

## transform()

In [None]:
# selective transformation: {column name: function}
data.transform({'colname1': np.abs,
                'colname2': lambda x: x + 1})

# apply multiple functions to one column: one output column per function
data['colname'].transform([np.abs, lambda x: x + 1])

# apply multiple functions to all columns: one output column per (column, function) pair
data.transform([np.abs, lambda x: x + 1])

## assign()

In [None]:
# backticks in query() quote a column name that contains a space
# assign() adds columns; each lambda receives the frame built so far,
# so total_times_10 can use the 'total' column created just before it
(data.query("`colname 1` > 0")
     .assign(total=lambda x: x.sum(axis=1),
             total_times_10=lambda x: x['total'] * 10)
)