In [2]:
import numpy as np
import pandas as pd

In [3]:
# Small demo frame: two integer columns and one string column
df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, 4],
        col2=[444, 555, 666, 444],
        col3=['abc', 'def', 'ghi', 'xyz'],
    )
)
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


### **Info on Unique Values.**

In [5]:
# Return the distinct values found in the col2 column as an array
df['col2'].unique()

array([444, 555, 666], dtype=int64)

In [7]:
# Return the number of distinct values in the col2 column.
# len(df['col2'].unique()) gives the same answer here; note that nunique()
# skips NaN by default while unique() keeps it, so the two can differ when
# the column has missing values.
df['col2'].nunique() # or len(df['col2'].unique())


3

In [8]:
# Count how many times each distinct value occurs in the col2 column
df['col2'].value_counts()

col2
444    2
555    1
666    1
Name: count, dtype: int64

### **Selecting Data.**

In [9]:
# Keep only the rows where col1 is greater than 2 AND col2 equals 444
df.loc[df['col1'].gt(2) & df['col2'].eq(444)]

Unnamed: 0,col1,col2,col3
3,4,444,xyz


### **Applying Functions.**

In [13]:
# Add up every value in the col1 column
df['col1'].sum()

10

In [10]:
def times2(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

# Call times2 on every value of col1 and collect the results in a new Series
df['col1'].apply(times2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [12]:
# Same doubling as times2, written inline as an anonymous function
df['col2'].apply(lambda value: value * 2)

0     888
1    1110
2    1332
3     888
Name: col2, dtype: int64

In [11]:
# Built-in functions can be passed to apply as well: length of each value in col3
df['col3'].apply(len)

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

### **Permanently Removing a Column.**

In [14]:
# Removes col1 from df itself (no copy is made). Because df is mutated,
# re-running this cell raises KeyError: the column is already gone.
del df['col1'] # or df.drop('col1', axis = 1, inplace = True)

In [15]:
# Display df to confirm which columns remain
df

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


### **Get column and index names.**

In [16]:
# Column labels of the DataFrame
df.columns

Index(['col2', 'col3'], dtype='object')

In [17]:
# Row labels (index) of the DataFrame
df.index

RangeIndex(start=0, stop=4, step=1)

### **Sorting and Ordering a DataFrame.**

In [18]:
# Print the built-in documentation for DataFrame.sort_values
help(df.sort_values)

Help on method sort_values in module pandas.core.frame:

sort_values(by: 'IndexLabel', *, axis: 'Axis' = 0, ascending: 'bool | list[bool] | tuple[bool, ...]' = True, inplace: 'bool' = False, kind: 'SortKind' = 'quicksort', na_position: 'str' = 'last', ignore_index: 'bool' = False, key: 'ValueKeyFunc | None' = None) -> 'DataFrame | None' method of pandas.core.frame.DataFrame instance
    Sort by the values along either axis.
    
    Parameters
    ----------
    by : str or list of str
        Name or list of names to sort by.
    
        - if `axis` is 0 or `'index'` then `by` may contain index
          levels and/or column labels.
        - if `axis` is 1 or `'columns'` then `by` may contain column
          levels and/or index labels.
    axis : "{0 or 'index', 1 or 'columns'}", default 0
         Axis to be sorted.
    ascending : bool or list of bool, default True
         Sort ascending vs. descending. Specify list for multiple sort
         orders.  If this is a list of bools,

In [19]:
# Returns a new frame ordered by col2; df itself is left untouched
# because inplace defaults to False
df.sort_values(by='col2')

Unnamed: 0,col2,col3
0,444,abc
3,444,xyz
1,555,def
2,666,ghi


### **Create a spreadsheet-style pivot table as a DataFrame.**

In [22]:
# Toy data set: three key columns (A, B, C) and one numeric value column (D)
data = dict(
    A=['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    B=['one', 'one', 'two', 'two', 'one', 'one'],
    C=['x', 'y', 'x', 'y', 'x', 'y'],
    D=[1, 3, 2, 5, 4, 1],
)

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


In [21]:
# Print the built-in documentation for DataFrame.pivot_table
help(df.pivot_table)

Help on method pivot_table in module pandas.core.frame:

pivot_table(values=None, index=None, columns=None, aggfunc: 'AggFuncType' = 'mean', fill_value=None, margins: 'bool' = False, dropna: 'bool' = True, margins_name: 'Level' = 'All', observed: 'bool | lib.NoDefault' = <no_default>, sort: 'bool' = True) -> 'DataFrame' method of pandas.core.frame.DataFrame instance
    Create a spreadsheet-style pivot table as a DataFrame.
    
    The levels in the pivot table will be stored in MultiIndex objects
    (hierarchical indexes) on the index and columns of the result DataFrame.
    
    Parameters
    ----------
    values : list-like or scalar, optional
        Column or columns to aggregate.
    index : column, Grouper, array, or list of the previous
        Keys to group by on the pivot table index. If a list is passed,
        it can contain any of the other types (except list). If an array is
        passed, it must be the same length as the data and will be used in
        the same m

In [24]:
# Rows are grouped by (A, B), the values of C become the columns, and each
# cell holds the mean of D for that combination (aggfunc defaults to mean)
df.pivot_table(
    values='D',
    index=['A', 'B'],
    columns=['C'],
)

Unnamed: 0_level_0,C,x,y
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,1.0
bar,two,,5.0
foo,one,1.0,3.0
foo,two,2.0,


Extended example:

In [28]:
# Larger frame for the extended pivot example; the last entry of E is missing (NaN)
df = pd.DataFrame(
    {
        "A": ["foo"] * 5 + ["bar"] * 4,
        "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
        "C": ["small", "large", "large", "small", "small", "large", "small", "small", "large"],
        "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
        "E": [2, 4, 5, 5, 6, 6, 8, 9, np.nan],
    }
)

df

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2.0
1,foo,one,large,2,4.0
2,foo,one,large,2,5.0
3,foo,two,small,3,5.0
4,foo,two,small,3,6.0
5,bar,one,large,4,6.0
6,bar,one,small,5,8.0
7,bar,two,small,6,9.0
8,bar,two,large,7,


In [29]:
# Group rows by (A, C); aggregate D with its mean and E with min, max and mean.
# fill_value = 0 is what pandas puts into cells of the result that would otherwise be missing.
df.pivot_table(
    values=['D', 'E'],
    index=['A', 'C'],
    aggfunc={'D': "mean", 'E': ["min", "max", "mean"]},
    fill_value=0,
)


Unnamed: 0_level_0,Unnamed: 1_level_0,D,E,E,E
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,min
A,C,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,large,5.5,6.0,6.0,6.0
bar,small,5.5,9.0,8.5,8.0
foo,large,2.0,5.0,4.5,4.0
foo,small,2.333333,6.0,4.333333,2.0
