In [4]:
# pandas is built on top of NumPy; NumPy is imported explicitly here because it is used directly below (np.array, np.random)
import pandas as pd
import numpy as np

In [5]:
# Sample inputs, one per container type, reused by the Series examples that follow
labels = list('abc')              # list of one-character strings: ['a', 'b', 'c']
my_list = [10, 20, 30]            # plain Python list of ints
arr = np.array(my_list)           # NumPy array holding the same three values
d = dict(zip(labels, my_list))    # dictionary mapping each label to its value

In [8]:
# Convert the list to a Series with the pd.Series constructor (a default 0..n-1 index is assigned)
pd.Series(data=my_list)

0    10
1    20
2    30
dtype: int64

In [9]:
# Specify an index, in this case the labels list
pd.Series(data = my_list, index = labels)

a    10
b    20
c    30
dtype: int64

In [12]:
# Array
pd.Series(arr, labels)

a    10
b    20
c    30
dtype: int32

In [13]:
# Dictionary
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [14]:
# You can store functions in Series objects
pd.Series([sum, print, len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [16]:
# Build a labelled Series from a mapping: the keys become the index, the values the data
series1 = pd.Series({'USA': 1, 'Germany': 2, 'Russia': 3, 'Japan': 4})
series1

USA        1
Germany    2
Russia     3
Japan      4
dtype: int64

In [17]:
# A DataFrame can be built with the pd.DataFrame constructor from a dictionary of
# equal-length lists: each key becomes a column label and each list supplies that column's values.
# DataFrame columns can also be supplied as Series objects.

df = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9]))

In [18]:
# Build a 5x4 DataFrame of standard-normal draws with labelled rows and columns.
# Seed NumPy's global RNG first so that re-running the notebook from a fresh kernel
# always produces the same values.
np.random.seed(101)
df = pd.DataFrame(
    np.random.randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)
df

Unnamed: 0,W,X,Y,Z
A,1.072608,-0.174305,-0.345,1.046436
B,-1.458357,-1.130441,-1.201612,0.992496
C,1.708214,0.358426,0.077859,0.436797
D,-0.08741,0.702537,0.315446,-0.116108
E,-2.182743,-0.698692,0.330948,0.801497


In [23]:
# refer to a column by its name 
df['W']

A    1.072608
B   -1.458357
C    1.708214
D   -0.087410
E   -2.182743
Name: W, dtype: float64

In [24]:
# Or, in a way similar to the "$" notation from R...(df$W)
df.W

A    1.072608
B   -1.458357
C    1.708214
D   -0.087410
E   -2.182743
Name: W, dtype: float64

In [25]:
# Refer to multiple columns using a list of column names
df[['W', 'Z']]

Unnamed: 0,W,Z
A,1.072608,1.046436
B,-1.458357,0.992496
C,1.708214,0.436797
D,-0.08741,-0.116108
E,-2.182743,0.801497


In [35]:
# Make new columns
df['new'] = df.W + df.Y
df

Unnamed: 0,W,X,Y,Z,new
A,1.072608,-0.174305,-0.345,1.046436,0.727608
B,-1.458357,-1.130441,-1.201612,0.992496,-2.659969
C,1.708214,0.358426,0.077859,0.436797,1.786073
D,-0.08741,0.702537,0.315446,-0.116108,0.228036
E,-2.182743,-0.698692,0.330948,0.801497,-1.851795


In [30]:
# The drop method can be used to remove columns. You have to specify the axis argument to specify that 'new' refers to a column. The '0' axis refers to rows
df.drop('new', axis = 1)

Unnamed: 0,W,X,Y,Z
A,1.072608,-0.174305,-0.345,1.046436
B,-1.458357,-1.130441,-1.201612,0.992496
C,1.708214,0.358426,0.077859,0.436797
D,-0.08741,0.702537,0.315446,-0.116108
E,-2.182743,-0.698692,0.330948,0.801497


In [31]:
# This column hasn't been dropped from the DataFrame yet....But it can be if you ask
df

Unnamed: 0,W,X,Y,Z,new
A,1.072608,-0.174305,-0.345,1.046436,0.727608
B,-1.458357,-1.130441,-1.201612,0.992496,-2.659969
C,1.708214,0.358426,0.077859,0.436797,1.786073
D,-0.08741,0.702537,0.315446,-0.116108,0.228036
E,-2.182743,-0.698692,0.330948,0.801497,-1.851795


In [36]:
# Permanently remove the new column from df
df.drop('new', axis = 1, inplace = True)
df

Unnamed: 0,W,X,Y,Z
A,1.072608,-0.174305,-0.345,1.046436
B,-1.458357,-1.130441,-1.201612,0.992496
C,1.708214,0.358426,0.077859,0.436797
D,-0.08741,0.702537,0.315446,-0.116108
E,-2.182743,-0.698692,0.330948,0.801497


In [37]:
# Drop specific rows using axis=0
df.drop('E', axis = 0)

Unnamed: 0,W,X,Y,Z
A,1.072608,-0.174305,-0.345,1.046436
B,-1.458357,-1.130441,-1.201612,0.992496
C,1.708214,0.358426,0.077859,0.436797
D,-0.08741,0.702537,0.315446,-0.116108


In [38]:
# Conditional logic can be applied to the entire dataframe
df > 0

Unnamed: 0,W,X,Y,Z
A,True,False,False,True
B,False,False,False,True
C,True,True,True,True
D,False,True,True,False
E,False,False,True,True


In [39]:
# Use the boolean frame as a mask: values that meet the condition are kept, all others become NaN
df[df>0]

Unnamed: 0,W,X,Y,Z
A,1.072608,,,1.046436
B,,,,0.992496
C,1.708214,0.358426,0.077859,0.436797
D,,0.702537,0.315446,
E,,,0.330948,0.801497


In [40]:
# Select by label with the .loc indexer: row label(s) first, then column label(s)
df.loc['B', 'Y'] # Intersection of row B with column Y

-1.2016119433557602

In [43]:
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,1.072608,-0.345
B,-1.458357,-1.201612


In [44]:
# Subsetting using conditional logic
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,1.072608,-0.174305,-0.345,1.046436
C,1.708214,0.358426,0.077859,0.436797


In [47]:
# Filter rows and pick columns in a single .loc call: the boolean mask keeps rows where W > 0,
# and the list keeps only columns X and Y. This replaces chained indexing (df[mask][cols]),
# which builds an intermediate DataFrame and is unsafe to assign through.
df.loc[df['W'] > 0, ['X', 'Y']]

Unnamed: 0,X,Y
A,-0.174305,-0.345
C,0.358426,0.077859


In [48]:
# And/or operations
df[(df['W'] > 0) & (df['Y'] > 0)] # Surround each condition with parentheses

Unnamed: 0,W,X,Y,Z
C,1.708214,0.358426,0.077859,0.436797


In [49]:
# You can reset indexes using dedicated methods
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,1.072608,-0.174305,-0.345,1.046436
1,B,-1.458357,-1.130441,-1.201612,0.992496
2,C,1.708214,0.358426,0.077859,0.436797
3,D,-0.08741,0.702537,0.315446,-0.116108
4,E,-2.182743,-0.698692,0.330948,0.801497


In [50]:
# Or make a new index
new_index = 'CA NY WY OR CO'.split() # This is another way of making a list

In [51]:
df['States'] = new_index # by first making this list a column in the dataframe, you can have a named index once you set it

In [52]:
df

Unnamed: 0,W,X,Y,Z,States
A,1.072608,-0.174305,-0.345,1.046436,CA
B,-1.458357,-1.130441,-1.201612,0.992496,NY
C,1.708214,0.358426,0.077859,0.436797,WY
D,-0.08741,0.702537,0.315446,-0.116108,OR
E,-2.182743,-0.698692,0.330948,0.801497,CO


In [53]:
df.set_index('States', inplace = True)

In [54]:
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1.072608,-0.174305,-0.345,1.046436
NY,-1.458357,-1.130441,-1.201612,0.992496
WY,1.708214,0.358426,0.077859,0.436797
OR,-0.08741,0.702537,0.315446,-0.116108
CO,-2.182743,-0.698692,0.330948,0.801497


In [57]:
# Hierarchical (multi-level) index
outside = ['G1'] * 3 + ['G2'] * 3   # outer-level labels
inside = [1, 2, 3] * 2              # inner-level labels, repeated for each group
# zip() pairs the two lists element-wise; from_tuples() turns the resulting
# list of (outer, inner) tuples into a MultiIndex
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

hier_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [58]:
df = pd.DataFrame(np.random.randn(6, 2), index = hier_index, columns = ['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,1.020699,-1.094652
G1,2,0.39374,0.070321
G1,3,0.0521,-0.234627
G2,1,-1.129164,2.153522
G2,2,1.180219,0.685828
G2,3,0.89348,-0.316671


In [60]:
df.loc['G1'] # the first indexed level

Unnamed: 0,A,B
1,1.020699,-1.094652
2,0.39374,0.070321
3,0.0521,-0.234627


In [61]:
df.loc['G1'].loc[1] # The first row of the first main level

A    1.020699
B   -1.094652
Name: 1, dtype: float64

In [62]:
# Name the index column(s)
df.index.names = ['Group', 'Num']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,1.020699,-1.094652
G1,2,0.39374,0.070321
G1,3,0.0521,-0.234627
G2,1,-1.129164,2.153522
G2,2,1.180219,0.685828
G2,3,0.89348,-0.316671


In [64]:
# xs() returns a cross-section of the frame. To select on several index levels at once,
# pass the labels as a tuple (one label per level, outermost first); list keys are
# deprecated in pandas 1.4 and rejected by pandas 2.0.
df.xs(('G1', 1))

A    1.020699
B   -1.094652
Name: (G1, 1), dtype: float64

In [66]:
# Or refer directly to a level of the index
df.xs(1, level = 'Num')

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,1.020699,-1.094652
G2,-1.129164,2.153522
