In [4]:
import numpy as np

In [5]:
import pandas as pd

### Creating a Series by converting a list, dictionary, array to a Series

In [6]:
labels = ['a','b','c']
my_list = [10,20,30]
arr = np.array([10,20,30])
d = {'a':10,'b':20, 'c':30}

In [7]:
pd.Series(data=my_list)

0    10
1    20
2    30
dtype: int64

In [8]:
pd.Series(data=my_list, index=labels)

a    10
b    20
c    30
dtype: int64

In [9]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [10]:
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [11]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

### Using a Series
#### The key to using a series effectively in pandas is understanding its index. Pandas makes use of index names or numbers by allowing for fast lookups of information(i.e it works like a dictionary or hashtable)

In [12]:
pd.Series([1,2,3,4], index=['USA','RUSSIA','GERMANY','UK'])

USA        1
RUSSIA     2
GERMANY    3
UK         4
dtype: int64

In [13]:
countrySeries = pd.Series([1,2,3,4], index=['USA','RUSSIA','GERMANY','UK'])

In [14]:
countrySeries

USA        1
RUSSIA     2
GERMANY    3
UK         4
dtype: int64

In [15]:
countrySeries['RUSSIA']

2

In [16]:
countrySeries[1]

2

In [17]:
nationalGDP = pd.Series([2,5,6,8], index=['USA','MOROCCO', 'RUSSIA','GERMANY'])

In [18]:
nationalGDP

USA        2
MOROCCO    5
RUSSIA     6
GERMANY    8
dtype: int64

In [19]:
countrySeries + nationalGDP

GERMANY    11.0
MOROCCO     NaN
RUSSIA      8.0
UK          NaN
USA         3.0
dtype: float64

# DATAFRAMES

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index.

In [20]:
from numpy.random import randn

In [21]:
pd.DataFrame(randn(5,4))

Unnamed: 0,0,1,2,3
0,0.360124,1.527596,0.708478,0.294401
1,-0.962787,0.430893,-1.339323,-0.135905
2,0.278759,-0.118764,-0.333575,0.054499
3,-0.016117,-0.39587,-0.377616,-0.82789
4,-0.146377,-0.110139,0.797217,0.962784


In [22]:
#df = pd.DataFrame(randn(5,4), index = 'A B C D E'.split(), columns='W X Y Z'.split())

In [23]:
df = pd.DataFrame(randn(5,4), index = ['A', 'B', 'C', 'D', 'E'], columns=['W', 'X', 'Y', 'Z'])

In [24]:
df

Unnamed: 0,W,X,Y,Z
A,-0.521223,0.399029,-0.744585,1.343952
B,-1.554174,0.246275,-0.33527,-0.276795
C,-0.333873,-1.656755,1.136345,-0.97267
D,0.739085,-1.784838,2.230966,-2.210923
E,-0.054111,0.140167,-0.148827,0.606651


## Selection and Indexing
#### Understanding the various methods of grabbing data from a dataframe

In [25]:
df['W']

A   -0.521223
B   -1.554174
C   -0.333873
D    0.739085
E   -0.054111
Name: W, dtype: float64

### To Access more than one column, you can do so by entering a list into the parentheses

In [26]:
df[['W','X']]

Unnamed: 0,W,X
A,-0.521223,0.399029
B,-1.554174,0.246275
C,-0.333873,-1.656755
D,0.739085,-1.784838
E,-0.054111,0.140167


In [27]:
df[['W','Z']]

Unnamed: 0,W,Z
A,-0.521223,1.343952
B,-1.554174,-0.276795
C,-0.333873,-0.97267
D,0.739085,-2.210923
E,-0.054111,0.606651


In [28]:
df['COLUMN'] = df['W'] + df ['Z']

# creating a new column using data from exisiting columns

In [29]:
df

Unnamed: 0,W,X,Y,Z,COLUMN
A,-0.521223,0.399029,-0.744585,1.343952,0.822729
B,-1.554174,0.246275,-0.33527,-0.276795,-1.830968
C,-0.333873,-1.656755,1.136345,-0.97267,-1.306543
D,0.739085,-1.784838,2.230966,-2.210923,-1.471837
E,-0.054111,0.140167,-0.148827,0.606651,0.55254


#### Removing columns

In [30]:
df.drop('COLUMN', axis=1)

Unnamed: 0,W,X,Y,Z
A,-0.521223,0.399029,-0.744585,1.343952
B,-1.554174,0.246275,-0.33527,-0.276795
C,-0.333873,-1.656755,1.136345,-0.97267
D,0.739085,-1.784838,2.230966,-2.210923
E,-0.054111,0.140167,-0.148827,0.606651


#### The sort of removal done above just removes the specified dataset on the fly, not inplace unless specified

In [31]:
df

#This shows that the new column is still there

Unnamed: 0,W,X,Y,Z,COLUMN
A,-0.521223,0.399029,-0.744585,1.343952,0.822729
B,-1.554174,0.246275,-0.33527,-0.276795,-1.830968
C,-0.333873,-1.656755,1.136345,-0.97267,-1.306543
D,0.739085,-1.784838,2.230966,-2.210923,-1.471837
E,-0.054111,0.140167,-0.148827,0.606651,0.55254


In [32]:
df.drop('COLUMN', axis=1, inplace=True)

#### This removal done above totally removes it from the dataset by setting inplace = True

In [33]:
df

Unnamed: 0,W,X,Y,Z
A,-0.521223,0.399029,-0.744585,1.343952
B,-1.554174,0.246275,-0.33527,-0.276795
C,-0.333873,-1.656755,1.136345,-0.97267
D,0.739085,-1.784838,2.230966,-2.210923
E,-0.054111,0.140167,-0.148827,0.606651


#### Rows can be dropped this way, on the fly

In [34]:
df.drop('E', axis = 0)

Unnamed: 0,W,X,Y,Z
A,-0.521223,0.399029,-0.744585,1.343952
B,-1.554174,0.246275,-0.33527,-0.276795
C,-0.333873,-1.656755,1.136345,-0.97267
D,0.739085,-1.784838,2.230966,-2.210923


In [35]:
df

Unnamed: 0,W,X,Y,Z
A,-0.521223,0.399029,-0.744585,1.343952
B,-1.554174,0.246275,-0.33527,-0.276795
C,-0.333873,-1.656755,1.136345,-0.97267
D,0.739085,-1.784838,2.230966,-2.210923
E,-0.054111,0.140167,-0.148827,0.606651


### Selecting rows

In [36]:
df.loc['A']

W   -0.521223
X    0.399029
Y   -0.744585
Z    1.343952
Name: A, dtype: float64

In [37]:
df.iloc[2]

#This selects off of position instead of labels

W   -0.333873
X   -1.656755
Y    1.136345
Z   -0.972670
Name: C, dtype: float64

In [38]:
df.loc['C']

W   -0.333873
X   -1.656755
Y    1.136345
Z   -0.972670
Name: C, dtype: float64

In [39]:
df.loc['B','Y']

-0.33527028047450835

In [40]:
df.loc[['A','B'],['W','X']]

Unnamed: 0,W,X
A,-0.521223,0.399029
B,-1.554174,0.246275


## Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy

In [41]:
df

Unnamed: 0,W,X,Y,Z
A,-0.521223,0.399029,-0.744585,1.343952
B,-1.554174,0.246275,-0.33527,-0.276795
C,-0.333873,-1.656755,1.136345,-0.97267
D,0.739085,-1.784838,2.230966,-2.210923
E,-0.054111,0.140167,-0.148827,0.606651


In [42]:
df>0

Unnamed: 0,W,X,Y,Z
A,False,True,False,True
B,False,True,False,False
C,False,False,True,False
D,True,False,True,False
E,False,True,False,True


In [43]:
df > 1

Unnamed: 0,W,X,Y,Z
A,False,False,False,True
B,False,False,False,False
C,False,False,True,False
D,False,False,True,False
E,False,False,False,False


In [44]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,,0.399029,,1.343952
B,,0.246275,,
C,,,1.136345,
D,0.739085,,2.230966,
E,,0.140167,,0.606651


#### For more than one condition, use keywords like (&, |) with parenthesis:

### More Index Details

In [116]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [117]:
#To reset an already assigned index
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,0.302665,1.693723,-1.706086,-1.159119
1,B,-0.134841,0.390528,0.166905,0.184502
2,C,0.807706,0.07296,0.638787,0.329646
3,D,-0.497104,-0.75407,-0.943406,0.484752
4,E,-0.116773,1.901755,0.238127,1.996652


In [46]:
newind = 'CA NY WY OR CO'.split()

In [47]:
df['States'] = newind

In [48]:
df

Unnamed: 0,W,X,Y,Z,States
A,-0.521223,0.399029,-0.744585,1.343952,CA
B,-1.554174,0.246275,-0.33527,-0.276795,NY
C,-0.333873,-1.656755,1.136345,-0.97267,WY
D,0.739085,-1.784838,2.230966,-2.210923,OR
E,-0.054111,0.140167,-0.148827,0.606651,CO


In [49]:
df.set_index('Y')

Unnamed: 0_level_0,W,X,Z,States
Y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-0.744585,-0.521223,0.399029,1.343952,CA
-0.33527,-1.554174,0.246275,-0.276795,NY
1.136345,-0.333873,-1.656755,-0.97267,WY
2.230966,0.739085,-1.784838,-2.210923,OR
-0.148827,-0.054111,0.140167,0.606651,CO


In [50]:
df

Unnamed: 0,W,X,Y,Z,States
A,-0.521223,0.399029,-0.744585,1.343952,CA
B,-1.554174,0.246275,-0.33527,-0.276795,NY
C,-0.333873,-1.656755,1.136345,-0.97267,WY
D,0.739085,-1.784838,2.230966,-2.210923,OR
E,-0.054111,0.140167,-0.148827,0.606651,CO
