<h1>Pandas Series</h1>

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Sample inputs reused by the Series constructor examples that follow.
labels = ["a", "b", "c"]          # index labels
my_data = [10, 20, 30]            # values as a plain Python list
arr = np.array(my_data)           # the same values as a NumPy array
d = dict(zip(labels, my_data))    # label -> value mapping

In [4]:
labels

['a', 'b', 'c']

In [6]:
my_data

[10, 20, 30]

In [7]:
arr

array([10, 20, 30])

In [8]:
d

{'a': 10, 'b': 20, 'c': 30}

In [9]:
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [10]:
pd.Series(data=my_data,index=labels)

a    10
b    20
c    30
dtype: int64

In [12]:
pd.Series(my_data,labels)

a    10
b    20
c    30
dtype: int64

In [13]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

In [14]:
pd.Series(arr,labels)

a    10
b    20
c    30
dtype: int64

In [15]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [16]:
pd.Series(data=labels)

0    a
1    b
2    c
dtype: object

In [17]:
pd.Series(data=[sum,print,len]) # this would be how you could store functions in a Series if you wanted

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [18]:
# Values 1-4 indexed by country name.
ser1 = pd.Series(data=[1, 2, 3, 4], index=["USA", "Germany", "USSR", "Japan"])

In [19]:
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [20]:
# Second Series: shares 'USA', 'Germany' and 'Japan' with ser1 but has 'Italy' instead of 'USSR'.
ser2 = pd.Series(data=[1, 2, 5, 4], index=["USA", "Germany", "Italy", "Japan"])

In [21]:
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [22]:
ser1['USA']

1

In [23]:
ser3 = pd.Series(data=labels)

In [24]:
ser3

0    a
1    b
2    c
dtype: object

In [25]:
ser3[0]

'a'

In [26]:
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [27]:
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [28]:
ser1 + ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

<h1>Pandas DataFrames Part 1</h1>

In [29]:
import numpy as np

In [30]:
import pandas as pd

In [31]:
from numpy.random import randn

In [32]:
np.random.seed(101)

In [33]:
# 5x4 DataFrame of random values: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=["A", "B", "C", "D", "E"],
    columns=["W", "X", "Y", "Z"],
)

In [34]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [35]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [36]:
type(df['W']) # selecting a column from a dataframe makes it a series, which is what a dataframe basically is - a collection of series

pandas.core.series.Series

In [37]:
type(df)

pandas.core.frame.DataFrame

In [38]:
df.W # attribute access also works, but prefer df['W']: a column name can be confused with (or clash with) one of the DataFrame's own methods/attributes

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [39]:
df['X']

A    0.628133
B   -0.319318
C    0.740122
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [40]:
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [41]:
df['new'] = df['W'] + df['Y']

In [42]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [44]:
df.drop('new', axis=1) # if you don't specify axis=1, the default is axis=0 (meaning a row); this returns a new DataFrame and leaves df itself unchanged

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [45]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [46]:
df.drop('new', axis=1, inplace=True) # inplace=True modifies df itself, permanently dropping the column from your DataFrame

In [47]:
df.drop('E')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [48]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [49]:
df.shape # the shape is a tuple

(5, 4)

<h6>axis=0 refers to the rows (the index)</h6>
<h6>axis=1 refers to the columns</h6>

In [50]:
df.columns

Index(['W', 'X', 'Y', 'Z'], dtype='object')

In [52]:
# Rows

In [54]:
df.loc['C'] # label-based lookup: returns the row labelled 'C' as a Series

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [55]:
df.iloc[2] # position-based lookup: returns the row at integer position 2 (here the row labelled 'C'), so you don't need to know its index label

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [56]:
df.loc['B','Y']

-0.8480769834036315

In [57]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


<h1> Pandas DataFrames Part 2</h1>

In [58]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [59]:
booldf = df > 0

In [60]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [61]:
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [62]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [63]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [64]:
df['W']>0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [65]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [66]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [67]:
df[df['Z']<0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [70]:
resultdf = df[df['W']>0]

In [71]:
resultdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [72]:
resultdf['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [73]:
df[df['W']>0]['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [74]:
df[df['W']>0][['Y','X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [75]:
boolser = df['W']>0

In [76]:
boolser

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [81]:
# Same result as cell In [74] (df[df['W']>0][['Y','X']]), but broken out
# into named intermediate steps. The one-liner is more compact and does not
# keep the intermediate objects around in variables, but spelling out each
# step is a good way to learn at first.
boolser = df['W']>0      # boolean Series: True where column W is positive
result = df[boolser]     # keep only the rows where the mask is True
my_cols = ['Y', 'X']     # columns to show, in this order
result[my_cols]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [80]:
result

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [82]:
# Python's `and` needs a single True/False from each operand, but a boolean
# Series holds one value per row, so pandas raises ValueError here.
# The error is the point of this cell, so it is caught and printed: the
# message is still shown, and "Restart & Run All" can continue to the
# working `&` version in the next cell.
try:
    df[(df['W']>0) and (df['Y']>1)]
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [83]:
df[(df['W']>0) & (df['Y']>1)] 
# To combine multiple conditions you cannot use Python's `and` operator:
# it tries to reduce each Series to a single True/False and raises ValueError.
# Use & instead, which compares the two boolean Series element by element.
# For "or", use the pipe operator | in the same way.
# Keep each condition in its own parentheses: & and | bind more tightly than > and <.

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [84]:
df[(df['W']>0) | (df['Y']>1)] 

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [85]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [86]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [89]:
df

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [88]:
df.reset_index(inplace=True)  
# inplace=True applies the change to df itself (permanently) instead of
# returning a new DataFrame: the old index becomes a regular 'index' column

In [90]:
df

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [91]:
# Rebuild df from a fresh batch of random values, restoring the A-E row labels
# and W-Z column labels (this replaces the reset-index version of df).
df = pd.DataFrame(
    data=randn(5, 4),
    index=["A", "B", "C", "D", "E"],
    columns=["W", "X", "Y", "Z"],
)

In [92]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [93]:
newind = 'CA NY WY OR CO'.split() 
# .split() with no arguments splits the string on whitespace,
# giving a list with one entry per state code

In [94]:
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [96]:
df['States'] = newind

In [97]:
df

Unnamed: 0,W,X,Y,Z,States
A,0.302665,1.693723,-1.706086,-1.159119,CA
B,-0.134841,0.390528,0.166905,0.184502,NY
C,0.807706,0.07296,0.638787,0.329646,WY
D,-0.497104,-0.75407,-0.943406,0.484752,OR
E,-0.116773,1.901755,0.238127,1.996652,CO


In [98]:
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.302665,1.693723,-1.706086,-1.159119
NY,-0.134841,0.390528,0.166905,0.184502
WY,0.807706,0.07296,0.638787,0.329646
OR,-0.497104,-0.75407,-0.943406,0.484752
CO,-0.116773,1.901755,0.238127,1.996652


In [99]:
df

Unnamed: 0,W,X,Y,Z,States
A,0.302665,1.693723,-1.706086,-1.159119,CA
B,-0.134841,0.390528,0.166905,0.184502,NY
C,0.807706,0.07296,0.638787,0.329646,WY
D,-0.497104,-0.75407,-0.943406,0.484752,OR
E,-0.116773,1.901755,0.238127,1.996652,CO


In [101]:
boolser = df['W']>0

In [102]:
boolser

A     True
B    False
C     True
D    False
E    False
Name: W, dtype: bool

In [103]:
result = df[boolser]

In [104]:
result

Unnamed: 0,W,X,Y,Z,States
A,0.302665,1.693723,-1.706086,-1.159119,CA
C,0.807706,0.07296,0.638787,0.329646,WY


In [105]:
my_cols = ['Y', 'X']

In [106]:
result[my_cols]

Unnamed: 0,Y,X
A,-1.706086,1.693723
C,0.638787,0.07296


In [107]:
df

Unnamed: 0,W,X,Y,Z,States
A,0.302665,1.693723,-1.706086,-1.159119,CA
B,-0.134841,0.390528,0.166905,0.184502,NY
C,0.807706,0.07296,0.638787,0.329646,WY
D,-0.497104,-0.75407,-0.943406,0.484752,OR
E,-0.116773,1.901755,0.238127,1.996652,CO


In [108]:
boolser = df['W']>0
result = df[boolser]
my_cols = ['Y', 'X']
result[my_cols]

Unnamed: 0,Y,X
A,-1.706086,1.693723
C,0.638787,0.07296


<h1> Pandas DataFrames Part 3</h1>