## Series
---
### Create Series

1. Create Series from ndarray 

In [1]:
import pandas as pd
import numpy as np

In [17]:
s_ndarray= pd.Series(np.random.randint(5, 10, size=5), index=['a', 'b', 'c', 'd', 'e'])
s_ndarray

a    6
b    9
c    6
d    9
e    7
dtype: int32

2. Create Series from dict

In [22]:
dic = {'name': 'Mark', 'age': 39, 'gender': 'Male', 'Married': True}

# Without index passed
s_dic = pd.Series(dic)

s_dic

name       Mark
age          39
gender     Male
Married    True
dtype: object

In [24]:
# With index passed
s_dic2 = pd.Series(dic, index=['gender', 'age', 'Married', 'name','hobbies'])

s_dic2


gender     Male
age          39
Married    True
name       Mark
hobbies     NaN
dtype: object

3. Create Series from scalar

In [26]:
s_scalar = pd.Series(7., index=list('abcdefz'))
s_scalar

a    7.0
b    7.0
c    7.0
d    7.0
e    7.0
f    7.0
z    7.0
dtype: float64

### Series is ndarray-like

In [42]:
s_ndarray_like = pd.Series(np.random.randint(10, size=5), index=list('abcde'))

display(s_ndarray_like)

print('>>> get serial element:')
display(s_ndarray_like[3])
print('>>> get serial elements:')
display(s_ndarray_like[[1,4]])
print('>>> get sliced elements from serial:')
display(s_ndarray_like[:3])
print('>>> get filtered elements:')
display(s_ndarray_like[s_ndarray_like > s_ndarray_like.median()])
print('>>> get exponent of each element:')
display(np.exp(s_ndarray_like))


print('>>> convert series to actuall array type')
display(s_ndarray_like.array)

print('>>> convert series to actuall ndarray type')
display(s_ndarray_like.to_numpy())

a    5
b    3
c    4
d    2
e    5
dtype: int32

>>> get serial element:


2

>>> get serial elements:


b    3
e    5
dtype: int32

>>> get sliced elements from serial:


a    5
b    3
c    4
dtype: int32

>>> get filtered elements:


a    5
e    5
dtype: int32

>>> get exponent of each element:


a    148.413159
b     20.085537
c     54.598150
d      7.389056
e    148.413159
dtype: float64

>>> convert series to actuall array type


<PandasArray>
[5, 3, 4, 2, 5]
Length: 5, dtype: int32

>>> convert series to actuall ndarray type


array([5, 3, 4, 2, 5])

### Series is dict-like

In [54]:
s_dict_like = pd.Series(np.random.randint(10, size=5), index=list('abcde'))

display(s_dict_like)

print('>>> get series element value by key name')
display(s_dict_like['c'])

print('>>> set series element with new value by key name')
s_dict_like['c'] = 99
display(s_dict_like)

print('>>> check key(label) does exist in series')
display('a' in s_dict_like)
display('z' in s_dict_like)

print('>>> get series element using get method')
display(s_dict_like.get('a'))

display(s_dict_like.get('z'))

display(s_dict_like.get('z', np.nan))

a    1
b    2
c    4
d    1
e    3
dtype: int32

>>> get series element value by key name


4

>>> set series element with new value by key name


a     1
b     2
c    99
d     1
e     3
dtype: int32

>>> check key(label) does exist in series


True

False

>>> get series element using get method


1

None

nan

### Vectorized operations and label alignment with Series

In [62]:
print('>>> Add 2 series')
s_op1 = pd.Series(np.random.randint(10, size=5), index=list('abcde'))
s_op2 = pd.Series(np.random.randint(10, size=5), index=list('uvxyz'))

display(s_op1)

display(s_op2)

display(s_op1 + s_op1)
display(s_op1 *2)

print('>>> Miss aligned label will get NaN')
display(s_op1 + s_op2)

>>> Add 2 series


a    4
b    6
c    4
d    9
e    0
dtype: int32

u    8
v    3
x    1
y    7
z    2
dtype: int32

a     8
b    12
c     8
d    18
e     0
dtype: int32

a     8
b    12
c     8
d    18
e     0
dtype: int32

>>> Miss aligned label will get NaN


a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
u   NaN
v   NaN
x   NaN
y   NaN
z   NaN
dtype: float64

### Series Name attribute

In [67]:
s_name = pd.Series(np.random.randint(100, size=5), index=list('abcde'), name='best arrage')

display(s_name)
display(s_name.name)

s_rename = s_name.rename('bad arrage')
display(s_rename)

a    78
b    97
c    77
d    93
e    39
Name: best arrage, dtype: int32

'best arrage'

a    78
b    97
c    77
d    93
e    39
Name: bad arrage, dtype: int32

## DataFrame
---
1. From Dict of Series or Dicts

In [3]:
# Create DataFrame from Dict of Series
dict_series = {'one': pd.Series([1., 2., 3.], index=list('abc')), 
              'two': pd.Series([1., 2., 3., 4.], index=list('abcd'))}

# index wil be union of the indexes of various Series
df_dict_series = pd.DataFrame(dict_series)

df_dict_series

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [5]:
# create DataFrame with index
df_dict_series_index = pd.DataFrame(dict_series, index=['d', 'b', 'a'])
df_dict_series_index

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [8]:
# create DataFrane with index and columns
df_dict_series_index_columns = pd.DataFrame(dict_series, index=list('dac'), columns=['two', 'three'])
df_dict_series_index_columns

Unnamed: 0,two,three
d,4.0,
a,1.0,
c,3.0,


In [10]:
# access index and columns
display(df_dict_series.index)
display(df_dict_series.columns)

Index(['a', 'b', 'c', 'd'], dtype='object')

Index(['one', 'two'], dtype='object')

2. From dict of ndarrays / lists

In [14]:
# Create Dataframe from dict of ndarray
dict_ndarray = {'one': [1., 2., 3., 4.],
                  'two': [4., 5., 6., 7.,]}
df_dict_ndarray = pd.DataFrame(dict_ndarray)
display(df_dict_ndarray)

# with index
df_dict_ndarray = pd.DataFrame(dict_ndarray, index=list('qwer'))
display(df_dict_ndarray)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,5.0
2,3.0,6.0
3,4.0,7.0


Unnamed: 0,one,two
q,1.0,4.0
w,2.0,5.0
e,3.0,6.0
r,4.0,7.0


3. From structured or record array

In [28]:
# Create DataFrame from structured data
structured_data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')])
structured_data[:] = [(1, 2., 'hello'), (2, 3., 'world')]

df_structured_data = pd.DataFrame(structured_data)
display(df_structured_data)

# with index
df_structured_data_index = pd.DataFrame(structured_data, index=['first', 'second'])
display(df_structured_data_index)

# with columns
df_structured_data_columns = pd.DataFrame(structured_data, index=['first', 'second'], columns=['C', 'B', 'A'])
display(df_structured_data_columns)

Unnamed: 0,A,B,C
0,1,2.0,b'hello'
1,2,3.0,b'world'


Unnamed: 0,A,B,C
first,1,2.0,b'hello'
second,2,3.0,b'world'


Unnamed: 0,C,B,A
first,b'hello',2.0,1
second,b'world',3.0,2


4. From a list of dicts

In [3]:
list_of_dict = [{'a': 0, 'b':1}, {'a':5,'b':10, 'c':20 }]
df_list_of_dict = pd.DataFrame(list_of_dict)
df_list_of_dict

Unnamed: 0,a,b,c
0,0,1,
1,5,10,20.0


In [5]:
df_list_of_dict_with_index = pd.DataFrame(list_of_dict, index=['first', 'second'])
df_list_of_dict_with_index

Unnamed: 0,a,b,c
first,0,1,
second,5,10,20.0


In [6]:
df_list_of_dict_with_column = pd.DataFrame(list_of_dict, columns=['a', 'b'])
df_list_of_dict_with_column

Unnamed: 0,a,b
0,0,1
1,5,10


5. From a dict of tuples 

In [7]:
df_dict_of_tuples = pd.DataFrame(
    {
        ('a', 'b'): 
        {
            ('A', 'B'): 1, ('A', 'C'): 2
        },
        ('a', 'a'): 
        {
            ('A', 'C'): 3, ('A', 'B'): 4
        },
        ('a', 'c'): 
        {
            ('A', 'B'): 5, ('A', 'C'): 6
        },
        ('b', 'a'): 
        {
            ('A', 'C'): 7, ('A', 'B'): 8
        },
        ('b', 'b'): 
        {
            ('A', 'D'): 9, ('A', 'B'): 10
        }
    }
)

df_dict_of_tuples

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


6. From a list of dataclasses

In [8]:
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])

df_from_dataclass = pd.DataFrame([Point(0,0), Point(1,0), Point(2,0)])

df_from_dataclass

Unnamed: 0,x,y
0,0,0
1,1,0
2,2,0


### Alternate constructors

1. DataFrame.from_dict

In [12]:
df_from_dict = pd.DataFrame.from_dict(dict([('A',[1,2,3]), ('B',[3,4,5])]))
df_from_dict

Unnamed: 0,A,B
0,1,3
1,2,4
2,3,5


In [15]:
df_from_dict_orient = pd.DataFrame.from_dict(dict([('A', [1,2,3]), ('B', [4,5,6])]), orient='index', columns=['first', 'second', 'third'])
df_from_dict_orient

Unnamed: 0,first,second,third
A,1,2,3
B,4,5,6


### Column selection, addition, deletion

In [2]:
df = pd.DataFrame({'one': pd.Series([1,3,4,5,6], index=list('abcde')),
                  'two': pd.Series([2,3,4,5,6], index=list('abcde')),
                  'three': pd.Series([4,5,6,6,7], index=list('abcde'))})
df

Unnamed: 0,one,two,three
a,1,2,4
b,3,3,5
c,4,4,6
d,5,5,6
e,6,6,7


In [3]:
df['add'] = df['one'] + df['two']
display(df['add'])
del df['add']
display(df)
df['foo'] = 'bar'
display(df)
df['trancate'] = df['foo'][:1]
display(df)

a     3
b     6
c     8
d    10
e    12
Name: add, dtype: int64

Unnamed: 0,one,two,three
a,1,2,4
b,3,3,5
c,4,4,6
d,5,5,6
e,6,6,7


Unnamed: 0,one,two,three,foo
a,1,2,4,bar
b,3,3,5,bar
c,4,4,6,bar
d,5,5,6,bar
e,6,6,7,bar


Unnamed: 0,one,two,three,foo,trancate
a,1,2,4,bar,bar
b,3,3,5,bar,
c,4,4,6,bar,
d,5,5,6,bar,
e,6,6,7,bar,


### Assigning new columns in method chains

In [7]:
df = pd.DataFrame({'one': pd.Series([1,3,4,5,6], index=list('abcde')),
                  'two': pd.Series([2,3,4,5,6], index=list('abcde')),
                  'three': pd.Series([4,5,6,6,7], index=list('abcde'))})

df_assign = df.assign(assigned_column = df['one'] + df['two'])

df_assign.head()

Unnamed: 0,one,two,three,assigned_column
a,1,2,4,3
b,3,3,5,6
c,4,4,6,8
d,5,5,6,10
e,6,6,7,12


In [9]:
# Assign column with lambda function
df_assign2 = df.assign(lambda_assiged = lambda x : x['two'] + x['three'])

df_assign2.head()

Unnamed: 0,one,two,three,lambda_assiged
a,1,2,4,6
b,3,3,5,8
c,4,4,6,10
d,5,5,6,11
e,6,6,7,13


### Indexing / selection

In [17]:
df = pd.DataFrame({'one': pd.Series([1,3,4,5,6], index=list('abcde')),
                  'two': pd.Series([2,3,4,5,6], index=list('abcde')),
                  'three': pd.Series([4,5,6,6,7], index=list('abcde'))})

# select column
display(df['one'])

# select row by label
display(df.loc['b'])

# select row by integer location
display(df.iloc(1))

# Slice rows
display(df[1:3])

#Select rows by boolean vector
display(df[df['one'] > 3])

a    1
b    3
c    4
d    5
e    6
Name: one, dtype: int64

one      3
two      3
three    5
Name: b, dtype: int64

<pandas.core.indexing._iLocIndexer at 0x2c71f951db0>

Unnamed: 0,one,two,three
b,3,3,5
c,4,4,6


Unnamed: 0,one,two,three
c,4,4,6
d,5,5,6
e,6,6,7


### Data alignment and arithmetic

In [23]:
df_one = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))

df_two = pd.DataFrame(np.random.randn(7,3), columns=list('ABC'))

display(df_one)

display(df_two)

display(df_one + df_two)

display(df_one - df_one.iloc[0])

Unnamed: 0,A,B,C,D
0,0.627965,-0.338788,-1.056774,1.074475
1,1.670145,-0.393482,0.309024,2.304574
2,0.884062,-1.963976,-0.06783,0.015278
3,1.441502,-0.694273,0.534941,-0.112045
4,2.030944,0.67461,0.173009,2.462795
5,0.805624,-0.596151,0.172904,-0.532634
6,1.096394,-1.256588,0.60678,1.163792
7,0.711794,-0.33189,-0.862366,0.34368
8,0.573071,-0.923914,-1.156466,-0.334592
9,1.053369,0.775711,-0.504098,0.041798


Unnamed: 0,A,B,C
0,-1.350058,-0.083981,0.719465
1,-0.473912,0.842377,-0.453956
2,1.584724,0.308027,0.481279
3,-0.716057,0.418463,1.980739
4,-0.882981,-1.025595,-1.952158
5,0.431275,-1.499428,1.940301
6,0.683689,-2.006589,-0.188898


Unnamed: 0,A,B,C,D
0,-0.722093,-0.422769,-0.337309,
1,1.196233,0.448895,-0.144932,
2,2.468786,-1.655949,0.413448,
3,0.725445,-0.27581,2.51568,
4,1.147964,-0.350984,-1.779148,
5,1.236898,-2.095579,2.113205,
6,1.780083,-3.263177,0.417883,
7,,,,
8,,,,
9,,,,


Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,1.04218,-0.054694,1.365798,1.230099
2,0.256097,-1.625188,0.988944,-1.059197
3,0.813538,-0.355485,1.591716,-1.18652
4,1.402979,1.013398,1.229784,1.38832
5,0.177659,-0.257363,1.229679,-1.607108
6,0.46843,-0.9178,1.663555,0.089318
7,0.083829,0.006898,0.194409,-0.730795
8,-0.054894,-0.585126,-0.099691,-1.409067
9,0.425405,1.114499,0.552676,-1.032677


### Transposing

In [29]:
index = pd.date_range('1/1/2020', periods=8)

df_trans = pd.DataFrame(np.random.randn(8,3), index=index, columns=list('ABC'))

display(df_trans)

display(df_trans.T)

Unnamed: 0,A,B,C
2020-01-01,1.941546,-0.589904,-0.098102
2020-01-02,-0.302925,-0.150141,-1.340448
2020-01-03,0.290643,-1.651802,-1.354535
2020-01-04,0.731066,0.210639,-1.179482
2020-01-05,1.265465,-0.325524,-0.319118
2020-01-06,0.430442,-0.438309,-2.102501
2020-01-07,-1.676366,0.91114,0.956515
2020-01-08,1.309145,0.297879,-0.920862


Unnamed: 0,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05,2020-01-06,2020-01-07,2020-01-08
A,1.941546,-0.302925,0.290643,0.731066,1.265465,0.430442,-1.676366,1.309145
B,-0.589904,-0.150141,-1.651802,0.210639,-0.325524,-0.438309,0.91114,0.297879
C,-0.098102,-1.340448,-1.354535,-1.179482,-0.319118,-2.102501,0.956515,-0.920862


### DataFrame interoperability with NumPy functions

In [31]:
np.exp(df_trans)

Unnamed: 0,A,B,C
2020-01-01,6.969515,0.554381,0.906557
2020-01-02,0.738655,0.860587,0.261728
2020-01-03,1.337286,0.191704,0.258067
2020-01-04,2.077294,1.234466,0.307438
2020-01-05,3.544742,0.722149,0.72679
2020-01-06,1.537938,0.645126,0.122151
2020-01-07,0.187053,2.487155,2.602611
2020-01-08,3.703005,1.346999,0.398176


In [33]:
np.asarray(df_trans)

array([[ 1.9415456 , -0.58990391, -0.09810174],
       [-0.30292492, -0.15014074, -1.34044791],
       [ 0.29064253, -1.65180198, -1.35453475],
       [ 0.73106625,  0.21063872, -1.17948172],
       [ 1.26546549, -0.32552444, -0.31911777],
       [ 0.43044234, -0.43830913, -2.1025013 ],
       [-1.67636578,  0.91113967,  0.95651507],
       [ 1.30914463,  0.29787913, -0.92086159]])