In [1]:
# All imports go in the first cell, even though Python allows an import statement anywhere in the code
import numpy as np
import pandas as pd

## Series

In [6]:
# Build two Series: one with the default integer index, one with custom labels.
s1 = pd.Series([1, 2, 3])
# The NaN entry makes this Series float64 (see the dtype in the printed output).
s2 = pd.Series([1, 2, 3, np.nan], index=["a", "b", "c", "d"])

# A Series can also be built from another iterable such as a range.
print(s1, s2, pd.Series(range(5)), sep="\n\n")

0    1
1    2
2    3
dtype: int64

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64

0    0
1    1
2    2
3    3
4    4
dtype: int64


In [11]:
# Convert the dtype with astype: the target can be a dtype name or a NumPy type.
# s1 is not reassigned here, so it keeps its original dtype.
print(s1.astype('float64'), s1.astype(np.float32), sep="\n\n")

0    1.0
1    2.0
2    3.0
dtype: float64

0    1.0
1    2.0
2    3.0
dtype: float32


In [13]:
# Basic attributes of a Series: shape, dtype, index, values and size.
print(s1.shape, s1.dtype, s1.index, s1.values, s1.size, sep="\n")

# The custom-labelled Series exposes its labels through .index
print()
print(s2.index)

(3,)
int64
RangeIndex(start=0, stop=3, step=1)
[1 2 3]
3

Index(['a', 'b', 'c', 'd'], dtype='object')


In [31]:
# Aggregations: min, max, sum, unique, ...
# On a plain NumPy array, max() propagates NaN.
a = np.array([1, 2, np.nan])
print(a.max())

print(s1, s2, sep="\n")
print()
print(s1.max(), s1.min(), s1.sum(), sep="\n")

# pandas skips NaN by default (see min/sum); skipna=False makes max() return NaN.
print()
print(s2.max(skipna=False), s2.min(), s2.sum(), sep="\n")
print()
print(s2.unique())

nan
0    1
1    2
2    3
dtype: int64
a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64

3
1
6

nan
1.0
6.0

[ 1.  2.  3. nan]


In [21]:
# Arithmetic produces a new Series; a scalar operand is broadcast as in NumPy.
print(s1 * 10, s2 - 100, sep="\n")

# Series-to-Series arithmetic matches values by index label: a label present in
# only one operand gives NaN in the result.
print()
print(s1 - s2)

t1 = pd.Series([10, 20, 30], index=[1, 2, 3])
t2 = pd.Series([100, 200, 300, 400], index=[0, 2, 3, 4])
print(t1 - t2)

0    10
1    20
2    30
dtype: int64
a   -99.0
b   -98.0
c   -97.0
d     NaN
dtype: float64

0   NaN
1   NaN
2   NaN
a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64
0      NaN
1      NaN
2   -180.0
3   -270.0
4      NaN
dtype: float64


In [28]:
# Indexing and slicing a Series.
print(s1, s1[0], sep="\n")

print()
print(s2)
print()
# A single label gives a scalar; a label slice gives a Series ('c' onwards here).
print(s2['c'], s2['c':], sep="\n")

# A boolean list keeps the entries at the positions marked True.
print()
print(s2[[True, False, False, True]])

0    1
1    2
2    3
dtype: int64
1

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64

3.0
c    3.0
d    NaN
dtype: float64

a    1.0
d    NaN
dtype: float64


In [34]:
# Type of an indexing result: a single label yields a scalar, a slice yields a Series.
print(type(s2['c']), type(s2['c':]), sep="\n")


<class 'numpy.float64'>
<class 'pandas.core.series.Series'>


In [59]:
# Frequency of each distinct value: value_counts
s = pd.Series(list("abcdabAAb"))
print(s)
print()
# value_counts returns a Series: the distinct values become the index and the
# number of occurrences become the values. Counting is case-sensitive, so
# 'a' and 'A' are separate entries.
r = s.value_counts()
print(r)

# The two halves of the result can be read separately.
print(r.index)
print(r.values)

0    a
1    b
2    c
3    d
4    a
5    b
6    A
7    A
8    b
dtype: object

b    3
A    2
a    2
c    1
d    1
dtype: int64
Index(['b', 'A', 'a', 'c', 'd'], dtype='object')
[3 2 2 1 1]


## DataFrame

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a DataFrame from a dict: one key per column, one list entry per row.
# Each column carries exactly one missing value (np.nan).
df1 = pd.DataFrame(
    data=dict(
        Name=["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita",
              "Joey", "Shweta", "na-1", "na-2", np.nan],
        Age=[0, 1, 2, 2, 1, 2, 3, 1, np.nan, 2],
        Gender=[0, 0, 0, 0, 1, 1, 1, np.nan, 0, 0],
    )
)
# Build a DataFrame from a list of row tuples, with explicit row labels and
# column names.
df2 = pd.DataFrame(
    data=[("Gaurav", 1), ("Abhiskek", 2), ("Krishna", 3), ("Abhishek2", 4)],
    index=[0, 11, 2, 3],
    columns=["Name", "Number"],
)
print(df1, df2, sep="\n\n")

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [44]:
# Structural attributes of a DataFrame: size, shape, dtypes, index, columns, T.
print(df1.size, df1.shape, df1.dtypes, df1.index, df1.columns, df1.T, sep="\n\n")

# Index and columns of the frame that was built with custom row labels.
print("\n")
print(df2.index, df2.columns, sep="\n\n")

30

(10, 3)

Name       object
Age       float64
Gender    float64
dtype: object

RangeIndex(start=0, stop=10, step=1)

Index(['Name', 'Age', 'Gender'], dtype='object')

             0          1        2          3         4     5       6     7  \
Name    Gaurav  Abhishek1  Krishna  Abhishek2  Harshita  Joey  Shweta  na-1   
Age          0          1        2          2         1     2       3     1   
Gender       0          0        0          0         1     1       1   NaN   

           8    9  
Name    na-2  NaN  
Age      NaN    2  
Gender     0    0  


Int64Index([0, 11, 2, 3], dtype='int64')

Index(['Name', 'Number'], dtype='object')


In [55]:
# Rename index and columns on df2.
# Inplace vs normal operation
# Without inplace=True, rename returns a renamed copy and leaves df2 untouched.
# NOTE(review): this mapping targets the labels "Name"/"Number". The in-place
# rename at the end of this cell upper-cases df2's columns, so when the cell is
# run a second time nothing matches here (the saved output shows NAME/NUMBER).
t = df2.rename(columns={"Name":"n", "Number" : "Num"})
#t = df2.rename({"Name":"n", "Number" : "Num"}, axis=1)
print(t)
print()
print(df2)

# A callable is applied to every label; axis=0 targets the row index.
t = df2.rename(lambda x : x*10, axis=0)
print()
print(t)

print()
# With inplace=True, df2 itself is modified and rename returns None, which is
# why "T None" is printed.
# NOTE(review): this changes df2 for every cell executed afterwards; consider
# demonstrating inplace on df2.copy() so the cell can be re-run safely.
t = df2.rename(lambda s: s.upper(), axis=1, inplace=True)
print("T", t)
print("df2")
print(df2)

         NAME  NUMBER
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4

         NAME  NUMBER
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4

          NAME  NUMBER
0       Gaurav       1
110   Abhiskek       2
20     Krishna       3
30   Abhishek2       4

T None
df2
         NAME  NUMBER
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [57]:
# Plain assignment does not copy a list: both names refer to the same object,
# so an append made through one name is visible through the other.
a = [1, 2]
a.append(3)
b = a
print(a, b, sep="\n")

b.append(4)
print(a, b, sep="\n")

[1, 2, 3]
[1, 2, 3]
[1, 2, 3, 4]
[1, 2, 3, 4]


In [68]:
# Selecting a single column: attribute access (df1.Name) or key access (df1['Age']).
print(df1.columns, df1.Name, df1['Age'], sep="\n\n")

Index(['Name', 'Age', 'Gender'], dtype='object')

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: Name, dtype: object

0    0.0
1    1.0
2    2.0
3    2.0
4    1.0
5    2.0
6    3.0
7    1.0
8    NaN
9    2.0
Name: Age, dtype: float64


In [69]:
# A single column taken from a DataFrame is a Series.
print(type(df1['Name']))

<class 'pandas.core.series.Series'>


In [83]:
# Summary statistics: describe() reports the numeric columns by default;
# include='all' adds the non-numeric Name column as well.
print(df1.columns, df1.describe(), df1.describe(include='all'), sep="\n\n")

# Missing values: NaN count of one column, then non-null and NaN counts per column.
print()
print(df1.Name.isna().sum(), df1.count(), df1.isna().sum(), sep="\n\n")

Index(['Name', 'Age', 'Gender'], dtype='object')

            Age    Gender
count  9.000000  9.000000
mean   1.555556  0.333333
std    0.881917  0.500000
min    0.000000  0.000000
25%    1.000000  0.000000
50%    2.000000  0.000000
75%    2.000000  1.000000
max    3.000000  1.000000

            Name       Age    Gender
count          9  9.000000  9.000000
unique         9       NaN       NaN
top     Harshita       NaN       NaN
freq           1       NaN       NaN
mean         NaN  1.555556  0.333333
std          NaN  0.881917  0.500000
min          NaN  0.000000  0.000000
25%          NaN  1.000000  0.000000
50%          NaN  2.000000  0.000000
75%          NaN  2.000000  1.000000
max          NaN  3.000000  1.000000

1

Name      9
Age       9
Gender    9
dtype: int64

Name      1
Age       1
Gender    1
dtype: int64


In [80]:
# First and last rows: head()/tail() show five rows unless a count is passed.
print(df1.head(), df1.tail(), df1.tail(2), sep="\n\n")

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0

     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0

   Name  Age  Gender
8  na-2  NaN     0.0
9   NaN  2.0     0.0


In [89]:
# Label-based selection with loc[rows, cols].
# Label slices include both end points: 1:3 returns rows 1, 2 and 3, and
# 'Name':'Age' returns both columns.
print(
    df1.loc[0],
    df1.loc[1:3],
    df1.loc[:, 'Name'],
    df1.loc[3:5, 'Name'],
    df1.loc[3:5, 'Name':'Age'],
    sep="\n\n",
)

Name      Gaurav
Age            0
Gender         0
Name: 0, dtype: object

        Name  Age  Gender
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: Name, dtype: object

3    Abhishek2
4     Harshita
5         Joey
Name: Name, dtype: object

        Name  Age
3  Abhishek2  2.0
4   Harshita  1.0
5       Joey  2.0


In [107]:
# Position-based selection with iloc[rows, cols]: positions ignore the index
# labels, and the stop position of a slice is excluded.
print(df2.iloc[0], df2.iloc[1:3], df2.iloc[1:3, 0], sep="\n\n")

# A step of -1 on the column axis reverses the column order.
print()
print(df2, df2.iloc[1:3, ::-1], sep="\n")

NAME      Gaurav
NUMBER         1
Name: 0, dtype: object

        NAME  NUMBER
11  Abhiskek       2
2    Krishna       3

11    Abhiskek
2      Krishna
Name: NAME, dtype: object

         NAME  NUMBER
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4
    NUMBER      NAME
11       2  Abhiskek
2        3   Krishna


In [109]:
# List slicing refresher: [::-1] walks the list backwards, [:-1] drops the last item.
a = [1, 2, 3]
print(a[::-1], a[:-1], sep="\n")

[3, 2, 1]
[1, 2]


In [104]:
# Negative positions count from the end, exactly as with Python lists.
print()
print(
    df2.iloc[-2],
    df1.iloc[-1:-5],     # start lies after stop with a forward step: empty result
    df1.iloc[-1:-5:-1],  # stepping backwards returns the rows in reverse order
    df1.iloc[-5:-1],     # the stop position is excluded, so the last row is left out
    sep="\n\n",
)


NAME      Krishna
NUMBER          3
Name: 2, dtype: object

Empty DataFrame
Columns: [Name, Age, Gender]
Index: []

     Name  Age  Gender
9     NaN  2.0     0.0
8    na-2  NaN     0.0
7    na-1  1.0     NaN
6  Shweta  3.0     1.0

     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0


In [115]:
# Aggregate operations (axis=0, the default: one result per column;
# axis=1: one result per row).
# min, max, sum, count, ...
# df1 mixes a string column (Name) with numeric ones. numeric_only=True states
# explicitly that only Age and Gender take part; recent pandas versions no longer
# drop non-numeric columns silently and raise on the str/float mix instead.
print(df1.min(numeric_only=True))
print()
print(df1.sum(numeric_only=True))

print()
print(df1)
# Row-wise: smallest / largest numeric value in each row (NaN is skipped).
print(df1.min(axis=1, numeric_only=True))
print(df1.max(axis=1, numeric_only=True))

Age       0.0
Gender    0.0
dtype: float64

Age       14.0
Gender     3.0
dtype: float64

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0
0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
5    1.0
6    1.0
7    1.0
8    0.0
9    0.0
dtype: float64
0    0.0
1    1.0
2    2.0
3    2.0
4    1.0
5    2.0
6    3.0
7    1.0
8    0.0
9    2.0
dtype: float64


In [120]:
# Arithmetic on a DataFrame is applied element by element.
# Multiplication works on every column: strings are repeated, numbers doubled.
# Subtraction is limited to the columns after Name, because df1 - 2 raises an
# exception on the string column.
print(df1 * 2, df1.iloc[:, 1:] - 2, sep="\n")

                 Name  Age  Gender
0        GauravGaurav  0.0     0.0
1  Abhishek1Abhishek1  2.0     0.0
2      KrishnaKrishna  4.0     0.0
3  Abhishek2Abhishek2  4.0     0.0
4    HarshitaHarshita  2.0     2.0
5            JoeyJoey  4.0     2.0
6        ShwetaShweta  6.0     2.0
7            na-1na-1  2.0     NaN
8            na-2na-2  NaN     0.0
9                 NaN  4.0     0.0
   Age  Gender
0 -2.0    -2.0
1 -1.0    -2.0
2  0.0    -2.0
3  0.0    -2.0
4 -1.0    -1.0
5  0.0    -1.0
6  1.0    -1.0
7 -1.0     NaN
8  NaN    -2.0
9  0.0    -2.0


In [121]:
# Type returned by iloc: one row -> Series, a row slice -> DataFrame, one cell -> scalar.
print(type(df1.iloc[0]), type(df1.iloc[0:3]), type(df1.iloc[0, 0]), sep="\n")

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'str'>


In [124]:
# Missing values: isna, fillna, dropna
# isna() gives a True/False mask; summing it counts the missing entries per column.
print(df1.isna(), df1.isna().sum(), sep="\n\n")


    Name    Age  Gender
0  False  False   False
1  False  False   False
2  False  False   False
3  False  False   False
4  False  False   False
5  False  False   False
6  False  False   False
7  False  False    True
8  False   True   False
9   True  False   False

Name      1
Age       1
Gender    1
dtype: int64


In [128]:
print(df1.tail())
print()

# Forward fill: a missing value takes the last valid value above it in the same
# column. ffill()/bfill() are used instead of fillna(method=...), which newer
# pandas releases deprecate.
r = df1.ffill()
print(r)

# Backward fill: a missing value takes the next valid value below it. A NaN in
# the last row has nothing below it and therefore stays NaN.
r = df1.bfill()
print(r)

     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     1.0
8       na-2  1.0     0.0
9       na-2  2.0     0.0
        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     0.0
8       na-2  2.0     0.0
9        NaN  2.0     0.0


In [133]:
print(df1.tail(), end="\n\n")

# One scalar fills every missing cell, whatever the column.
r = df1.fillna(value=10)
print(r)

# A dict fills per column; a column that is not listed (Age here) keeps its NaN.
r = df1.fillna(value={'Name': '--', 'Gender': 1.0})
print(r)

# Fill a numeric column with its own mean.
print()
r = df1['Age'].fillna(value=df1['Age'].mean())
print(r)

# Every fillna call above returned a new object; df1 is printed again to show
# that it still holds its NaN values.
print(df1.tail())

     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0

        Name   Age  Gender
0     Gaurav   0.0     0.0
1  Abhishek1   1.0     0.0
2    Krishna   2.0     0.0
3  Abhishek2   2.0     0.0
4   Harshita   1.0     1.0
5       Joey   2.0     1.0
6     Shweta   3.0     1.0
7       na-1   1.0    10.0
8       na-2  10.0     0.0
9         10   2.0     0.0
        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     1.0
8       na-2  NaN     0.0
9         --  2.0     0.0

0    0.000000
1    1.000000
2    2.000000
3    2.000000
4    1.000000
5    2.000000
6    3.000000
7    1.000000
8    1.555556
9    2.000000
Name: Age, dtype: float64
     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     

In [138]:
# dropna() removes every row in which any column holds a missing value;
# with axis=1 it removes every column that holds one (here: all three columns).
# df1 is printed last to show that it is left unchanged.
print(df1.dropna(), df1.dropna(axis=1), df1, sep="\n\n")

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [142]:
# NaN is truthy: bool(np.nan) evaluates to True (see the printed result).
print(bool(np.nan))

True


In [143]:
# any() / all(): truthiness test per column by default, per row with axis=1.
print(df1.any(), df1.all(), df1.all(axis=1), sep="\n\n")

Name      True
Age       True
Gender    True
dtype: bool

Name       True
Age       False
Gender    False
dtype: bool

0    False
1    False
2    False
3    False
4     True
5     True
6     True
7     True
8    False
9    False
dtype: bool


In [149]:
# Ordering rows with sort_values.
# 1) one key, descending
# 2) two keys, both descending
# 3) two keys with one direction per key: Age descending, Name ascending
print(
    df1.sort_values(by='Name', ascending=False),
    df1.sort_values(by=['Age', "Name"], ascending=False),
    df1.sort_values(by=['Age', "Name"], ascending=[False, True]),
    sep="\n\n",
)

        Name  Age  Gender
8       na-2  NaN     0.0
7       na-1  1.0     NaN
6     Shweta  3.0     1.0
2    Krishna  2.0     0.0
5       Joey  2.0     1.0
4   Harshita  1.0     1.0
0     Gaurav  0.0     0.0
3  Abhishek2  2.0     0.0
1  Abhishek1  1.0     0.0
9        NaN  2.0     0.0

        Name  Age  Gender
6     Shweta  3.0     1.0
2    Krishna  2.0     0.0
5       Joey  2.0     1.0
3  Abhishek2  2.0     0.0
9        NaN  2.0     0.0
7       na-1  1.0     NaN
4   Harshita  1.0     1.0
1  Abhishek1  1.0     0.0
0     Gaurav  0.0     0.0
8       na-2  NaN     0.0

        Name  Age  Gender
6     Shweta  3.0     1.0
3  Abhishek2  2.0     0.0
5       Joey  2.0     1.0
2    Krishna  2.0     0.0
9        NaN  2.0     0.0
1  Abhishek1  1.0     0.0
4   Harshita  1.0     1.0
7       na-1  1.0     NaN
0     Gaurav  0.0     0.0
8       na-2  NaN     0.0


In [164]:
# row / column wise operation: apply
# Sample 5x4 frame of random integers in [10, 20) for the apply examples below.
# A seeded generator is used so that a fresh "Restart & Run All" prints the same
# numbers every time.
tmp = pd.DataFrame(
    np.random.default_rng(42).integers(10, 20, size=(5, 4)),
    columns=["One", 'a', 'b', 10],
)
print(tmp)

   One   a   b  10
0   18  13  10  15
1   18  17  19  11
2   17  12  19  16
3   14  14  17  17
4   17  13  17  18


In [165]:
# apply calls the function once per column by default, once per row with axis=1.
print(tmp.apply(sum))
print()
print(tmp.apply(sum, axis=1))

# Square root of every element. np.sqrt works on a whole column at once, so no
# Python-level loop and no extra import in the middle of the notebook is needed.
print(tmp.apply(np.sqrt))

One    84
a      69
b      82
10     77
dtype: int64

0    56
1    65
2    64
3    62
4    65
dtype: int64
        One         a         b        10
0  4.242641  3.605551  3.162278  3.872983
1  4.242641  4.123106  4.358899  3.316625
2  4.123106  3.464102  4.358899  4.000000
3  3.741657  3.741657  4.123106  4.123106
4  4.123106  3.605551  4.123106  4.242641


In [170]:
# Series.sum applied column by column: the string column is concatenated and
# the numeric column is added up (see the printed result).
print(df2, df2.apply(pd.Series.sum), sep="\n\n")

def sum(iterable):
    """Add up the items of ``iterable`` with ``+`` and return the total.

    The running total starts from the first item instead of a fixed ``0``,
    so the same code adds numbers and concatenates strings.

    Note: this function reuses the name of the built-in ``sum`` and therefore
    shadows it for every cell executed after this one.

    Args:
        iterable: Any iterable whose items support ``+`` with each other, for
            example a list of numbers, a list of strings or a pandas Series.

    Returns:
        The accumulated total, or ``0`` when ``iterable`` is empty (the same
        result the built-in ``sum`` gives for an empty input).
    """
    items = iter(iterable)
    try:
        # Taking the first item through the iterator (not ``iterable[0]``)
        # also works for generators and for a Series without a label 0.
        total = next(items)
    except StopIteration:
        return 0

    for item in items:
        # ``total + item`` instead of ``+=`` so that a mutable first item
        # (e.g. a list) is never modified in place.
        total = total + item
    return total

         NAME  NUMBER
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4

NAME      GauravAbhiskekKrishnaAbhishek2
NUMBER                                10
dtype: object


In [175]:
# Plain Python string: replace() returns a new string, dir() lists the
# methods that are available on it.
s = 'abcAAbc'
print(s.replace('A', '1'), dir(s), sep="\n")

abc11bc
['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'capitalize', 'casefold', 'center', 'count', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'format_map', 'index', 'isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'isidentifier', 'islower', 'isnumeric', 'isprintable', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']


In [179]:
# The .str accessor of a column offers string methods (upper, isalnum,
# replace, ...) that are applied to every value of the column.
print(dir(df1.Name.str), df1.Name.str.upper(), sep="\n")
print()
print(df1.Name.str.isalnum())

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__frozen', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_doc_args', '_freeze', '_get_series_list', '_inferred_dtype', '_is_categorical', '_make_accessor', '_orig', '_parent', '_validate', '_wrap_result', 'capitalize', 'casefold', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'slice', 'slice_replace',

In [182]:
# load and save data
# Reads the CSV through a relative path, so the file has to be in the
# directory the notebook is run from.
# NOTE(review): the "Unnamed: 0" column in the output looks like a row index
# that was written into the file; index_col=0 would probably absorb it — confirm.
df  = pd.read_csv('titanic_filtered.csv')
print(df.head())

   Unnamed: 0  pclass  survived  gender      age  sibsp  parch    fare  \
0           0       1         1  female  29.0000      0      0  211.34   
1           1       1         1    male   0.9167      1      2  151.55   
2           2       1         0  female   2.0000      1      2  151.55   
3           3       1         0    male  30.0000      1      2  151.55   
4           4       1         0  female  25.0000      1      2  151.55   

  embarked  
0        S  
1        S  
2        S  
3        S  
4        S  


In [187]:

# read_csv also accepts a URL, so the same data can be loaded without a local copy.
df = pd.read_csv(r'https://raw.githubusercontent.com/leangaurav/dsc_weekend_20200719/master/code/titanic_filtered.csv')
# isna().any() gives one flag per column: True would mean the column has a missing value.
print(df.head(), df.isna().any(), sep="\n")

   Unnamed: 0  pclass  survived  gender      age  sibsp  parch    fare  \
0           0       1         1  female  29.0000      0      0  211.34   
1           1       1         1    male   0.9167      1      2  151.55   
2           2       1         0  female   2.0000      1      2  151.55   
3           3       1         0    male  30.0000      1      2  151.55   
4           4       1         0  female  25.0000      1      2  151.55   

  embarked  
0        S  
1        S  
2        S  
3        S  
4        S  
Unnamed: 0    False
pclass        False
survived      False
gender        False
age           False
sibsp         False
parch         False
fare          False
embarked      False
dtype: bool


In [None]:
# groups: groupby

In [None]:
# groups and aggregates