# Data Manipulation

### Tools
- NumPy
- Pandas

source for __Markdown usage__: __https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet__

## NumPy: Numerical Python

### Why NumPy?

* effective data storage
* vectoral/matrix operations
* constant/fixed type arrays
* high level interface

#############
- better running time
- better memory usage 

In [12]:
a = [1, 2, 3, 4]
b = [2, 3, 4, 5]

ab = []

for i in range(0, len(a)):
    ab.append(a[i]*b[i])
    
print(ab)

In [1]:
import numpy as np

a = np.array([1, 2, 3, 4])
print(a)
b = np.array([2, 3, 4, 5])
print(b)

a*b

[1 2 3 4]
[2 3 4 5]


array([ 2,  6, 12, 20])

In [3]:
import time

tic = time.time()

ab = []

for i in range(0, len(a)):
    ab.append(a[i]*b[i])
    
print(ab)

toc = time.time()

print('Time to run: ', str((toc-tic)*1000), ' ms')

[2, 6, 12, 20]
Time to run:  0.9996891021728516  ms


In [30]:
ab = [a[i]*b[i] for i in range(0, len(a))]
print(ab)

[2, 6, 12, 20]


In [29]:
import time

tic = time.time()

ab = [a[i]*b[i] for i in range(0, len(a))]
print(ab)

toc = time.time()

print('Time to run: ', str((toc-tic)*1000), ' ms')

[2, 6, 12, 20]
Time to run:  0.0  ms


In [2]:
import time

a = np.random.rand(1000000)
b = np.random.rand(1000000)

tic = time.time()
c = np.dot(a,b)
toc = time.time()

print(c)
print("Vectorized version:" + str(1000*(toc-tic)) + "ms")

c = 0
tic = time.time()
for i in range(1000000):
    c+=a[i]*b[i]        
toc = time.time()

print(c)
print("For loop:" + str(1000*(toc-tic)) + "ms")

250436.23727647518
Vectorized version:41.0616397857666ms
250436.23727647113
For loop:1468.7223434448242ms


### Creating NumPy Arrays

In [31]:
import numpy as np

In [32]:
np.array([1,2,3,4,5])

array([1, 2, 3, 4, 5])

In [33]:
type(np.array([1,2,3,4,5]))

numpy.ndarray

In [38]:
np.zeros(10, dtype = int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [36]:
np.random.randint(10, size = 10)

array([6, 0, 0, 5, 8, 0, 4, 4, 5, 5])

In [35]:
np.random.normal(10, 4, (3,4))

array([[18.31396908,  8.59877665, 14.04009547,  7.07661633],
       [11.17962403,  9.24303285, 10.09269824, 11.42349526],
       [15.33398571, 10.68889422,  9.25814353,  6.30775272]])

In [34]:
? np.random.normal

[1;31mDocstring:[0m
normal(loc=0.0, scale=1.0, size=None)

Draw random samples from a normal (Gaussian) distribution.

The probability density function of the normal distribution, first
derived by De Moivre and 200 years later by both Gauss and Laplace
independently [2]_, is often called the bell curve because of
its characteristic shape (see the example below).

The normal distributions occurs often in nature.  For example, it
describes the commonly occurring distribution of samples influenced
by a large number of tiny, random disturbances, each with its own
unique distribution [2]_.

.. note::
    New code should use the ``normal`` method of a ``default_rng()``
    instance instead; see `random-quick-start`.

Parameters
----------
loc : float or array_like of floats
    Mean ("centre") of the distribution.
scale : float or array_like of floats
    Standard deviation (spread or "width") of the distribution. Must be
    non-negative.
size : int or tuple of ints, optional
    Output s

In [37]:
np.random.randint(0, 10, (3,3))

array([[8, 3, 8],
       [6, 2, 0],
       [7, 4, 4]])

* **ndim**: _boyut sayısı_
* shape: boyut bilgisi
* size: toplam eleman sayısı
* dtype: array veri tipi

In [39]:
a = np.random.randint(10, size = 10)
a

array([3, 4, 8, 8, 1, 8, 0, 3, 2, 7])

In [40]:
a.ndim

1

In [41]:
a.shape

(10,)

In [42]:
a.size

10

In [43]:
a.dtype

dtype('int32')

In [44]:
b = np.random.randint(10, size = (3,5))
b

array([[4, 1, 8, 0, 0],
       [0, 3, 6, 9, 1],
       [6, 6, 7, 8, 5]])

In [45]:
b.ndim

2

In [46]:
b.shape

(3, 5)

In [47]:
b.size

15

In [48]:
b.dtype

dtype('int32')

### Reshaping

In [49]:
np.arange(1,10)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [50]:
np.arange(1,10).reshape((3,3))

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

### Concatenating

In [8]:
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6]])
print(a)
print(b)

[[1 2]
 [3 4]]
[[5 6]]


In [9]:
np.concatenate((a, b), axis=0)

array([[1, 2],
       [3, 4],
       [5, 6]])

In [7]:
np.concatenate((a, b.T), axis=1)

array([[1, 2, 5],
       [3, 4, 6]])

### Splitting

In [10]:
x = np.arange(9.0)
print(x)
np.split(x, 3)

[0. 1. 2. 3. 4. 5. 6. 7. 8.]


[array([0., 1., 2.]), array([3., 4., 5.]), array([6., 7., 8.])]

In [11]:
x = np.arange(8.0)
print(x)
np.split(x, [3, 5, 6, 10])

[0. 1. 2. 3. 4. 5. 6. 7.]


[array([0., 1., 2.]),
 array([3., 4.]),
 array([5.]),
 array([6., 7.]),
 array([], dtype=float64)]

### Indexing

In [51]:
a

array([3, 4, 8, 8, 1, 8, 0, 3, 2, 7])

In [52]:
a[0]

3

In [53]:
a[-1]

7

In [54]:
a[:-1]

array([3, 4, 8, 8, 1, 8, 0, 3, 2])

In [55]:
a[0] = 99

In [56]:
a[0:4]

array([99,  4,  8,  8])

In [57]:
m = np.random.randint(10, size = (3,5))
m

array([[8, 5, 6, 5, 2],
       [9, 7, 2, 6, 0],
       [2, 6, 8, 3, 0]])

In [58]:
m[0][0]

8

In [60]:
m[0,0]

8

In [61]:
m[1,1]

7

In [62]:
m[1][1]

7

In [65]:
m[2,3] = 999

In [66]:
m

array([[  8,   5,   6,   5,   2],
       [  9,   7,   2,   6,   0],
       [  2,   6,   8, 999,   0]])

In [63]:
m[2,3] = 2.9

In [64]:
m

array([[8, 5, 6, 5, 2],
       [9, 7, 2, 6, 0],
       [2, 6, 8, 2, 0]])

In [68]:
m[:,1]

array([5, 7, 6])

In [70]:
m[1, :]

array([9, 7, 2, 6, 0])

In [71]:
m[0:2,0:3]

array([[8, 5, 6],
       [9, 7, 2]])

### Fancy Indexing : coklu index secimi

In [72]:
import numpy as np
v = np.arange(0, 30, 3)
v

array([ 0,  3,  6,  9, 12, 15, 18, 21, 24, 27])

In [73]:
v[1]

3

In [74]:
al_getir = [1, 2, 3]

In [75]:
v[al_getir]

array([3, 6, 9])

In [76]:
m = np.arange(9).reshape((3,3))
m

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [77]:
m[2, [1,2]]

array([7, 8])

In [78]:
m

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [79]:
m[0:1, [1,2]]

array([[1, 2]])

### Conditional Element Selection

In [80]:
import numpy as np
v = np.array([1, 2, 3, 4, 5])
v

array([1, 2, 3, 4, 5])

In [81]:
ab = []
for i in v:
    if i <3:
        ab.append(i)
        
ab

[1, 2]

In [83]:
ab = [i for i in v if i<3]
ab

[1, 2]

In [84]:
v < 3

array([ True,  True, False, False, False])

In [85]:
v[v < 3]

array([1, 2])

In [86]:
v[v > 3]

array([4, 5])

In [87]:
v[v >= 3]

array([3, 4, 5])

In [88]:
v[v <= 3]

array([1, 2, 3])

In [89]:
v[v == 3]

array([3])

In [90]:
v / 5  # ufunc

array([0.2, 0.4, 0.6, 0.8, 1. ])

In [91]:
v*5/10

array([0.5, 1. , 1.5, 2. , 2.5])

In [92]:
v**2

array([ 1,  4,  9, 16, 25], dtype=int32)

In [93]:
v*5

array([ 5, 10, 15, 20, 25])

In [94]:
np.subtract(v, 1)

array([0, 1, 2, 3, 4])

In [95]:
v-1

array([0, 1, 2, 3, 4])

In [96]:
np.mean(v)

3.0

In [97]:
np.sum(v)

15

In [99]:
v.sum()

15

In [98]:
v.min()

1

In [5]:
import time
import numpy as np

size_of_vec = 100000

def pure_python_version():
    t1 = time.time()
    X = range(size_of_vec)
    Y = range(size_of_vec)
    Z = [X[i] + Y[i] for i in range(len(X)) ]
    return time.time() - t1

def numpy_version():
    t1 = time.time()
    X = np.arange(size_of_vec)
    Y = np.arange(size_of_vec)
    Z = X + Y
    return time.time() - t1


t1 = pure_python_version()
t2 = numpy_version()
print(t1, t2)
print("Numpy is in this example " + str(t1/t2) + " faster!")

# https://webcourses.ucf.edu/courses/1249560/pages/python-lists-vs-numpy-arrays-what-is-the-difference#:~:text=It%20provides%20a%20high%2Dperformance,a%20tuple%20of%20nonnegative%20integers.&text=A%20list%20is%20the%20Python,contain%20elements%20of%20different%20types

0.08348870277404785 0.0009431838989257812
Numpy is in this example 88.51794742163801 faster!


## Pandas : Panel Data Analysis

### Creating Pandas DataFrame

https://pandas.pydata.org/about/

In [2]:
import pandas as pd

In [104]:
l = [1,2,39,67,90]

In [105]:
l

[1, 2, 39, 67, 90]

In [106]:
pd.DataFrame(l, columns = ["degisken_ismi"])

Unnamed: 0,degisken_ismi
0,1
1,2
2,39
3,67
4,90


In [107]:
df = pd.DataFrame(l, columns = ["degisken_ismi"])
df

Unnamed: 0,degisken_ismi
0,1
1,2
2,39
3,67
4,90


In [108]:
type(df)

pandas.core.frame.DataFrame

In [109]:
import numpy as np
m = np.arange(1,10).reshape((3,3))
m

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [110]:
type(m)

numpy.ndarray

In [111]:
pd.DataFrame(m, columns = ["var1","var2","var3"])

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [113]:
df = pd.DataFrame(m, columns = ["var1","var2","var3"])
df.head()

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [114]:
df.columns = ("deg1","deg2","deg3")

In [115]:
df

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


In [116]:
df.axes

[RangeIndex(start=0, stop=3, step=1),
 Index(['deg1', 'deg2', 'deg3'], dtype='object')]

In [117]:
df.shape

(3, 3)

In [118]:
df.size

9

In [119]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [120]:
type(df.values)

numpy.ndarray

### Element Operations

In [121]:
import numpy as np
s1 = np.random.randint(10, size = 5)
s2 = np.random.randint(10, size = 5)
s3 = np.random.randint(10, size = 5)

In [122]:
sozluk = {"var1": s1, "var2": s2, "var3": s3}

In [123]:
sozluk

{'var1': array([6, 8, 1, 1, 9]),
 'var2': array([3, 6, 8, 1, 8]),
 'var3': array([6, 5, 2, 6, 0])}

In [124]:
df = pd.DataFrame(sozluk)

In [125]:
df

Unnamed: 0,var1,var2,var3
0,6,3,6
1,8,6,5
2,1,8,2
3,1,1,6
4,9,8,0


In [126]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [127]:
df.index = ['a', 'b', 'c', 'd', 'e']

In [128]:
df

Unnamed: 0,var1,var2,var3
a,6,3,6
b,8,6,5
c,1,8,2
d,1,1,6
e,9,8,0


In [130]:
df['c':'e']

Unnamed: 0,var1,var2,var3
c,1,8,2
d,1,1,6
e,9,8,0


In [None]:
# silme

In [131]:
df.drop("a", axis = 0)

Unnamed: 0,var1,var2,var3
b,8,6,5
c,1,8,2
d,1,1,6
e,9,8,0


In [132]:
df

Unnamed: 0,var1,var2,var3
a,6,3,6
b,8,6,5
c,1,8,2
d,1,1,6
e,9,8,0


In [133]:
df.drop("a", axis = 0, inplace = True)

In [134]:
df

Unnamed: 0,var1,var2,var3
b,8,6,5
c,1,8,2
d,1,1,6
e,9,8,0


In [None]:
#fancy 

In [135]:
l = ["c","e"]

In [136]:
df.drop(l, axis = 0)

Unnamed: 0,var1,var2,var3
b,8,6,5
d,1,1,6


In [None]:
#degiskenler icin

In [137]:
df

Unnamed: 0,var1,var2,var3
b,8,6,5
c,1,8,2
d,1,1,6
e,9,8,0


In [138]:
? df.drop

[1;31mSignature:[0m
 [0mdf[0m[1;33m.[0m[0mdrop[0m[1;33m([0m[1;33m
[0m    [0mlabels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0maxis[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mindex[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcolumns[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlevel[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0minplace[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0merrors[0m[1;33m=[0m[1;34m'raise'[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Drop specified labels from rows or columns.

Remove rows or columns by specifying label names and corresponding
axis, or by specifying directly index or column names. When using a
multi-index, labels on different levels can be removed by specifying
the level.

Parameters
----------
labels : single label or list-like
    Index or column labels to drop.
axis : {0 or 'index', 

In [139]:
"var1" in df

True

In [142]:
df["var1"]

b    8
c    1
d    1
e    9
Name: var1, dtype: int32

In [143]:
type(df["var1"])

pandas.core.series.Series

In [144]:
type(df[["var1"]])

pandas.core.frame.DataFrame

In [145]:
df.var1

b    8
c    1
d    1
e    9
Name: var1, dtype: int32

In [146]:
df["var4"] = df["var1"] / df["var2"]

In [147]:
df

Unnamed: 0,var1,var2,var3,var4
b,8,6,5,1.333333
c,1,8,2,0.125
d,1,1,6,1.0
e,9,8,0,1.125


In [148]:
l = ["var1","var2"]

In [149]:
df.drop(l, axis = 1)

Unnamed: 0,var3,var4
b,5,1.333333
c,2,0.125
d,6,1.0
e,0,1.125


In [150]:
df

Unnamed: 0,var1,var2,var3,var4
b,8,6,5,1.333333
c,1,8,2,0.125
d,1,1,6,1.0
e,9,8,0,1.125


### iloc & loc

In [151]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size = (10,3))
df = pd.DataFrame(m, columns = ["var1","var2","var3"])
df

Unnamed: 0,var1,var2,var3
0,27,26,24
1,29,7,22
2,25,20,17
3,20,3,14
4,23,24,23
5,23,21,7
6,2,4,9
7,19,3,2
8,26,23,9
9,26,19,1


In [152]:
# iloc: integer based selection

In [155]:
? df.iloc

[1;31mType:[0m        property
[1;31mString form:[0m <property object at 0x000002097FC959F0>
[1;31mDocstring:[0m  
Purely integer-location based indexing for selection by position.

``.iloc[]`` is primarily integer position based (from ``0`` to
``length-1`` of the axis), but may also be used with a boolean
array.

Allowed inputs are:

- An integer, e.g. ``5``.
- A list or array of integers, e.g. ``[4, 3, 0]``.
- A slice object with ints, e.g. ``1:7``.
- A boolean array.
- A ``callable`` function with one argument (the calling Series or
  DataFrame) and that returns valid output for indexing (one of the above).
  This is useful in method chains, when you don't have a reference to the
  calling object, but would like to base your selection on some value.

``.iloc`` will raise ``IndexError`` if a requested indexer is
out-of-bounds, except *slice* indexers which allow out-of-bounds
indexing (this conforms with python/numpy *slice* semantics).

See more at :ref:`Selection by Position 

In [159]:
df.iloc[0:3]

Unnamed: 0,var1,var2,var3
0,27,26,24
1,29,7,22
2,25,20,17


In [158]:
df.iloc[0:3][['var1']]

Unnamed: 0,var1
0,27
1,29
2,25


In [156]:
df[0:3]

Unnamed: 0,var1,var2,var3
0,27,26,24
1,29,7,22
2,25,20,17


In [153]:
# loc: label based selection

In [160]:
df.loc[0:3]

Unnamed: 0,var1,var2,var3
0,27,26,24
1,29,7,22
2,25,20,17
3,20,3,14


In [161]:
df[0:3, 'var3']

TypeError: '(slice(0, 3, None), 'var3')' is an invalid key

In [162]:
df[0:3]['var3']

0    24
1    22
2    17
Name: var3, dtype: int32

In [163]:
df[0:3][['var3']]

Unnamed: 0,var3
0,24
1,22
2,17


In [165]:
df.iloc[0:3, 'var3']

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

In [164]:
df.loc[0:3, 'var3']

0    24
1    22
2    17
3    14
Name: var3, dtype: int32

In [166]:
df.iloc[0:3]

Unnamed: 0,var1,var2,var3
0,27,26,24
1,29,7,22
2,25,20,17


In [167]:
df.iloc[0:3][['var3']]

Unnamed: 0,var3
0,24
1,22
2,17


### Conditional Element Selection

In [168]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size = (10,3))
df = pd.DataFrame(m, columns = ["var1","var2","var3"])
df

Unnamed: 0,var1,var2,var3
0,1,21,9
1,8,25,27
2,11,17,25
3,12,28,8
4,27,5,23
5,24,20,21
6,15,15,17
7,22,16,22
8,8,7,1
9,20,7,1


In [169]:
df["var1"]

0     1
1     8
2    11
3    12
4    27
5    24
6    15
7    22
8     8
9    20
Name: var1, dtype: int32

In [172]:
df[["var1", "var2", "var3"]]

Unnamed: 0,var1,var2,var3
0,1,21,9
1,8,25,27
2,11,17,25
3,12,28,8
4,27,5,23
5,24,20,21
6,15,15,17
7,22,16,22
8,8,7,1
9,20,7,1


In [170]:
df[0:2][["var1","var2"]]

Unnamed: 0,var1,var2
0,1,21
1,8,25


In [173]:
df.var1 > 15

0    False
1    False
2    False
3    False
4     True
5     True
6    False
7     True
8    False
9     True
Name: var1, dtype: bool

In [174]:
df[df.var1 > 15]

Unnamed: 0,var1,var2,var3
4,27,5,23
5,24,20,21
7,22,16,22
9,20,7,1


In [175]:
df[df.var1 > 15]["var2"]

4     5
5    20
7    16
9     7
Name: var2, dtype: int32

In [176]:
df[df.var1 > 15][["var2"]]

Unnamed: 0,var2
4,5
5,20
7,16
9,7


In [177]:
df[(df.var1 > 15) & (df.var3 < 5)]

Unnamed: 0,var1,var2,var3
9,20,7,1


In [178]:
df.iloc[(df.var1 > 15), ["var1","var2"]]

NotImplementedError: iLocation based boolean indexing on an integer type is not available

In [179]:
df.loc[(df.var1 > 15), ["var1","var2"]]

Unnamed: 0,var1,var2
4,27,5
5,24,20
7,22,16
9,20,7


In [180]:
df[(df.var1 > 15)][["var1","var2"]]

Unnamed: 0,var1,var2
4,27,5
5,24,20
7,22,16
9,20,7


In [181]:
df.loc[(df.var1 > 7) & (df.var3 > 3), ['var1', 'var2']]

Unnamed: 0,var1,var2
1,8,25
2,11,17
3,12,28
4,27,5
5,24,20
6,15,15
7,22,16


# Aggregation & Grouping

In [13]:
import pandas as pd
import numpy as np

In [15]:
df = pd.read_csv("C:/Users/yakup/PycharmProjects/dsmlbc/datasets/train.csv")

In [16]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- count()
- first()
- last()
- mean()
- median()
- min()
- max()
- std()
- var()
- sum()

In [6]:
df[["Age","Fare"]].max()

Age      80.0000
Fare    512.3292
dtype: float64

In [7]:
df.max()

PassengerId                            891
Survived                                 1
Pclass                                   3
Name           van Melkebeke, Mr. Philemon
Sex                                   male
Age                                     80
SibSp                                    8
Parch                                    6
Ticket                           WE/P 5735
Fare                               512.329
dtype: object

In [8]:
df.groupby("Sex").agg({"Age":"mean"})

Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,27.915709
male,30.726645


In [9]:
df.groupby("Sex").agg({"Age":"mean", "Survived":"mean"})

Unnamed: 0_level_0,Age,Survived
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,27.915709,0.742038
male,30.726645,0.188908


In [10]:
df.groupby("Name").agg({"Age":"mean"})

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
"Abbing, Mr. Anthony",42.0
"Abbott, Mr. Rossmore Edward",16.0
"Abbott, Mrs. Stanton (Rosa Hunt)",35.0
"Abelson, Mr. Samuel",30.0
"Abelson, Mrs. Samuel (Hannah Wizosky)",28.0
...,...
"de Mulder, Mr. Theodore",30.0
"de Pelsmaeker, Mr. Alfons",16.0
"del Carlo, Mr. Sebastiano",29.0
"van Billiard, Mr. Austin Blyler",40.5


In [11]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
df.groupby(["Sex","Pclass"]).agg({"Age":"mean"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Sex,Pclass,Unnamed: 2_level_1
female,1,34.611765
female,2,28.722973
female,3,21.75
male,1,41.281386
male,2,30.740707
male,3,26.507589


In [13]:
df.groupby(["Sex","Pclass","Embarked"]).agg({"Survived":"mean", "Age":"max"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Survived,Age
Sex,Pclass,Embarked,Unnamed: 3_level_1,Unnamed: 4_level_1
female,1,C,0.976744,60.0
female,1,Q,1.0,33.0
female,1,S,0.958333,63.0
female,2,C,1.0,28.0
female,2,Q,1.0,30.0
female,2,S,0.910448,57.0
female,3,C,0.652174,45.0
female,3,Q,0.727273,39.0
female,3,S,0.375,63.0
male,1,C,0.404762,71.0


In [14]:
df.groupby(["Sex","Pclass","Embarked"]).agg({"Survived":"mean", "Age":["min", np.mean, "max"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Survived,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,min,mean,max
Sex,Pclass,Embarked,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,1,C,0.976744,16.0,36.052632,60.0
female,1,Q,1.0,33.0,33.0,33.0
female,1,S,0.958333,2.0,32.704545,63.0
female,2,C,1.0,3.0,19.142857,28.0
female,2,Q,1.0,30.0,30.0,30.0
female,2,S,0.910448,2.0,29.719697,57.0
female,3,C,0.652174,0.75,14.0625,45.0
female,3,Q,0.727273,15.0,22.85,39.0
female,3,S,0.375,1.0,23.223684,63.0
male,1,C,0.404762,17.0,40.111111,71.0


In [18]:
# apply 

In [15]:
df["Age"].sum()

21205.17

In [16]:
df[["Age","Parch"]].sum()

Age      21205.17
Parch      340.00
dtype: float64

In [17]:
df[["Age","Parch"]].apply(np.sum, axis = 0)

Age      21205.17
Parch      340.00
dtype: float64

In [18]:
df[["Age","Parch"]].apply(lambda x: x**2)

Unnamed: 0,Age,Parch
0,484.0,0
1,1444.0,0
2,676.0,0
3,1225.0,0
4,1225.0,0
...,...,...
886,729.0,0
887,361.0,0
888,,4
889,676.0,0


In [19]:
df[["Age","Parch"]].apply(lambda x: (x-x.mean())/x.std())

Unnamed: 0,Age,Parch
0,-0.530005,-0.473408
1,0.571430,-0.473408
2,-0.254646,-0.473408
3,0.364911,-0.473408
4,0.364911,-0.473408
...,...,...
886,-0.185807,-0.473408
887,-0.736524,-0.473408
888,,2.007806
889,-0.254646,-0.473408


In [20]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [21]:
df[["Age","Parch"]].transform(lambda x: (x-x.mean())/x.std())

Unnamed: 0,Age,Parch
0,-0.530005,-0.473408
1,0.571430,-0.473408
2,-0.254646,-0.473408
3,0.364911,-0.473408
4,0.364911,-0.473408
...,...,...
886,-0.185807,-0.473408
887,-0.736524,-0.473408
888,,2.007806
889,-0.254646,-0.473408


In [22]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [23]:
df.pivot_table("Survived", index = "Sex", columns = "Age")

Age,0.42,0.67,0.75,0.83,0.92,1.00,2.00,3.00,4.00,5.00,...,62.00,63.00,64.00,65.00,66.00,70.00,70.50,71.00,74.00,80.00
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
female,,,1.0,,,1.0,0.333333,0.5,1.0,1.0,...,1.0,1.0,,,,,,,,
male,1.0,1.0,,1.0,1.0,0.6,0.25,1.0,0.4,,...,0.333333,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [26]:
?pd.cut

[1;31mSignature:[0m
[0mpd[0m[1;33m.[0m[0mcut[0m[1;33m([0m[1;33m
[0m    [0mx[0m[1;33m,[0m[1;33m
[0m    [0mbins[0m[1;33m,[0m[1;33m
[0m    [0mright[0m[1;33m:[0m [0mbool[0m [1;33m=[0m [1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mlabels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mretbins[0m[1;33m:[0m [0mbool[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mprecision[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m3[0m[1;33m,[0m[1;33m
[0m    [0minclude_lowest[0m[1;33m:[0m [0mbool[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mduplicates[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'raise'[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Bin values into discrete intervals.

Use `cut` when you need to segment and sort data values into bins. This
function is also useful for going from a continuous variable to a
categorical variable. For example, `cut` cou

In [31]:
df["NewAge"] = pd.cut(df["Age"], [0,10,18,25,40,90])

In [28]:
df["NewAge"] = pd.cut(df["Age"], [0,10,18,25,40,90], labels = ['Child', 'Teen', 'Young', 'MidAged', 'Old'])

In [29]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NewAge
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Young
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,MidAged
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,MidAged
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,MidAged
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,MidAged


In [30]:
df.pivot_table("Survived", index = "Sex", columns = "NewAge")

NewAge,Child,Teen,Young,MidAged,Old
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0.612903,0.72973,0.759259,0.802198,0.770833
male,0.575758,0.131579,0.12037,0.22093,0.176471


In [32]:
df.pivot_table("Survived", index = "Sex", columns = "NewAge")

NewAge,"(0, 10]","(10, 18]","(18, 25]","(25, 40]","(40, 90]"
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0.612903,0.72973,0.759259,0.802198,0.770833
male,0.575758,0.131579,0.12037,0.22093,0.176471
