## Numpy

### Creating Arrays

In [2]:
import numpy as np

In [11]:
# create arrays of 1 or 0

print(np.zeros(5, dtype='int'))
print(np.zeros([2,2], dtype='int'))
print(np.zeros((3,3), dtype='int'))    # () and [] can be used interchangably
print(np.ones((3,5), dtype=float),'\n')

# create with pre-defined value

print(np.full((3,5), 1.23))

# create a sequence set

print(np.arange(0,20,2))

# create a range of values with even space

print(np.linspace(0,1,5))

# create a 3x3 array with mean 0 and standard deviation 1

print(np.random.normal(0, 1, (3,3)))

# create an identity matrix

print(np.eye(3), '\n')

# create multi-dimension array

np.random.seed(0)
x1 = np.random.randint(5, size=5)          # 1-d
x2 = np.random.randint(5, size=(3,4))      # 2-d
x3 = np.random.randint(5, size=(3,4,5))    # 3-d

print(x1)
print(x2)
print(x3,'\n')

print(x3.ndim)
print(x3.shape)
print(x3.size)

[0 0 0 0 0]
[[0 0]
 [0 0]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]] 

[[1.23 1.23 1.23 1.23 1.23]
 [1.23 1.23 1.23 1.23 1.23]
 [1.23 1.23 1.23 1.23 1.23]]
[ 0  2  4  6  8 10 12 14 16 18]
[0.   0.25 0.5  0.75 1.  ]
[[-0.74385608 -2.5174371  -1.50709602]
 [ 1.14907613 -1.19357825  1.14104245]
 [ 1.50944508  1.06777513 -0.68658948]]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]] 

[4 0 3 3 3]
[[1 3 2 4]
 [0 0 4 2]
 [1 0 1 1]]
[[[0 1 4 3 0]
  [3 0 2 3 0]
  [1 3 3 3 0]
  [1 1 1 0 2]]

 [[4 3 3 2 4]
  [2 0 0 4 0]
  [4 1 4 1 2]
  [2 0 1 1 1]]

 [[1 3 3 2 3]
  [0 3 4 1 2]
  [4 3 4 4 4]
  [3 4 4 4 0]]] 

3
(3, 4, 5)
60


### Indexing

In [12]:
# simple indexing

print(x1[0])
print(x1[-1], '\n')    # get the last value

# multi-dimension indexing

print(x2[2,])
print(x2[2,3])
print(x2[2,-1], '\n')

print(x3[0])
print(x3[0,1])
print(x3[0,1,2],'\n')

# slicing

print(x1[:2])
print(x1[2:])
print(x1[::2])    # return elements at even space
print(x1[1::2])   # return elements at from the specified position step by two
print(x1[::-1])   # reverse the array

4
3 

[1 0 1 1]
1
1 

[[0 1 4 3 0]
 [3 0 2 3 0]
 [1 3 3 3 0]
 [1 1 1 0 2]]
[3 0 2 3 0]
2 

[4 0]
[3 3 3]
[4 3 3]
[0 3]
[3 3 3 0 4]


### Concatenation and Split

In [13]:
# concatenate 2-dimensional arrays

x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
z = [21, 21, 21]
print(np.concatenate([x, y, z]), '\n')

# concatenate 2-dimensional arrays

n = np.array([[1,2,3], [4,5,6]])
print(n.ndim)
print(np.concatenate([n, n]), '\n')

# using its axis parameter, you can define row-wise or column-wise matrix

print(np.concatenate([n, n],axis=0))
print(np.concatenate([n, n],axis=1))

[ 1  2  3  3  2  1 21 21 21] 

2
[[1 2 3]
 [4 5 6]
 [1 2 3]
 [4 5 6]] 

[[1 2 3]
 [4 5 6]
 [1 2 3]
 [4 5 6]]
[[1 2 3 1 2 3]
 [4 5 6 4 5 6]]


In [15]:
# use np.vstack or np.hstack to concatenate 
# object with different dimensions

print(np.vstack([n, n]), '\n')

z = np.array([[9],[9]])
print(np.hstack([n, z]))

[[1 2 3]
 [4 5 6]
 [1 2 3]
 [4 5 6]] 

[[1 2 3 9]
 [4 5 6 9]]


In [17]:
# split arrays based on pre-defined positions

x = np.arange(10)
print(x)
x1, x2, x3 = np.split(x, [3,6])    # 3 and 6 are the positions at where we split the array
print(x1, x2, x3, '\n')

# split matrix

n = np.arange(16).reshape(4,4)
upper, lower = np.vsplit(n, [3])
print(n, upper, lower, '\n')

left, right = np.hsplit(n, [2])
print(n, left, right, '\n')

[0 1 2 3 4 5 6 7 8 9]
[0 1 2] [3 4 5] [6 7 8 9] 

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]] [[12 13 14 15]] 

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]] [[ 0  1]
 [ 4  5]
 [ 8  9]
 [12 13]] [[ 2  3]
 [ 6  7]
 [10 11]
 [14 15]] 



## Pandas

### Sort

In [18]:
import pandas as pd

In [19]:
# let's create another data frame

data = pd.DataFrame({'group':['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                     'ounces':[4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
display(data)

# let's sort the data frame by one or more columns

data.sort_values(by=['ounces'], ascending=True, inplace=True)
display(data)
data.sort_values(by=['group','ounces'],ascending=[True,False], inplace=True)
display(data)

Unnamed: 0,group,ounces
0,a,4.0
1,a,3.0
2,a,12.0
3,b,6.0
4,b,7.5
5,b,8.0
6,c,3.0
7,c,5.0
8,c,6.0


Unnamed: 0,group,ounces
1,a,3.0
6,c,3.0
0,a,4.0
7,c,5.0
3,b,6.0
8,c,6.0
4,b,7.5
5,b,8.0
2,a,12.0


Unnamed: 0,group,ounces
2,a,12.0
0,a,4.0
1,a,3.0
5,b,8.0
4,b,7.5
3,b,6.0
8,c,6.0
7,c,5.0
6,c,3.0


### Remove Duplicates

In [20]:
# remove duplicates 

data = pd.DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[3,2,1,3,3,4,4]})
display(data)
display(data.drop_duplicates())

# alternatively, we can also remove duplicates based on just one particular column
# let's remove duplicate values from the k1 column

display(data.drop_duplicates(subset='k1'))

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
4,two,3
5,two,4
6,two,4


Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
5,two,4


Unnamed: 0,k1,k2
0,one,3
3,two,3


### Creating New Variables

In [21]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 
                              'Bacon', 'pastrami', 'honey ham','nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [22]:
# 1. mapping method, create a new variable based on the values of another column

meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [23]:
# 2. apply method, create a new variable based on a function

def meat_2_animal(series):
    if series['food'] == 'bacon':
        return 'pig'
    elif series['food'] == 'pulled pork':
        return 'pig'
    elif series['food'] == 'pastrami':
        return 'cow'
    elif series['food'] == 'corned beef':
        return 'cow'
    elif series['food'] == 'honey ham':
        return 'pig'
    else:
        return 'salmon'
    
lower = lambda x: x.lower()
data['food'] = data['food'].apply(lower)
data['animal2'] = data.apply(meat_2_animal, axis='columns')
data

Unnamed: 0,food,ounces,animal,animal2
0,bacon,4.0,pig,pig
1,pulled pork,3.0,pig,pig
2,bacon,12.0,pig,pig
3,pastrami,6.0,cow,cow
4,corned beef,7.5,cow,cow
5,bacon,8.0,pig,pig
6,pastrami,3.0,cow,cow
7,honey ham,5.0,pig,pig
8,nova lox,6.0,salmon,salmon


In [24]:
# 3. assign method, create a new numerical variable

data.assign(new_ounces = data['ounces']*10)

Unnamed: 0,food,ounces,animal,animal2,new_ounces
0,bacon,4.0,pig,pig,40.0
1,pulled pork,3.0,pig,pig,30.0
2,bacon,12.0,pig,pig,120.0
3,pastrami,6.0,cow,cow,60.0
4,corned beef,7.5,cow,cow,75.0
5,bacon,8.0,pig,pig,80.0
6,pastrami,3.0,cow,cow,30.0
7,honey ham,5.0,pig,pig,50.0
8,nova lox,6.0,salmon,salmon,60.0


### Drop and Rename Column

In [25]:
# remove column

data.drop('animal2', axis='columns',inplace=True)
display(data)

# rename column and row names using rename function

data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])
display(data)

data.rename(index = {'Ohio':'SanF'}, columns={'one':'one_p','two':'two_p'},inplace=True)
display(data)

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Unnamed: 0,one_p,two_p,three,four
SanF,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


### Replacing and Filling NA

In [26]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
display(data)
data.replace([-999, -1000], np.nan, inplace=True)
display(data)

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [27]:
data.fillna(100.0, inplace=True)
display(data)

0      1.0
1    100.0
2      2.0
3    100.0
4    100.0
5      3.0
dtype: float64

### Groupby

In [55]:
# grouping data and creating pivots in pandas

df1 = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                    'key2' : ['one', 'two', 'one', 'two', 'one'],
                    'data1' : np.random.randn(5),
                    'data2' : np.random.randn(5)})
df1

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.954634,0.838311
1,a,two,-0.248805,-0.416312
2,b,one,-0.055088,1.356928
3,b,two,0.28207,-1.197659
4,a,one,-1.894318,-1.677046


In [29]:
# calculate the mean of data1 column by key1

df1['data1'].groupby(df1['key1']).mean()

key1
a   -1.484957
b   -0.179010
Name: data1, dtype: float64

### Slicing a DataFrame

In [51]:
dates = pd.date_range('20130101',periods=6)
df2 = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
display(df2)

# get first n rows

display(df2[:2])

# slice based on index
# if don't know index names

display(df2.loc[df2.index[:4], ])

# if know index names
display(df2['20130101':'20130104'])

Unnamed: 0,A,B,C,D
2013-01-01,0.982422,-1.180951,0.089944,-0.706245
2013-01-02,0.052225,-1.341579,1.382942,0.386913
2013-01-03,-1.332453,0.503817,1.032916,-0.122224
2013-01-04,1.832638,-0.065733,1.863777,-0.123691
2013-01-05,1.220399,-0.006587,0.593745,-0.663979
2013-01-06,-0.607423,-1.56437,-1.733742,-0.615564


Unnamed: 0,A,B,C,D
2013-01-01,0.982422,-1.180951,0.089944,-0.706245
2013-01-02,0.052225,-1.341579,1.382942,0.386913


Unnamed: 0,A,B,C,D
2013-01-01,0.982422,-1.180951,0.089944,-0.706245
2013-01-02,0.052225,-1.341579,1.382942,0.386913
2013-01-03,-1.332453,0.503817,1.032916,-0.122224
2013-01-04,1.832638,-0.065733,1.863777,-0.123691


Unnamed: 0,A,B,C,D
2013-01-01,0.982422,-1.180951,0.089944,-0.706245
2013-01-02,0.052225,-1.341579,1.382942,0.386913
2013-01-03,-1.332453,0.503817,1.032916,-0.122224
2013-01-04,1.832638,-0.065733,1.863777,-0.123691


In [52]:
# slicing based on both rows and columns

display(df2.loc['20130102':'20130103', ['A','B']])

# get specific rows and columns using columns and row indexes

display(df2.iloc[[1,5], [0,2]])

Unnamed: 0,A,B
2013-01-02,0.052225,-1.341579
2013-01-03,-1.332453,0.503817


Unnamed: 0,A,C
2013-01-02,0.052225,1.382942
2013-01-06,-0.607423,-1.733742


In [62]:
df1['E'] = 'one two three four five'.split()

# boolean indexing

display(df2[df2.A > 0.05])

# select rows based on column values

display(df1['E']==['one','two','three','five','three'])
display(df1[df1['E'].isin(['two','four'])])

# select all rows except those with two and four

display(df1[~df1['E'].isin(['two','four'])])

Unnamed: 0,A,B,C,D
2013-01-01,0.982422,-1.180951,0.089944,-0.706245
2013-01-02,0.052225,-1.341579,1.382942,0.386913
2013-01-04,1.832638,-0.065733,1.863777,-0.123691
2013-01-05,1.220399,-0.006587,0.593745,-0.663979


0     True
1     True
2     True
3    False
4    False
Name: E, dtype: bool

Unnamed: 0,key1,key2,data1,data2,E
1,a,two,-0.248805,-0.416312,two
3,b,two,0.28207,-1.197659,four


Unnamed: 0,key1,key2,data1,data2,E
0,a,one,-0.954634,0.838311,one
2,b,one,-0.055088,1.356928,three
4,a,one,-1.894318,-1.677046,five


### Query Method

In [70]:
# list all columns where A is greater than C

display(df1.query('data1 > data2'))

# using OR condition

display(df1.query('data1 > data2 | key1 == "a"'))

Unnamed: 0,key1,key2,data1,data2,E
1,a,two,-0.248805,-0.416312,two
3,b,two,0.28207,-1.197659,four


Unnamed: 0,key1,key2,data1,data2,E
0,a,one,-0.954634,0.838311,one
1,a,two,-0.248805,-0.416312,two
3,b,two,0.28207,-1.197659,four
4,a,one,-1.894318,-1.677046,five


### Pivot Table

In [71]:
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
display(data)

# calculate means of each group

display(data.pivot_table(values='ounces',index='group',aggfunc=np.mean))

# calculate count by each group

display(data.pivot_table(values='ounces',index='group',aggfunc='count'))

Unnamed: 0,group,ounces
0,a,4.0
1,a,3.0
2,a,12.0
3,b,6.0
4,b,7.5
5,b,8.0
6,c,3.0
7,c,5.0
8,c,6.0


Unnamed: 0_level_0,ounces
group,Unnamed: 1_level_1
a,6.333333
b,7.166667
c,4.666667


Unnamed: 0_level_0,ounces
group,Unnamed: 1_level_1
a,3
b,3
c,3
