## Numpy

### Create Arrays 

In [3]:
import numpy as np

In [26]:
# create arrays of 1 or 0

print(np.zeros(5, dtype='int'), '\n')
print(np.zeros([2,2], dtype='int'), '\n')
print(np.zeros((3,3), dtype='int'), '\n')    # () and [] can be used interchangably
print(np.ones((3,5), dtype=float), '\n')

# create with pre-defined value

print(np.full((3,5), 1.23), '\n')

# create a sequence set

print(np.arange(0,20,2), '\n')

# create a range of values with even space

print(np.linspace(0,1,5), '\n')

# create a 3x3 array with mean 0 and standard deviation 1

print(np.random.normal(0, 1, (3,3)), '\n')

# create an identity matrix

print(np.eye(3), '\n')

# create multi-dimension array

np.random.seed(0)
x1 = np.random.randint(5, size=5)          # 1-d
x2 = np.random.randint(5, size=(3,4))      # 2-d
x3 = np.random.randint(5, size=(3,4,5))    # 3-d

print(x1, '\n')
print(x2, '\n')

print(x3)
print(x3.ndim)
print(x3.shape)
print(x3.size)

[0 0 0 0 0] 

[[0 0]
 [0 0]] 

[[0 0 0]
 [0 0 0]
 [0 0 0]] 

[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]] 

[[1.23 1.23 1.23 1.23 1.23]
 [1.23 1.23 1.23 1.23 1.23]
 [1.23 1.23 1.23 1.23 1.23]] 

[ 0  2  4  6  8 10 12 14 16 18] 

[0.   0.25 0.5  0.75 1.  ] 

[[ 0.85292596  0.01835718  0.42830357]
 [ 0.99627783 -0.49114966  0.71267817]
 [ 1.11334035 -2.15367459 -0.41611148]] 

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]] 

[4 0 3 3 3] 

[[1 3 2 4]
 [0 0 4 2]
 [1 0 1 1]] 

[[[0 1 4 3 0]
  [3 0 2 3 0]
  [1 3 3 3 0]
  [1 1 1 0 2]]

 [[4 3 3 2 4]
  [2 0 0 4 0]
  [4 1 4 1 2]
  [2 0 1 1 1]]

 [[1 3 3 2 3]
  [0 3 4 1 2]
  [4 3 4 4 4]
  [3 4 4 4 0]]]
3
(3, 4, 5)
60


### Indexing and Slicing

In [27]:
# simple indexing

print(x1[0])
print(x1[-1], '\n')    # get the last value

# multi-dimension indexing

print(x2[2,])
print(x2[2,3])
print(x2[2,-1], '\n')

print(x3[0], '\n')
print(x3[0,1], '\n')
print(x3[0,1,2])

# slicing

print(x1[:2])
print(x1[2:])
print(x1[::2])    # return elements at even space
print(x1[1::2])   # return elements at from the specified position step by two
print(x1[::-1])   # reverse the array

4
3 

[1 0 1 1]
1
1 

[[0 1 4 3 0]
 [3 0 2 3 0]
 [1 3 3 3 0]
 [1 1 1 0 2]] 

[3 0 2 3 0] 

2
[4 0]
[3 3 3]
[4 3 3]
[0 3]
[3 3 3 0 4]


### Concatenation and Split

In [28]:
# concatenate two or more arrays at once

x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
z = [21, 21, 21]
print(np.concatenate([x, y, z]), '\n')

# create 2-dimensional arrays

n = np.array([[1,2,3], [4,5,6]])
print(n.ndim)
print(np.concatenate([n, n]), '\n')

# using its axis parameter, you can define row-wise or column-wise matrix

print(np.concatenate([n, n],axis=0), '\n')
print(np.concatenate([n, n],axis=1))

[ 1  2  3  3  2  1 21 21 21] 

2
[[1 2 3]
 [4 5 6]
 [1 2 3]
 [4 5 6]] 

[[1 2 3]
 [4 5 6]
 [1 2 3]
 [4 5 6]] 

[[1 2 3 1 2 3]
 [4 5 6 4 5 6]]


In [29]:
# use np.vstack or np.hstack to concatenate 
# object with different dimensions

print(np.vstack([x, n]), '\n')

z = np.array([[9],[9]])
print(np.hstack([n, z]))

[[1 2 3]
 [1 2 3]
 [4 5 6]] 

[[1 2 3 9]
 [4 5 6 9]]


In [30]:
# split arrays based on pre-defined positions

x = np.arange(10)
print(x)
x1, x2, x3 = np.split(x, [3,6])
print(x1, x2, x3, '\n')

# split matrix

n = np.arange(16).reshape(4,4)
upper, lower = np.vsplit(n, [3])
print(n, '\n')
print(upper, lower, '\n')

left, right = np.hsplit(n, [2])
print(left, right, '\n')

[0 1 2 3 4 5 6 7 8 9]
[0 1 2] [3 4 5] [6 7 8 9] 

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]] 

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]] [[12 13 14 15]] 

[[ 0  1]
 [ 4  5]
 [ 8  9]
 [12 13]] [[ 2  3]
 [ 6  7]
 [10 11]
 [14 15]] 



## Pandas

### Sort

In [1]:
import pandas as pd

In [33]:
# let's create another data frame

data = pd.DataFrame({'group':['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                     'ounces':[4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

# let's sort the data frame by one or more columns

data.sort_values(by=['ounces'], ascending=True, inplace=True)
display(data)
data.sort_values(by=['group','ounces'],ascending=[True,False], inplace=True)
display(data)

Unnamed: 0,group,ounces
1,a,3.0
6,c,3.0
0,a,4.0
7,c,5.0
3,b,6.0
8,c,6.0
4,b,7.5
5,b,8.0
2,a,12.0


Unnamed: 0,group,ounces
2,a,12.0
0,a,4.0
1,a,3.0
5,b,8.0
4,b,7.5
3,b,6.0
8,c,6.0
7,c,5.0
6,c,3.0


### Remove Duplicates

In [34]:
# remove duplicates 

data = pd.DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[3,2,1,3,3,4,4]})
display(data)
display(data.drop_duplicates())

# alternatively, we can also remove duplicates based on just one particular column
# let's remove duplicate values from the k1 column

display(data.drop_duplicates(subset='k1'))

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
4,two,3
5,two,4
6,two,4


Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
5,two,4


Unnamed: 0,k1,k2
0,one,3
3,two,3


### Create New Variables

In [35]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 
                              'Bacon', 'pastrami', 'honey ham','nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [36]:
# 1. create a new variable based on the values of another column

meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [37]:
# 2. create a new variable based on a function

def meat_2_animal(series):
    if series['food'] == 'bacon':
        return 'pig'
    elif series['food'] == 'pulled pork':
        return 'pig'
    elif series['food'] == 'pastrami':
        return 'cow'
    elif series['food'] == 'corned beef':
        return 'cow'
    elif series['food'] == 'honey ham':
        return 'pig'
    else:
        return 'salmon'
    
lower = lambda x: x.lower()
data['food'] = data['food'].apply(lower)
data['animal2'] = data.apply(meat_2_animal, axis='columns')
data

Unnamed: 0,food,ounces,animal,animal2
0,bacon,4.0,pig,pig
1,pulled pork,3.0,pig,pig
2,bacon,12.0,pig,pig
3,pastrami,6.0,cow,cow
4,corned beef,7.5,cow,cow
5,bacon,8.0,pig,pig
6,pastrami,3.0,cow,cow
7,honey ham,5.0,pig,pig
8,nova lox,6.0,salmon,salmon


In [38]:
# 3. another way to create a new variable is by using the assign function
# this method is particularly helpful to the numerical variables

data.assign(new_variable = data['ounces']*10)

Unnamed: 0,food,ounces,animal,animal2,new_variable
0,bacon,4.0,pig,pig,40.0
1,pulled pork,3.0,pig,pig,30.0
2,bacon,12.0,pig,pig,120.0
3,pastrami,6.0,cow,cow,60.0
4,corned beef,7.5,cow,cow,75.0
5,bacon,8.0,pig,pig,80.0
6,pastrami,3.0,cow,cow,30.0
7,honey ham,5.0,pig,pig,50.0
8,nova lox,6.0,salmon,salmon,60.0


### Drop and Rename Column

In [39]:
# remove column

data.drop('animal2',axis='columns',inplace=True)
display(data)

# rename column and row names using rename function

data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])

data.rename(index = {'Ohio':'SanF'}, columns={'one':'one_p','two':'two_p'},inplace=True)
display(data)

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### Replacing and Filling NA

In [40]:
# replacing or filling in values

data = pd.Series([1., -999., 2., -999., -1000., 3.])

data.replace([-999, -1000], np.nan, inplace=True)
display(data)

data.fillna(100.0, inplace=True)
display(data)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

0      1.0
1    100.0
2      2.0
3    100.0
4    100.0
5      3.0
dtype: float64

### Binning

In [41]:
# bin continuous variables
# we'll divide the ages into bins such as 18-25, 26-35,36-60 and 60 and above.

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
display(cats)

# adjust bin boundaries

cats = pd.cut(ages,bins,right=False)
display(cats)

display(cats.value_counts())

# pass unique name to each label

bin_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
new_cats = pd.cut(ages, bins,labels=bin_names)
display(new_cats)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

[18, 25)     4
[25, 35)     4
[35, 60)     3
[60, 100)    1
dtype: int64

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAge', 'MiddleAge', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAge' < 'Senior']

In [None]:
# another method

labels = ["{0}-{1}".format(i, i+9) for i in range(0,100,10)]
df.Age = pd.cut(df.Age, range(0,101,10), right=False, labels=labels)
df.Age = df.Age.astype('object')

### Groupby

In [12]:
# grouping data and creating pivots in pandas

df1 = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df1

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.665997,-1.062169
1,a,two,-1.063057,0.66466
2,b,one,-1.55302,0.247284
3,b,two,-0.806699,0.835481
4,a,one,0.429483,0.313688


In [13]:
# calculate the mean of data1 column by key1

df1['data1'].groupby(df1['key1']).mean()

key1
a    0.010808
b   -1.179860
Name: data1, dtype: float64

### Slicing a Dataframe

In [14]:
dates = pd.date_range('20130101',periods=6)
df2 = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

# get first n rows

display(df2[:4])

# get based on column names

display(df2.loc[:, ['A','B']])

# slicing based on both rows and columns

display(df2.loc['20130102':'20130103', ['A','B']])

Unnamed: 0,A,B,C,D
2013-01-01,-0.721168,-0.000716,-0.251717,-0.742661
2013-01-02,1.248246,0.711003,-0.778874,0.076263
2013-01-03,0.475916,0.30556,-0.483136,-0.725237
2013-01-04,-0.879725,-0.114804,-0.435781,0.35779


Unnamed: 0,A,B
2013-01-01,-0.721168,-0.000716
2013-01-02,1.248246,0.711003
2013-01-03,0.475916,0.30556
2013-01-04,-0.879725,-0.114804
2013-01-05,-0.923669,-1.176535
2013-01-06,0.099438,-0.356465


Unnamed: 0,A,B
2013-01-02,1.248246,0.711003
2013-01-03,0.475916,0.30556


In [15]:
# slice based on index

display(df2.loc[df.index[:4], ])
# or
display(df2['20130101':'20130102'])

# get a range of rows and columns
display(df2.iloc[2:4, 0:2])

# get specific rows and columns using columns and row indexes
display(df2.iloc[[1,5], [0,2]])

Unnamed: 0,A,B,C,D
2013-01-01,-0.721168,-0.000716,-0.251717,-0.742661
2013-01-02,1.248246,0.711003,-0.778874,0.076263
2013-01-03,0.475916,0.30556,-0.483136,-0.725237
2013-01-04,-0.879725,-0.114804,-0.435781,0.35779


Unnamed: 0,A,B,C,D
2013-01-01,-0.721168,-0.000716,-0.251717,-0.742661
2013-01-02,1.248246,0.711003,-0.778874,0.076263


Unnamed: 0,A,B
2013-01-03,0.475916,0.30556
2013-01-04,-0.879725,-0.114804


Unnamed: 0,A,C
2013-01-02,1.248246,-0.778874
2013-01-06,0.099438,1.696193


In [None]:
# boolean indexing

display(df2[df2.A > 0.05])

# select rows based on column values

display(df1['E']==['one', 'one','two','three','four','three'])
display(df1[df1['E'].isin(['two','four'])])

# select all rows except those with two and four

display(df1[~df1['E'].isin(['two','four'])])

### Query Method

In [59]:
# list all columns where A is greater than C

display(df.query('A > C'))

# using OR condition

display(df.query('A < B | C > A'))

Unnamed: 0,A,B,C,D
2013-01-01,0.480906,-0.574142,-0.252468,1.039847
2013-01-02,-0.02348,0.089422,-0.594327,-0.105697
2013-01-03,0.742796,-0.754982,-0.138283,-2.12492
2013-01-05,1.481513,1.659277,0.00014,1.505838


Unnamed: 0,A,B,C,D
2013-01-02,-0.02348,0.089422,-0.594327,-0.105697
2013-01-04,-1.030019,-0.517452,-0.1418,-1.27648
2013-01-05,1.481513,1.659277,0.00014,1.505838
2013-01-06,0.104521,-0.236043,0.819435,0.402856


### Pivot Table

In [60]:
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

# calculate means of each group

display(data.pivot_table(values='ounces',index='group',aggfunc=np.mean))

# calculate count by each group

display(data.pivot_table(values='ounces',index='group',aggfunc='count'))

Unnamed: 0_level_0,ounces
group,Unnamed: 1_level_1
a,6.333333
b,7.166667
c,4.666667


Unnamed: 0_level_0,ounces
group,Unnamed: 1_level_1
a,3
b,3
c,3
