# DATA SCIENCE LIBRARY: NUMPY

## ndarray object creation, indexing and slicing (1-dimensional)

In [2]:
import numpy as np

In [9]:
a = np.array([0, 1, 5, 7, 6, 5, 2, 3, 8, 9])

In [10]:
a[3]

7

In [11]:
a[3:7]

array([7, 6, 5, 2])

In [12]:
b = np.array([1, 5, 7])

In [13]:
a[b]

array([1, 5, 3])

In [14]:
a[a > 5]

array([7, 6, 8, 9])

## More methods to quickly create ndarray objects (1D) and (2D)

In [15]:
b = np.arange(1, 20, 3)

In [16]:
b

array([ 1,  4,  7, 10, 13, 16, 19])

In [20]:
a = np.ones((3, 3))

In [21]:
a

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [22]:
b = np.zeros((2, 2))

In [23]:
b

array([[ 0.,  0.],
       [ 0.,  0.]])

## 2-d ndarray object ("a matrix") can be defined, and operations applied

In [26]:
a = np.array([[1, 2, 3], [4, 5, 6]])

In [27]:
a

array([[1, 2, 3],
       [4, 5, 6]])

In [28]:
a.shape

(2, 3)

In [29]:
a.T

array([[1, 4],
       [2, 5],
       [3, 6]])

In [30]:
a.T.shape

(3, 2)

In [31]:
b = np.array([6, 7])

In [32]:
np.dot(a.T, b)

array([34, 47, 60])

## ndarray operations

In [8]:
b = np.arange(12).reshape(3, 4)

In [35]:
b

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [36]:
b.sum(axis = 0)

array([12, 15, 18, 21])

In [37]:
b.sum(axis = 1)

array([ 6, 22, 38])

In [38]:
b.sum()

66

In [39]:
b.min(axis = 0)

array([0, 1, 2, 3])

In [40]:
b.max(axis = 1)

array([ 3,  7, 11])

## Matrix object and 2-d matrix indexing and slicing

In [3]:
# np.mat was removed in NumPy 2.0; np.asmatrix is the same constructor
# (it also parses the 'row; row' string form) and exists in older releases too.
b = np.asmatrix('1 2 3 4; 5 6 7 8; 9 10 11 12')

In [4]:
b

matrix([[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]])

In [5]:
b[:, :3]

matrix([[ 1,  2,  3],
        [ 5,  6,  7],
        [ 9, 10, 11]])

## Matrix object - allows matrix arithmetic using operators

In [6]:
# np.mat was removed in NumPy 2.0; np.asmatrix builds the same 4x1 column matrix.
a = np.asmatrix('1; 2; 3; 4')

In [7]:
b * a

matrix([[ 30],
        [ 70],
        [110]])

## n-dimensional ndarray objects ("matrices") can also be created

In [13]:
aa = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])

In [14]:
aa

array([[[1, 2, 3],
        [4, 5, 6]],

       [[1, 2, 3],
        [4, 5, 6]]])

In [15]:
bb = np.array([[[3], [4], [6]], [[6], [5], [7]]])

In [16]:
bb

array([[[3],
        [4],
        [6]],

       [[6],
        [5],
        [7]]])

In [17]:
aa.shape, bb.shape

((2, 2, 3), (2, 3, 1))

In [20]:
np.dot(aa, bb)

array([[[[29],
         [37]],

        [[68],
         [91]]],


       [[[29],
         [37]],

        [[68],
         [91]]]])

## ndarray element-wise operations - scalar and matrix

In [21]:
aa = np.arange(5)

In [22]:
aa

array([0, 1, 2, 3, 4])

In [23]:
aa * 5

array([ 0,  5, 10, 15, 20])

In [24]:
bb = np.array([2, 4, 6, 8, 10])

In [28]:
np.multiply(aa, bb)

array([ 0,  4, 12, 24, 40])

In [29]:
np.divide(aa, bb.astype(float))

array([ 0.        ,  0.25      ,  0.33333333,  0.375     ,  0.4       ])

## Matrix math

In [30]:
a = np.array([[1, 2], [3, 4]])

In [31]:
a

array([[1, 2],
       [3, 4]])

In [32]:
b = np.array([[1], [2]])

In [33]:
b

array([[1],
       [2]])

In [34]:
a.shape, b.shape

((2, 2), (2, 1))

In [35]:
np.dot(a, b)

array([[ 5],
       [11]])

In [36]:
# (2, 1) x (2, 2) is not a valid matrix product. The failure is the point of this
# cell, so catch and print it rather than letting it abort a "Run All".
try:
    np.dot(b, a)
except ValueError as err:
    print("ValueError:", err)

ValueError: shapes (2,1) and (2,2) not aligned: 1 (dim 1) != 2 (dim 0)

# DATA SCIENCE LIBRARY: PANDAS

## Series - one-dimensional labeled array

In [45]:
import pandas as pd

In [46]:
s = pd.Series(np.random.randn(3), index = ['a', 'b', 'c'])

In [47]:
s

a   -0.841837
b    0.536923
c    0.399702
dtype: float64

In [48]:
s['a']

-0.84183707259364216

In [49]:
# Positional access: use .iloc explicitly. Plain s[0] on a label-indexed Series
# relies on a deprecated integer-position fallback.
s.iloc[0]

-0.84183707259364216

In [50]:
d = {'a': 0, 'b': 1, 'c': 2}

In [51]:
pd.Series(d)

a    0
b    1
c    2
dtype: int64

## Series - vector operation support and index alignment

In [52]:
s = pd.Series(np.arange(4), index = ['a', 'b', 'c', 'd'])

In [53]:
s

a    0
b    1
c    2
d    3
dtype: int64

In [54]:
s + s

a    0
b    2
c    4
d    6
dtype: int64

In [55]:
t = pd.Series([30, 40], index = ['b', 'c'])

In [56]:
t

b    30
c    40
dtype: int64

In [57]:
s + t

a   NaN
b    31
c    42
d   NaN
dtype: float64

## DataFrame - 2-d labeled data structure

In [58]:
d = {'one': [10., 20., 30., 40.], 'two': [4., 3., 2., 1.]}

In [59]:
pd.DataFrame(d, index = ['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,10,4
b,20,3
c,30,2
d,40,1


In [60]:
dd = {'0': pd.Series([1, 2], index = ['a', 'b']), '1': pd.Series([15, 25, 35], index = ['a', 'b', 'c'])}

In [61]:
pd.DataFrame(dd)

Unnamed: 0,0,1
a,1.0,15
b,2.0,25
c,,35


## DataFrame - data alignment and arithmetic operations

In [62]:
df = pd.DataFrame(np.floor(np.random.randn(3, 4) * 10), columns = ['A', 'B', 'C', 'D'])

In [63]:
df

Unnamed: 0,A,B,C,D
0,3,4,-25,-5
1,17,-15,9,4
2,-4,-15,-5,-18


In [64]:
df2 = pd.DataFrame(np.floor(np.random.randn(3, 2) * 10), columns = ['B', 'C'])

In [65]:
df2

Unnamed: 0,B,C
0,23,1
1,22,-17
2,-7,-7


In [66]:
df + df2

Unnamed: 0,A,B,C,D
0,,27,-24,
1,,7,-8,
2,,-22,-12,


## Panel - 3-d labeled data structure

In [67]:
# Build a 3-d labeled container from a (5, 3, 2) array of random normals rounded to
# one decimal: 5 items, 3 consecutive dates on the major axis, labels 'a'/'b' on the minor axis.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25, so this cell
# only runs on older pandas; a MultiIndex DataFrame (what .to_frame() returns) or xarray
# is the usual replacement — confirm the pandas version this notebook targets.
panel = pd.Panel(np.random.randn(5, 3, 2).round(decimals = 1), items = ['one', 'two', 'three', 'four', 'five'], 
                 major_axis = pd.date_range('1/1/2000', periods = 3), minor_axis = ['a', 'b'])

In [70]:
panel.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three,four,five
major,minor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-01,a,-2.1,1.4,1.4,0.6,2.3
2000-01-01,b,0.4,1.1,0.9,1.0,-0.8
2000-01-02,a,0.3,-0.4,0.9,-0.8,-0.1
2000-01-02,b,2.1,-0.3,0.5,-1.1,0.7
2000-01-03,a,-0.1,1.5,-0.6,0.6,0.4
2000-01-03,b,-0.7,1.0,0.7,-1.5,-1.1


## Creating a dataframe

In [141]:
df = pd.DataFrame({'int_col': [1, 2, 6, 8, -1], 'float_col': [0.1, 0.2, 0.2, 10.1, None], 
                   'str_col': ['a', 'b', None, 'c', 'a']})

In [142]:
df

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
1,0.2,2,b
2,0.2,6,
3,10.1,8,c
4,,-1,a


## Indexing

In [73]:
# Label-based selection of all rows and two columns. The .ix indexer was removed in
# pandas 1.0; .loc is the label-based equivalent and gives the same result here.
df.loc[:, ['float_col', 'int_col']]

Unnamed: 0,float_col,int_col
0,0.1,1
1,0.2,2
2,0.2,6
3,10.1,8
4,,-1


In [74]:
df[['float_col', 'int_col']]

Unnamed: 0,float_col,int_col
0,0.1,1
1,0.2,2
2,0.2,6
3,10.1,8
4,,-1


## Conditional indexing

In [75]:
df['float_col'] > 0.15

0    False
1     True
2     True
3     True
4    False
Name: float_col, dtype: bool

In [76]:
df[df['float_col'] > 0.15]

Unnamed: 0,float_col,int_col,str_col
1,0.2,2,b
2,0.2,6,
3,10.1,8,c


In [78]:
df[(df['float_col'] > 0.1) & (df['int_col'] > 2)]

Unnamed: 0,float_col,int_col,str_col
2,0.2,6,
3,10.1,8,c


In [79]:
df[(df['float_col'] > 0.1) | (df['int_col'] > 2)]

Unnamed: 0,float_col,int_col,str_col
1,0.2,2,b
2,0.2,6,
3,10.1,8,c


In [80]:
df[~(df['float_col'] > 0.1)]

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
4,,-1,a


## Column renaming

In [81]:
df2 = df.rename(columns = {'int_col' : 'some_other_name'})

In [82]:
df2  # this is a copy of the DataFrame

Unnamed: 0,float_col,some_other_name,str_col
0,0.1,1,a
1,0.2,2,b
2,0.2,6,
3,10.1,8,c
4,,-1,a


In [84]:
df2.rename(columns = {'some_other_name' : 'int_col'}, inplace = True)  # this will modify the DataFrame in place

In [85]:
df2

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
1,0.2,2,b
2,0.2,6,
3,10.1,8,c
4,,-1,a


## Remove / Replace missing values

In [87]:
df2.dropna()

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
1,0.2,2,b
3,10.1,8,c


In [88]:
df3 = df.copy()

In [89]:
df3

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
1,0.2,2,b
2,0.2,6,
3,10.1,8,c
4,,-1,a


In [90]:
mean = df3['float_col'].mean()

In [93]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df3['float_col'] selection: mutating that intermediate object is chained assignment,
# which does not update df3 when pandas copy-on-write is active.
df3['float_col'] = df3['float_col'].fillna(mean)

In [94]:
df3

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
1,0.2,2,b
2,0.2,6,
3,10.1,8,c
4,2.65,-1,a


## Map & Apply functions

In [96]:
# .map() works on a Series (in this case an extracted column of a DF)
df['str_col'].dropna().map(lambda x : 'map_' + x)

0    map_a
1    map_b
3    map_c
4    map_a
Name: str_col, dtype: object

In [104]:
# .apply() works on a DataFrame
df[['int_col', 'float_col']].apply(np.sqrt)

Unnamed: 0,int_col,float_col
0,1.0,0.316228
1,1.414214,0.447214
2,2.44949,0.447214
3,2.828427,3.17805
4,,


In [101]:
def this_fn(x):
    """Element-wise transform used with ``DataFrame.applymap``.

    Strings get an ``'applymap_'`` prefix, ``None`` is passed through unchanged,
    and every other value is multiplied by 100.
    """
    if x is None:
        return None
    if isinstance(x, str):
        return 'applymap_' + x
    # An explicit None check (above) replaces the former truthiness test, so a
    # numeric 0 is scaled to 0 instead of being silently replaced by None.
    return 100 * x
    
df.applymap(this_fn)

Unnamed: 0,float_col,int_col,str_col
0,10.0,100,applymap_a
1,20.0,200,applymap_b
2,20.0,600,
3,1010.0,800,applymap_c
4,,-100,applymap_a


In [105]:
# apply works on a row / column basis of a DataFrame, applymap works element-wise on a DataFrame, 
# and map works element-wise on a Series.

## Vectorized math operations

In [106]:
df = pd.DataFrame(data = {"A": [1, 2], "B": [1.2, 1.3]})

In [107]:
df["C"] = df["A"] + df["B"]

In [108]:
df

Unnamed: 0,A,B,C
0,1,1.2,2.2
1,2,1.3,3.3


In [109]:
df["D"] = df["A"] * 3

In [110]:
df

Unnamed: 0,A,B,C,D
0,1,1.2,2.2,3
1,2,1.3,3.3,6


In [111]:
df["E"] = np.sqrt(df["A"])

In [112]:
df

Unnamed: 0,A,B,C,D,E
0,1,1.2,2.2,3,1.0
1,2,1.3,3.3,6,1.414214


## Vectorized string operations

In [113]:
df = pd.DataFrame(data = {"A": [1, 2], "B": [1.2, 1.3], "Z": ['apple', 'pear']})

In [114]:
df

Unnamed: 0,A,B,Z
0,1,1.2,apple
1,2,1.3,pear


In [119]:
df["F"] = df.Z.str.upper()

In [120]:
df

Unnamed: 0,A,B,Z,F
0,1,1.2,apple,APPLE
1,2,1.3,pear,PEAR


## Groupby

In [122]:
df4 = pd.DataFrame({"float_col": [0.1, 0.2, 0.5, 10.1, 0.1], "int_col": [1, 2, 6, 8, -1], 
                    "str_col": ["a", "b", "b", "c", "a"]})

In [123]:
df4

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
1,0.2,2,b
2,0.5,6,b
3,10.1,8,c
4,0.1,-1,a


In [124]:
grouped = df4['float_col'].groupby(df4['str_col'])

In [126]:
grouped.mean()

str_col
a     0.10
b     0.35
c    10.10
Name: float_col, dtype: float64

In [127]:
g2 = df4.groupby(['float_col', 'str_col'])

In [128]:
g2.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,int_col
float_col,str_col,Unnamed: 2_level_1
0.1,a,0
0.2,b,2
0.5,b,6
10.1,c,8


## New columns from existing columns

In [129]:
df4['new_col'] = df4['int_col'] * 10

In [130]:
df4

Unnamed: 0,float_col,int_col,str_col,new_col
0,0.1,1,a,10
1,0.2,2,b,20
2,0.5,6,b,60
3,10.1,8,c,80
4,0.1,-1,a,-10


In [131]:
df4['other_new_col'] = df4['int_col'] + 103

In [132]:
df4

Unnamed: 0,float_col,int_col,str_col,new_col,other_new_col
0,0.1,1,a,10,104
1,0.2,2,b,20,105
2,0.5,6,b,60,109
3,10.1,8,c,80,111
4,0.1,-1,a,-10,102


In [133]:
def sum_two_cols(series):
    """Return the sum of the ``int_col`` and ``float_col`` entries of one row."""
    int_part, float_part = series['int_col'], series['float_col']
    return int_part + float_part

In [146]:
# Rebuild the mixed-type frame before copying it. The earlier "Vectorized ..." sections
# rebind `df` to a frame with columns A/B/Z/F, so on a top-to-bottom run the cells below
# (sum_two_cols, describe, cov, corr, merge) would not see int_col/float_col/str_col.
df = pd.DataFrame({'int_col': [1, 2, 6, 8, -1], 'float_col': [0.1, 0.2, 0.2, 10.1, None],
                   'str_col': ['a', 'b', None, 'c', 'a']})
df5 = df.copy()

In [148]:
df5['sum_col'] = df5.apply(sum_two_cols, axis = 1)  # apply function to each row

In [149]:
df5

Unnamed: 0,float_col,int_col,str_col,sum_col
0,0.1,1,a,1.1
1,0.2,2,b,2.2
2,0.2,6,,6.2
3,10.1,8,c,18.1
4,,-1,a,


## Statistics

In [150]:
df.describe()

Unnamed: 0,float_col,int_col
count,4.0,5.0
mean,2.65,3.2
std,4.96689,3.701351
min,0.1,-1.0
25%,0.175,1.0
50%,0.2,2.0
75%,2.675,6.0
max,10.1,8.0


In [151]:
# Select the numeric columns explicitly: `df` also holds the string column str_col,
# and recent pandas no longer drops non-numeric columns silently in cov().
df[['float_col', 'int_col']].cov()

Unnamed: 0,float_col,int_col
float_col,24.67,12.483333
int_col,12.483333,13.7


In [152]:
# Select the numeric columns explicitly: `df` also holds the string column str_col,
# and recent pandas no longer drops non-numeric columns silently in corr().
df[['float_col', 'int_col']].corr()

Unnamed: 0,float_col,int_col
float_col,1.0,0.760678
int_col,0.760678,1.0


## Merge and join

In [153]:
other = pd.DataFrame({'str_col': ['a', 'b'], 'some_val': [1, 2]})

In [154]:
other

Unnamed: 0,some_val,str_col
0,1,a
1,2,b


In [155]:
pd.merge(df, other, on = 'str_col', how = 'inner')

Unnamed: 0,float_col,int_col,str_col,some_val
0,0.1,1,a,1
1,,-1,a,1
2,0.2,2,b,2


## Basic plotting: lines and histogram

In [156]:
plot_df = pd.DataFrame(np.random.randn(1000, 2), columns = ['x', 'y'])

In [157]:
# Shift the y column by 1 with vectorized arithmetic; same values as mapping
# `lambda x: x + 1` over the column, without a Python-level call per element.
plot_df['y'] = plot_df['y'] + 1

In [158]:
plot_df.head()

Unnamed: 0,x,y
0,0.76956,0.749039
1,-0.960528,1.026725
2,-0.450425,1.318623
3,-0.484409,1.865155
4,-1.353504,0.952622


In [159]:
plot_df.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x10975fba8>

In [161]:
plot_df.hist()

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x10c390cc0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10c3a0630>]], dtype=object)