# Numpy

NumPy (Numerical Python) is a library consisting of multidimensional array objects and a collection of functions for processing those arrays.

In [3]:
import numpy as np 

In [24]:
a1 = np.array([1, 2, 3, 4, 5]) 
a1

array([1, 2, 3, 4, 5])

In [25]:
type(a1)

numpy.ndarray

ndarray.shape -> Tuple of array dimensions.

In [11]:
a1.shape

(5,)

In [12]:
a1.ndim

1

Multi-dimensional array

In [13]:
a1 = np.array([[1, 2, 3],[4, 5, 6]])
a1

array([[1, 2, 3],
       [4, 5, 6]])

In [14]:
a1.shape

(2, 3)

In [15]:
a1.ndim

2

Specifying data type

In [36]:
a1 = np.array([[6, 4, 5], [5, 8, 3]], dtype=float)
a1

array([[6., 4., 5.],
       [5., 8., 3.]])

Reshape

In [37]:
a1 =a1.reshape(3, 2)
a1

array([[6., 4.],
       [5., 5.],
       [8., 3.]])

Array creation

In [50]:
a1 = np.zeros(6)
a1

array([0., 0., 0., 0., 0., 0.])

In [52]:
a1 = np.ones([2, 2], dtype=int)
a1

array([[1, 1],
       [1, 1]])

Creating array from existing data

In [57]:
x1 = ([3, 2], [4, 5], [6, 7])
a1 = np.asarray(x1, dtype = float)
a1

array([[3., 2.],
       [4., 5.],
       [6., 7.]])

Array from numerical ranges

In [62]:
a1 = np.arange(0, 16, 3)
a1

array([ 0,  3,  6,  9, 12, 15])

In [63]:
type(a1)

numpy.ndarray

np.linspace: This function is similar to arange(). Instead of a step size, you specify the number of evenly spaced values to generate over the interval; as the output below shows, the end point is included.

In [64]:
a1 = np.linspace(10,20,5) 
print(a1)

[10.  12.5 15.  17.5 20. ]


Slicing: Basic slicing is an extension of Python's basic concept of slicing to n dimensions

In [66]:
l = np.arange(10)
a1 = slice(2, 9, 2)
l[a1]

array([2, 4, 6, 8])

The same result can be achieved by writing the start:stop:step slice notation directly inside the square brackets

In [69]:
# l[start:stop:step] is shorthand for l[slice(start, stop, step)].
l = np.arange(10)
a1 = l[2:9:2]
# Show the slice itself. Indexing l with a1 again would be fancy indexing,
# which only returns the same values here because l[i] == i.
a1

array([2, 4, 6, 8])

Slice using index and ellipsis (...)

In [70]:
a1 = np.array([[1,2,3],[3,4,5],[4,5,6]]) 
a1[1:]

array([[3, 4, 5],
       [4, 5, 6]])

In [72]:
a1[..., 1]

array([2, 4, 5])

In [74]:
a1[1:, ...]

array([[3, 4, 5],
       [4, 5, 6]])

Integer indexing

In [75]:
a = np.array([[1, 2], [3, 4], [5, 6]]) 
b = a[[0,1,2], [0,1,0]]
print(b)

[1 4 5]


In [9]:
x = np.array([[ 0,  1,  2],[ 3,  4,  5],[ 6,  7,  8],[ 9, 10, 11]]) 
z = x[1:4,1:3]
z

array([[ 4,  5],
       [ 7,  8],
       [10, 11]])

or

In [6]:
y = x[1:4, [1,2]]
y

array([[ 4,  5],
       [ 7,  8],
       [10, 11]])

Print the values greater than 4

In [10]:
z[z>4]

array([ 5,  7,  8, 10, 11])

Omitting NaN values using the complement operator (~)

In [11]:
a = np.array([1, 5, np.nan, 8, np.nan, 52, 65, np.nan])
a

array([ 1.,  5., nan,  8., nan, 52., 65., nan])

In [13]:
a[~np.isnan(a)]

array([ 1.,  5.,  8., 52., 65.])

Broadcasting: Numpy arithmetic operations on arrays of different shapes
If two arrays are of exactly the same shape, then these operations are smoothly performed

In [14]:
x = np.array([5, 7, 8])
y = np.array([8, 9, 2])
z = x*y
z

array([40, 63, 16])

In [15]:
a = np.array([1.0, 2.0, 3.0])
b = 2.0
a * b

array([2., 4., 6.])

If the shapes of two arrays differ, plain element-to-element operations are not possible. However, operations on arrays of different shapes are still possible in NumPy because of its broadcasting capability. The smaller array is broadcast to the size of the larger array so that they have compatible shapes.
- Subject to certain constraints, the smaller array is “broadcast” across the larger array so that they have compatible shapes

When operating on two arrays, NumPy compares their shapes element-wise. It starts with the trailing dimensions, and works its way forward. Two dimensions are compatible when

- they are equal, or
- one of them is 1

Transpose of an array

In [16]:
a = np.arange(0,60,5) 
a = a.reshape(3,4)
a

array([[ 0,  5, 10, 15],
       [20, 25, 30, 35],
       [40, 45, 50, 55]])

In [17]:
a.T

array([[ 0, 20, 40],
       [ 5, 25, 45],
       [10, 30, 50],
       [15, 35, 55]])

# Array manipulation

reshape

In [21]:
a = np.arange(6)
a = a.reshape(3, 2)
a

array([[0, 1],
       [2, 3],
       [4, 5]])

flatten

In [22]:
a = np.arange(8).reshape(4,2)
a.flatten()

array([0, 1, 2, 3, 4, 5, 6, 7])

Concatenate

In [24]:
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[11, 22, 33], [44, 55, 66]])
np.concatenate((a, b)) #default axis is 0

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [11, 22, 33],
       [44, 55, 66]])

In [25]:
np.concatenate((a, b), axis = 1)

array([[ 1,  2,  3, 11, 22, 33],
       [ 4,  5,  6, 44, 55, 66]])

Stack

In [26]:
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[11, 22, 33], [44, 55, 66]])
np.stack((a, b)) #default axis is 0

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[11, 22, 33],
        [44, 55, 66]]])

In [27]:
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[11, 22, 33], [44, 55, 66]])
np.stack((a, b), axis=1) 

array([[[ 1,  2,  3],
        [11, 22, 33]],

       [[ 4,  5,  6],
        [44, 55, 66]]])

hstack

In [28]:
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[11, 22, 33], [44, 55, 66]])
np.hstack((a, b))

array([[ 1,  2,  3, 11, 22, 33],
       [ 4,  5,  6, 44, 55, 66]])

vstack

In [29]:
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[11, 22, 33], [44, 55, 66]])
np.vstack((a, b))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [11, 22, 33],
       [44, 55, 66]])

Split. Array splitting can also be done horizontally (hsplit) and vertically (vsplit)

In [30]:
a = np.arange(9)
np.split(a, 3)

[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]

Append

In [32]:
a = np.array([[20, 30, 40], [50, 60, 70]])
np.append(a, [[1, 2, 3]], axis = 0)

array([[20, 30, 40],
       [50, 60, 70],
       [ 1,  2,  3]])

In [33]:
a = np.array([[20, 30, 40], [50, 60, 70]])
np.append(a, [[1, 2, 3],[80, 60, 40]], axis = 1)

array([[20, 30, 40,  1,  2,  3],
       [50, 60, 70, 80, 60, 40]])

insert

In [35]:
a = np.array([[1,2],[3,4],[5,6]]) 
np.insert(a, 1, [7], axis=0)

array([[1, 2],
       [7, 7],
       [3, 4],
       [5, 6]])

In [37]:
a = np.array([[1,2],[3,4],[5,6]]) 
np.insert(a, 1, [7], axis=1)

array([[1, 7, 2],
       [3, 7, 4],
       [5, 7, 6]])

delete

In [38]:
a = np.arange(12).reshape(3,4)
np.delete(a, 1, axis=0)

array([[ 0,  1,  2,  3],
       [ 8,  9, 10, 11]])

In [39]:
a = np.arange(12).reshape(3,4)
np.delete(a, 1, axis=1)

array([[ 0,  2,  3],
       [ 4,  6,  7],
       [ 8, 10, 11]])

unique

In [40]:
a = np.array([5,2,6,2,7,5,6,8,2,9]) 
np.unique(a)

array([2, 5, 6, 7, 8, 9])

add

In [45]:
# Use the builtin float as dtype: the np.float alias was removed in NumPy 1.24.
a = np.arange(9, dtype=float).reshape(3, 3)
b = np.array([1, 2, 3])
# b has shape (3,), so it is broadcast across each row of the 3x3 array a.
np.add(a, b)

array([[ 1.,  3.,  5.],
       [ 4.,  6.,  8.],
       [ 7.,  9., 11.]])

subtract

In [46]:
np.subtract(a, b)

array([[-1., -1., -1.],
       [ 2.,  2.,  2.],
       [ 5.,  5.,  5.]])

multiply

In [47]:
np.multiply(a, b)

array([[ 0.,  2.,  6.],
       [ 3.,  8., 15.],
       [ 6., 14., 24.]])

divide

In [48]:
np.divide(a, b)

array([[0.        , 0.5       , 0.66666667],
       [3.        , 2.        , 1.66666667],
       [6.        , 3.5       , 2.66666667]])

power

In [50]:
a = np.array([5, 6, 7])
np.power(a, 2)

array([25, 36, 49], dtype=int32)

Percentile

In [51]:
a = np.array([[30,40,70],[80,20,10],[50,90,60]])
print(np.percentile(a,50, axis = 1))

[40. 20. 60.]


In [59]:
print(np.percentile(a,50, axis = 0))

[50. 40. 60.]


mean

In [61]:
print(np.mean(a, axis=0))

[53.33333333 50.         46.66666667]


In [62]:
print(np.mean(a, axis=1))

[46.66666667 36.66666667 66.66666667]


median

In [63]:
print(np.median(a, axis=0))

[50. 40. 60.]


In [64]:
print(np.median(a, axis=1))

[40. 20. 60.]


Standard deviation

In [65]:
print(np.std([5, 8, 7, 20]))

5.873670062235365


Variance

In [66]:
print(np.var([5, 8, 7, 20]))

34.5


In [72]:
# NumPy 2-D arrays are indexed as array[rows, columns]:
# - the value before the comma selects the rows
# - the value after the comma selects the columns
# - a bare ":" selects everything along that axis
# To select a whole column, put ":" in the row position before the column index.

# Sample 3x4 array for the examples below, defined here so the cell runs on its own.
e = np.arange(1, 13).reshape(3, 4)

print('Second column:', e[:,1])
print(e[1, :2])
print ("Entire table " , e[:,:])
# Row at index 2 (the third row), from column 1 to the end
print ("Accessing Rowise", e[2,1:])
print(e[0:2,0:1])

Second column: [ 2  6 10]
[5 6]
Entire table  [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
Accessing Rowise [10 11 12]
[[1]
 [5]]


# Pandas

Pandas is a data analysis library for Python. It provides the following data structures:
- Series
- Dataframe

Series
- Series is a one-dimensional, array-like structure with homogeneous data. For example, a Series can hold a collection of integers such as 1, 2, 3, 4, ...

Dataframe
- DataFrame is a two-dimensional structure with heterogeneous data. It has a table-like layout, organised in rows and columns.

Series

In [68]:
import pandas as pd
import numpy as np

Creating series using array

In [69]:
a = np.array([10, 2, 50, 60])
s = pd.Series(a)
s

0    10
1     2
2    50
3    60
dtype: int32

Set an index

In [70]:
a = np.array([10, 2, 50, 60])
s = pd.Series(a, index=[100, 101, 102, 103])
s

100    10
101     2
102    50
103    60
dtype: int32

Creating series from dictionary

In [86]:
dic = {'a': 1, 'b': 2, 'c': 3}
s = pd.Series(dic)
s

a    1
b    2
c    3
dtype: int64

In [87]:
type(s)

pandas.core.series.Series

Index order is used and the missing element is filled with NaN (Not a Number)

In [88]:
dic = {'a': 10, 'b': 12, 'c': 13}
s = pd.Series(dic, index=['d', 'c', 'b', 'a'])
s

d     NaN
c    13.0
b    12.0
a    10.0
dtype: float64

Accessing series data using position

In [89]:
s = pd.Series([1,2,3,4,5],index = ['a','b','c','d','e'])
# Use .iloc for positional access: s[2] on a label-indexed Series relies on a
# deprecated fallback from label lookup to position lookup.
s.iloc[2]

3

In [90]:
s[:3]

a    1
b    2
c    3
dtype: int64

Retrieving series data using label (Index)

In [91]:
s['d']

4

Dataframe

Creating empty dataframe

In [176]:
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


Creating df from list

In [177]:
l = list(range(5))
df = pd.DataFrame(l)
df

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4


In [178]:
lil = [['Col1', 10], ['Col2', 20], ['Col3', 30]]
df = pd.DataFrame(lil, columns=['First column', 'Second column'])
# Cast only the numeric column: passing dtype=float to the constructor tries to
# convert the string column as well, which raises on current pandas versions.
df = df.astype({'Second column': float})
df

Unnamed: 0,First column,Second column
0,Col1,10.0
1,Col2,20.0
2,Col3,30.0


Creating df from dictionary

In [179]:
dic = {'Name':['AAA', 'BBB', 'CCC'], 'Age':[25, 26, 27]}
df = pd.DataFrame(dic)
df

Unnamed: 0,Age,Name
0,25,AAA
1,26,BBB
2,27,CCC


In [180]:
df = pd.DataFrame(dic, index=[1, 2, 3])
df

Unnamed: 0,Age,Name
1,25,AAA
2,26,BBB
3,27,CCC


Column selection

In [181]:
df['Age']

1    25
2    26
3    27
Name: Age, dtype: int64

Column addition

In [182]:
df['Height'] = pd.Series([150, 160, 170], index=[1, 2, 3])
df['Add'] = df['Age'] + df['Height']
df

Unnamed: 0,Age,Name,Height,Add
1,25,AAA,150,175
2,26,BBB,160,186
3,27,CCC,170,197


Column deletion

In [174]:
del df['Add']
df

Unnamed: 0,Age,Name,Height
1,25,AAA,150
2,26,BBB,160
3,27,CCC,170


In [183]:
# Alternative to `del df['Add']`. errors='ignore' keeps the cell runnable when
# the column has already been removed (e.g. by the `del` statement).
df.drop(columns='Add', inplace=True, errors='ignore')
df

Unnamed: 0,Age,Name,Height
1,25,AAA,150
2,26,BBB,160
3,27,CCC,170


In [184]:
df1 = df.copy()

Select, add and delete a row

Selection by label

In [156]:
df.index = ['a', 'b', 'c']

In [158]:
df.loc['b']

Age        26
Name      BBB
Height    160
Name: b, dtype: object

Selection by integer location

In [190]:
df.iloc[1]

Age        26
Name      BBB
Height    160
Name: b, dtype: object

Adding a row

In [188]:
# Assign the new row by column name so each value lands in the right column
# regardless of the DataFrame's column order.
df.loc['d'] = {'Age': 29, 'Name': 'DDD', 'Height': 180}
df.index = ['a', 'b', 'c', 'd']

In [189]:
df

Unnamed: 0,Age,Name,Height
a,25,AAA,150
b,26,BBB,160
c,27,CCC,170
d,29,DDD,180


Slicing

In [191]:
df[1:3]

Unnamed: 0,Age,Name,Height
b,26,BBB,160
c,27,CCC,170


Row addition

In [196]:
df1 = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead.
# ignore_index=True renumbers the combined rows 0..3.
df3 = pd.concat([df1, df2], ignore_index=True)
df3

Unnamed: 0,a,b
0,1,2
1,3,4
2,5,6
3,7,8


Deleting rows

In [199]:
# Drop the row labelled 2. The axis must be given by keyword (index=/columns=
# or axis=); pandas 2.0 no longer accepts it positionally.
df3.drop(index=2, inplace=True)

In [205]:
df3

Unnamed: 0,a,b
0,1,2
1,3,4
3,7,8


In [208]:
# Look up the labels of the rows at positions 1 and 2, then drop them by label.
df3.drop(index=df3.index[[1, 2]], inplace=True)
df3

Unnamed: 0,a,b
0,1,2


Basic functionalities

In [209]:
dic = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Smith','Jack']),
   'Age':pd.Series([25,26,25,23,30,29,23]),
   'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8])}
df = pd.DataFrame(dic)
df

Unnamed: 0,Age,Name,Rating
0,25,Tom,4.23
1,26,James,3.24
2,25,Ricky,3.98
3,23,Vin,2.56
4,30,Steve,3.2
5,29,Smith,4.6
6,23,Jack,3.8


head

In [210]:
df.head()

Unnamed: 0,Age,Name,Rating
0,25,Tom,4.23
1,26,James,3.24
2,25,Ricky,3.98
3,23,Vin,2.56
4,30,Steve,3.2


tail

In [211]:
df.tail()

Unnamed: 0,Age,Name,Rating
2,25,Ricky,3.98
3,23,Vin,2.56
4,30,Steve,3.2
5,29,Smith,4.6
6,23,Jack,3.8


transpose

In [212]:
df.T

Unnamed: 0,0,1,2,3,4,5,6
Age,25,26,25,23,30,29,23
Name,Tom,James,Ricky,Vin,Steve,Smith,Jack
Rating,4.23,3.24,3.98,2.56,3.2,4.6,3.8


axes - returns the list of row axis labels and column axis labels

In [213]:
df.axes

[RangeIndex(start=0, stop=7, step=1),
 Index(['Age', 'Name', 'Rating'], dtype='object')]

Data types of columns

In [214]:
df.dtypes

Age         int64
Name       object
Rating    float64
dtype: object

empty - returns the Boolean value saying whether the Object is empty or not; True indicates that the object is empty

In [215]:
df.empty

False

shape - returns tuple with number of rows and columns

In [217]:
df.shape

(7, 3)

size - returns the number of elements in the DataFrame

In [218]:
df.size

21

Descriptive statistics

sum - returns the sum of the values for the requested axis. By default, axis is index (axis=0)

In [219]:
df.sum()

Age                                  181
Name      TomJamesRickyVinSteveSmithJack
Rating                             25.61
dtype: object

Each column is summed independently (string values are concatenated). Passing axis=1 sums across each row instead:

In [220]:
# Sum across each row. numeric_only=True skips the string 'Name' column, which
# cannot be added to the numeric values.
df.sum(axis=1, numeric_only=True)

0    29.23
1    29.24
2    28.98
3    25.56
4    33.20
5    33.60
6    26.80
dtype: float64

mean

In [221]:
# numeric_only=True restricts the mean to numeric columns; since pandas 2.0 the
# string 'Name' column is no longer dropped automatically.
df.mean(numeric_only=True)

Age       25.857143
Rating     3.658571
dtype: float64

median

In [222]:
# numeric_only=True restricts the median to numeric columns; since pandas 2.0
# the string 'Name' column is no longer dropped automatically.
df.median(numeric_only=True)

Age       25.0
Rating     3.8
dtype: float64

mode

In [226]:
df['Age'].mode()

0    23
1    25
dtype: int64

Standard deviation

In [223]:
# numeric_only=True restricts the standard deviation to numeric columns; since
# pandas 2.0 the string 'Name' column is no longer dropped automatically.
df.std(numeric_only=True)

Age       2.734262
Rating    0.698628
dtype: float64

count - number of non-null observations

In [224]:
df.count()

Age       7
Name      7
Rating    7
dtype: int64

min

In [231]:
df['Age'].min()

23

max

In [232]:
df['Age'].max()

30

describe

In [235]:
df.describe()

Unnamed: 0,Age,Rating
count,7.0,7.0
mean,25.857143,3.658571
std,2.734262,0.698628
min,23.0,2.56
25%,24.0,3.22
50%,25.0,3.8
75%,27.5,4.105
max,30.0,4.6


Apply function

In [238]:
df1 = df.copy()

In [240]:
dic = {'Col1': [20, 50, 40, 60, 80],
      'Col2': [80, 60, 40, 20, 55],
      'Col3': [20, 80, 55, 66, 33]}
df = pd.DataFrame(dic)
df

Unnamed: 0,Col1,Col2,Col3
0,20,80,20
1,50,60,80
2,40,40,55
3,60,20,66
4,80,55,33


In [242]:
df.apply(np.mean)

Col1    50.0
Col2    51.0
Col3    50.8
dtype: float64

In [241]:
df.apply(np.mean, axis=1)

0    40.000000
1    63.333333
2    45.000000
3    48.666667
4    56.000000
dtype: float64

In [245]:
df.apply(lambda x:x.max() - x.min())

Col1    60
Col2    60
Col3    60
dtype: int64

In [244]:
df.apply(lambda x: x.max() - x.min(), axis=1)

0    60
1    30
2    15
3    46
4    47
dtype: int64

Rename

In [263]:
# df1 still holds the Age/Name/Rating frame; df was reassigned to the
# Col1/Col2/Col3 frame in the "Apply function" section, so renaming on df
# would find none of these columns.
# rename returns a new DataFrame; df1 itself is left unchanged.
df1.rename(columns={'Age':'Col1', 'Name':'Col2', 'Rating':'Col3'})

Unnamed: 0,Col1,Col2,Col3
0,25,Tom,4.23
1,26,James,3.24
2,25,Ricky,3.98
3,23,Vin,2.56
4,30,Steve,3.2
5,29,Smith,4.6
6,23,Jack,3.8


Sorting

In [264]:
df = pd.DataFrame(np.random.randn(5, 2),index=[1,5, 3, 2, 4],columns=['col2','col1'])
df

Unnamed: 0,col2,col1
1,-0.958034,-0.103182
5,1.257592,0.540175
3,1.17966,0.856654
2,-0.66342,0.010525
4,2.360882,-2.644734


By default, sorting is done on row labels in ascending order.

In [265]:
df.sort_index()

Unnamed: 0,col2,col1
1,-0.958034,-0.103182
2,-0.66342,0.010525
3,1.17966,0.856654
4,2.360882,-2.644734
5,1.257592,0.540175


In [266]:
df.sort_index(ascending = False)

Unnamed: 0,col2,col1
5,1.257592,0.540175
4,2.360882,-2.644734
3,1.17966,0.856654
2,-0.66342,0.010525
1,-0.958034,-0.103182


Sorting columns

In [267]:
df.sort_index(axis=1)

Unnamed: 0,col1,col2
1,-0.103182,-0.958034
5,0.540175,1.257592
3,0.856654,1.17966
2,0.010525,-0.66342
4,-2.644734,2.360882


Sort by values

In [268]:
df.sort_values(by='col1')

Unnamed: 0,col2,col1
4,2.360882,-2.644734
1,-0.958034,-0.103182
2,-0.66342,0.010525
5,1.257592,0.540175
3,1.17966,0.856654


loc

In [319]:
df1

Unnamed: 0,Age,Name,Rating
0,25,Tom,4.23
1,26,James,3.24
2,25,Ricky,3.98
3,23,Vin,2.56
4,30,Steve,3.2
5,29,Smith,4.6
6,23,Jack,3.8


In [320]:
df1.loc[:3, ['Age', 'Name']]

Unnamed: 0,Age,Name
0,25,Tom
1,26,James
2,25,Ricky
3,23,Vin


In [321]:
df1.loc[:2]

Unnamed: 0,Age,Name,Rating
0,25,Tom,4.23
1,26,James,3.24
2,25,Ricky,3.98


In [322]:
df1.index = ['a', 'b', 'c', 'd', 'e', 'f', 'g']

In [323]:
df1.loc['a':'e', ['Age', 'Name']]

Unnamed: 0,Age,Name
a,25,Tom
b,26,James
c,25,Ricky
d,23,Vin
e,30,Steve


iloc

In [324]:
df1.iloc[0:4, 0:2]

Unnamed: 0,Age,Name
a,25,Tom
b,26,James
c,25,Ricky
d,23,Vin


In [331]:
df1.iloc[:4]

Unnamed: 0,Age,Name,Rating
a,25,Tom,4.23
b,26,James,3.24
c,25,Ricky,3.98
d,23,Vin,2.56


In [334]:
df1.iloc[:, 1:]

Unnamed: 0,Name,Rating
a,Tom,4.23
b,James,3.24
c,Ricky,3.98
d,Vin,2.56
e,Steve,3.2
f,Smith,4.6
g,Jack,3.8


Handling missing data

In [335]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

Unnamed: 0,one,two,three
a,1.323032,0.531442,-0.460334
b,,,
c,0.430173,1.2197,-0.722199
d,,,
e,-0.498475,-0.875424,-0.272322
f,0.023408,-1.979074,-0.320843
g,,,
h,-2.092057,-0.295953,-1.971496


In [340]:
df['one'].isnull() #isna can also be used. Both isnull() and isna() are same

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [341]:
df['two'].notnull()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: two, dtype: bool

In [342]:
df['three'].isnull().sum()

3

Filling missing data

In [345]:
df['one'].fillna(0)

a    1.323032
b    0.000000
c    0.430173
d    0.000000
e   -0.498475
f    0.023408
g    0.000000
h   -2.092057
Name: one, dtype: float64

In [346]:
df.fillna(1, inplace=True)
df

Unnamed: 0,one,two,three
a,1.323032,0.531442,-0.460334
b,1.0,1.0,1.0
c,0.430173,1.2197,-0.722199
d,1.0,1.0,1.0
e,-0.498475,-0.875424,-0.272322
f,0.023408,-1.979074,-0.320843
g,1.0,1.0,1.0
h,-2.092057,-0.295953,-1.971496


In [348]:
# Set every column of rows 'd' through 'f' (label slices include the end label)
# back to NaN. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df.loc['d':'f'] = np.nan

In [349]:
df

Unnamed: 0,one,two,three
a,1.323032,0.531442,-0.460334
b,1.0,1.0,1.0
c,0.430173,1.2197,-0.722199
d,,,
e,,,
f,,,
g,1.0,1.0,1.0
h,-2.092057,-0.295953,-1.971496


In [350]:
df.dropna()

Unnamed: 0,one,two,three
a,1.323032,0.531442,-0.460334
b,1.0,1.0,1.0
c,0.430173,1.2197,-0.722199
g,1.0,1.0,1.0
h,-2.092057,-0.295953,-1.971496


In [353]:
print(df.dropna(axis=1))

Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


replace - Replace a generic value with some specific value

In [358]:
df = pd.DataFrame({'one':[10,20,30,40,50,2000], 'two':[1000,0,30,40,50,60]})
df

Unnamed: 0,one,two
0,10,1000
1,20,0
2,30,30
3,40,40
4,50,50
5,2000,60


In [359]:
df.replace({2000:60, 1000:10}, inplace=True)
df

Unnamed: 0,one,two
0,10,10
1,20,0
2,30,30
3,40,40
4,50,50
5,60,60


Aggregation - group by

In [368]:
df1['Score'] = pd.Series([70, 70, 80, 85, 90, 95, 90], index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

In [369]:
df1

Unnamed: 0,Age,Name,Rating,Score
a,25,Tom,4.23,70
b,26,James,3.24,70
c,25,Ricky,3.98,80
d,23,Vin,2.56,85
e,30,Steve,3.2,90
f,29,Smith,4.6,95
g,23,Jack,3.8,90


In [378]:
df1.groupby(['Age'])['Score'].sum()

Age
23    175
25    150
26     70
29     95
30     90
Name: Score, dtype: int64

In [377]:
# Select several columns with a list (double brackets); the tuple form
# ['Rating', 'Score'] was removed in pandas 2.0.
df1.groupby('Age')[['Rating', 'Score']].mean().reset_index()

Unnamed: 0,Age,Rating,Score
0,23,3.18,87.5
1,25,4.105,75.0
2,26,3.24,70.0
3,29,4.6,95.0
4,30,3.2,90.0


Multiple aggregation

In [382]:
# Select the columns with a list and name the aggregations as strings; passing
# NumPy callables such as np.mean to agg is deprecated in recent pandas.
# Groups with a single row have no sample standard deviation, hence NaN.
df1.groupby('Age')[['Rating', 'Score']].agg(['mean', 'median', 'std']).reset_index()

Unnamed: 0_level_0,Age,Rating,Rating,Rating,Score,Score,Score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,std,mean,median,std
0,23,3.18,3.18,0.876812,87.5,87.5,3.535534
1,25,4.105,4.105,0.176777,75.0,75.0,7.071068
2,26,3.24,3.24,,70.0,70.0,
3,29,4.6,4.6,,95.0,95.0,
4,30,3.2,3.2,,90.0,90.0,


Merge: Default join is 'inner'

In [392]:
# Two small frames that share the 'id' and 'subject_id' columns; they are used
# by the merge and concat examples that follow.
df_left = pd.DataFrame({
   'id':[1,2,3,4,5],
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5']})
df_right = pd.DataFrame(
   {'id':[1,2,3,4,5],
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub4','sub3','sub6','sub5']})

In [384]:
df_left

Unnamed: 0,Name,id,subject_id
0,Alex,1,sub1
1,Amy,2,sub2
2,Allen,3,sub4
3,Alice,4,sub6
4,Ayoung,5,sub5


In [385]:
df_right

Unnamed: 0,Name,id,subject_id
0,Billy,1,sub2
1,Brian,2,sub4
2,Bran,3,sub3
3,Bryce,4,sub6
4,Betty,5,sub5


merge with one key

In [386]:
pd.merge(df_left, df_right, on='id')

Unnamed: 0,Name_x,id,subject_id_x,Name_y,subject_id_y
0,Alex,1,sub1,Billy,sub2
1,Amy,2,sub2,Brian,sub4
2,Allen,3,sub4,Bran,sub3
3,Alice,4,sub6,Bryce,sub6
4,Ayoung,5,sub5,Betty,sub5


merge with multiple keys

In [390]:
pd.merge(df_left, df_right, on=['id', 'subject_id'])

Unnamed: 0,Name_x,id,subject_id,Name_y
0,Alice,4,sub6,Bryce
1,Ayoung,5,sub5,Betty


Merge using 'how' argument

In [393]:
df_left.merge(df_right, on='subject_id', how='left')

Unnamed: 0,Name_x,id_x,subject_id,Name_y,id_y
0,Alex,1,sub1,,
1,Amy,2,sub2,Billy,1.0
2,Allen,3,sub4,Brian,2.0
3,Alice,4,sub6,Bryce,4.0
4,Ayoung,5,sub5,Betty,5.0


In [394]:
df_left.merge(df_right, on='subject_id', how='right')

Unnamed: 0,Name_x,id_x,subject_id,Name_y,id_y
0,Amy,2.0,sub2,Billy,1
1,Allen,3.0,sub4,Brian,2
2,Alice,4.0,sub6,Bryce,4
3,Ayoung,5.0,sub5,Betty,5
4,,,sub3,Bran,3


Concat

In [395]:
pd.concat([df_left, df_right])

Unnamed: 0,Name,id,subject_id
0,Alex,1,sub1
1,Amy,2,sub2
2,Allen,3,sub4
3,Alice,4,sub6
4,Ayoung,5,sub5
0,Billy,1,sub2
1,Brian,2,sub4
2,Bran,3,sub3
3,Bryce,4,sub6
4,Betty,5,sub5


In [398]:
pd.concat([df_left, df_right], axis=1, keys=['left', 'right'])

Unnamed: 0_level_0,left,left,left,right,right,right
Unnamed: 0_level_1,Name,id,subject_id,Name,id,subject_id
0,Alex,1,sub1,Billy,1,sub2
1,Amy,2,sub2,Brian,2,sub4
2,Allen,3,sub4,Bran,3,sub3
3,Alice,4,sub6,Bryce,4,sub6
4,Ayoung,5,sub5,Betty,5,sub5


In [397]:
pd.concat([df_left, df_right], keys=['left', 'right'])

Unnamed: 0,Unnamed: 1,Name,id,subject_id
left,0,Alex,1,sub1
left,1,Amy,2,sub2
left,2,Allen,3,sub4
left,3,Alice,4,sub6
left,4,Ayoung,5,sub5
right,0,Billy,1,sub2
right,1,Brian,2,sub4
right,2,Bran,3,sub3
right,3,Bryce,4,sub6
right,4,Betty,5,sub5


The index of the resultant is duplicated; each index is repeated.

If the resultant object has to follow its own indexing, set ignore_index to True.

In [400]:
pd.concat([df_left, df_right], ignore_index=True)

Unnamed: 0,Name,id,subject_id
0,Alex,1,sub1
1,Amy,2,sub2
2,Allen,3,sub4
3,Alice,4,sub6
4,Ayoung,5,sub5
5,Billy,1,sub2
6,Brian,2,sub4
7,Bran,3,sub3
8,Bryce,4,sub6
9,Betty,5,sub5


Concat using append

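Multiple DataFrames can be appended by passing a list of DataFrames to the append() function. Note: DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0; on current versions use pd.concat() with the same list instead.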

In [403]:
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same stacked result with a fresh 0..n-1 index.
pd.concat([df_left, df_right], ignore_index=True)

Unnamed: 0,Name,id,subject_id
0,Alex,1,sub1
1,Amy,2,sub2
2,Allen,3,sub4
3,Alice,4,sub6
4,Ayoung,5,sub5
5,Billy,1,sub2
6,Brian,2,sub4
7,Bran,3,sub3
8,Bryce,4,sub6
9,Betty,5,sub5


Time functions

In [405]:
# pd.datetime (an alias of datetime.datetime) was removed from pandas;
# pd.Timestamp.now() returns the current local time.
print(pd.Timestamp.now())

2019-04-12 01:17:48.851008


In [409]:
pd.to_datetime(pd.Series(['Jul 31, 2009','2010-01-10', None]))

0   2009-07-31
1   2010-01-10
2          NaT
dtype: datetime64[ns]

In [411]:
pd.date_range('1/1/2019', periods=5)

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05'],
              dtype='datetime64[ns]', freq='D')

In [412]:
# Month-end frequency. The offset object is equivalent to the 'M' alias, which
# was renamed 'ME' in pandas 2.2, so this form works on old and new versions.
pd.date_range('1/1/2019', periods=5, freq=pd.offsets.MonthEnd())

DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
               '2019-05-31'],
              dtype='datetime64[ns]', freq='M')

In [421]:
# pd.datetime was removed from pandas; pd.Timestamp accepts the same
# (year, month, day) arguments.
start = pd.Timestamp(2018, 8, 1)
end = pd.Timestamp(2019, 5, 31)

# Month-end dates between start and end (both inclusive). The offset object is
# equivalent to the 'M' alias, which was renamed 'ME' in pandas 2.2.
print(pd.date_range(start, end, freq=pd.offsets.MonthEnd()))

DatetimeIndex(['2018-08-31', '2018-09-30', '2018-10-31', '2018-11-30',
               '2018-12-31', '2019-01-31', '2019-02-28', '2019-03-31',
               '2019-04-30', '2019-05-31'],
              dtype='datetime64[ns]', freq='M')


Basic text operations

In [432]:
text = pd.Series(['Data analyst', '@nalytics ', ' Data Science', np.nan, 'Exploratory Data Analysis', 'Models', '1234', 'Evaluation', 'Metrics'])
text

0                 Data analyst
1                   @nalytics 
2                 Data Science
3                          NaN
4    Exploratory Data Analysis
5                       Models
6                         1234
7                   Evaluation
8                      Metrics
dtype: object

In [433]:
text.str.lower()

0                 data analyst
1                   @nalytics 
2                 data science
3                          NaN
4    exploratory data analysis
5                       models
6                         1234
7                   evaluation
8                      metrics
dtype: object

In [434]:
text.str.upper()

0                 DATA ANALYST
1                   @NALYTICS 
2                 DATA SCIENCE
3                          NaN
4    EXPLORATORY DATA ANALYSIS
5                       MODELS
6                         1234
7                   EVALUATION
8                      METRICS
dtype: object

In [435]:
text.str.len()

0    12.0
1    10.0
2    13.0
3     NaN
4    25.0
5     6.0
6     4.0
7    10.0
8     7.0
dtype: float64

In [436]:
text.str.strip() #Helps strip whitespace(including newline) from each string from both the sides

0                 Data analyst
1                    @nalytics
2                 Data Science
3                          NaN
4    Exploratory Data Analysis
5                       Models
6                         1234
7                   Evaluation
8                      Metrics
dtype: object

In [438]:
text.str.cat(sep='_') #Concatenates the strings with a separator

'Data analyst_@nalytics _ Data Science_Exploratory Data Analysis_Models_1234_Evaluation_Metrics'

In [439]:
text.str.contains('a')

0     True
1     True
2     True
3      NaN
4     True
5    False
6    False
7     True
8    False
dtype: object

In [440]:
text.str.replace('@', '$')

0                 Data analyst
1                   $nalytics 
2                 Data Science
3                          NaN
4    Exploratory Data Analysis
5                       Models
6                         1234
7                   Evaluation
8                      Metrics
dtype: object

In [441]:
text.str.count('s')

0    1.0
1    1.0
2    0.0
3    NaN
4    2.0
5    1.0
6    0.0
7    0.0
8    1.0
dtype: float64

In [447]:
text.str.find('n')

0     6.0
1     1.0
2    10.0
3     NaN
4    18.0
5    -1.0
6    -1.0
7     9.0
8    -1.0
dtype: float64

In [448]:
text.str.isnumeric()

0    False
1    False
2    False
3      NaN
4    False
5    False
6     True
7    False
8    False
dtype: object