# Numpy

##### Numpy (Numerical Python) is library consisting of multidimensional array objects, and has functions for processing the array

In [3]:
import numpy as np

In [5]:
a1 = np.array([1, 2, 3, 4, 5])
a1

array([1, 2, 3, 4, 5])

In [7]:
type(a1)

numpy.ndarray

In [9]:
a1.shape

(5,)

In [11]:
a1.ndim

1

In [15]:
a1.dtype

dtype('int32')

## Multi dimensional array

In [18]:
a1 = np.array([[1, 2, 3], [4, 5, 6]])
a1

array([[1, 2, 3],
       [4, 5, 6]])

In [20]:
a1.shape

(2, 3)

In [22]:
a1.ndim

2

### Specifying the data type

In [25]:
a1 = np.array([[6, 4, 5], [5, 8, 3]], dtype = float)
a1

array([[6., 4., 5.],
       [5., 8., 3.]])

In [28]:
a1.shape

(2, 3)

### Reshape

In [30]:
a1 = a1.reshape(3, 2)
a1

array([[6., 4.],
       [5., 5.],
       [8., 3.]])

### Array creation

In [33]:
a1 = np.zeros(6)
a1

array([0., 0., 0., 0., 0., 0.])

In [41]:
a1 = np.ones([2, 2], dtype = int)
a1

array([[1, 1],
       [1, 1]])

In [43]:
a2=np.array([[6,4,5],[5,8,3]], dtype=float)
a2

array([[6., 4., 5.],
       [5., 8., 3.]])

In [45]:
a2.shape

(2, 3)

#### Creating an array from existing data


In [48]:
x1 = ([3, 2], [4, 5], [6, 7])
a1 = np.asarray(x1, dtype = float)
a1

array([[3., 2.],
       [4., 5.],
       [6., 7.]])

## using numerical ranges


In [51]:
a1 = np.arange(0, 16, 3)
a1

array([ 0,  3,  6,  9, 12, 15])

In [53]:
type(a1)

numpy.ndarray

In [55]:
a1 = np.linspace(10, 20, 5)
a1

array([10. , 12.5, 15. , 17.5, 20. ])

### Slicing


In [60]:
# l = np.arange(10)
# a1 = a1.slice(2, 9, 2)
# a1

In [62]:
l = np.arange(10)
a1 = l[2: 9]
a1

array([2, 3, 4, 5, 6, 7, 8])

In [64]:
l = np.arange(10)
a1 = l[2: 9: 2]
a1

array([2, 4, 6, 8])

In [68]:
a1 = np.array([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
a1[1:]

array([[3, 4, 5],
       [4, 5, 6]])

### ellipsis (...)

In [70]:
a1[..., 1]

array([2, 4, 5])

In [72]:
a1[1:, ...]

array([[3, 4, 5],
       [4, 5, 6]])

### Integer indexing

In [76]:
a = np.array([[1, 2], [3, 4], [5, 6]])
b = a[[0, 1, 2], [0, 1, 0]]
print(b)

[1 4 5]


In [80]:
x = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
z = x[1:4, 1:3]
z

array([[ 4,  5],
       [ 7,  8],
       [10, 11]])

In [98]:
x=np.array([[0,1,2],[3,4,5],[6,7,8],[9,10,11]])
z=x[-3:-1]  #
# z=x[-1:-2]
z

array([[3, 4, 5],
       [6, 7, 8]])

In [None]:
# loc, iloc

### Omitting Nan using complement operator

In [None]:
Nan - Not a number (np.nan)

In [103]:
a = np.array([1, 5, np.nan, 8, np.nan, 52, 65, np.nan])
a

array([ 1.,  5., nan,  8., nan, 52., 65., nan])

In [107]:
a[~ np.isnan(a)]

array([ 1.,  5.,  8., 52., 65.])

In [109]:
x = np.array([5, 7, 8])
y = np.array([8, 9, 2]) # multiply using a scalar

z = x * y
z

array([40, 63, 16])

In [111]:
x1 = np.array([5,6,8])
y1 = np.array(5)
print(x1*y1)

[25 30 40]


In [115]:
x = np.array([5, 7, 8])
y = 2.0 # multiply using a scalar

z = x * 2
z

array([10, 14, 16])

In [117]:
a = np.arange(0, 60, 5)
a = a.reshape(3, 4)
a

array([[ 0,  5, 10, 15],
       [20, 25, 30, 35],
       [40, 45, 50, 55]])

In [119]:
a.T

array([[ 0, 20, 40],
       [ 5, 25, 45],
       [10, 30, 50],
       [15, 35, 55]])

In [121]:
a.flatten()

array([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55])

### Concatenate

In [126]:
a = np.array([[1, 2, 3], [4, 5, 6]])
b =  np.array([[11, 22, 33], [44, 55, 66]])
np.concatenate((a, b ), axis = 0) #axis = 0 is default

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [11, 22, 33],
       [44, 55, 66]])

In [128]:
np.concatenate((a, b), axis = 1)

array([[ 1,  2,  3, 11, 22, 33],
       [ 4,  5,  6, 44, 55, 66]])

### Split

In [131]:
a = np.arange(9)
np.split(a, 3)

[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]

### Append

In [134]:
a = np.array([[1, 2, 3], [4, 5, 6]])
np.append(a, [[11, 22, 33], [44, 55, 66]], axis = 1)

array([[ 1,  2,  3, 11, 22, 33],
       [ 4,  5,  6, 44, 55, 66]])

### insert

In [139]:
a = np.array([[1, 2], [3, 4], [5,6]])
np.insert(a, 1, [7, 8], axis = 0)

array([[1, 2],
       [7, 8],
       [3, 4],
       [5, 6]])

In [145]:
a = np.array([[1, 2], [3, 4], [5,6]])
np.insert(a, 1, [7], axis = 1)

array([[1, 7, 2],
       [3, 7, 4],
       [5, 7, 6]])

### delete

In [152]:
a= np.arange(12).reshape(3, 4)
print(a)
np.delete(a, 1, axis = 0)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


array([[ 0,  1,  2,  3],
       [ 8,  9, 10, 11]])

In [154]:
a= np.arange(12).reshape(3, 4)
print(a)
np.delete(a, 1, axis = 1)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


array([[ 0,  2,  3],
       [ 4,  6,  7],
       [ 8, 10, 11]])

### unique

In [157]:
a = np.array([5, 2, 6, 2, 7, 5, 6, 8, 2, 9])
np.unique(a)

array([2, 5, 6, 7, 8, 9])

### add

In [164]:
a = np.arange(12).reshape(3, 4)
b = np.arange(12).reshape(3, 4)

np.add(a, b)

array([[ 0,  2,  4,  6],
       [ 8, 10, 12, 14],
       [16, 18, 20, 22]])

In [None]:
np.subtract
np.multiply
np.divide
np.power
np.mean
np.median

In [None]:
a=np.arange(12).reshape(3,4)
b=np.array([10,20,30,40])
np.add(a,b)

In [172]:
a = np.arange(12).reshape(3,4)
b = np.arange(30,42).reshape(3,4)
print(a)
print(b)
print('Add\n', np.add(a,b))
print('Sub\n',np.subtract(a,b))
print('Mul\n',np.multiply(a,b))
print('Div\n',np.divide(a,b))

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[[30 31 32 33]
 [34 35 36 37]
 [38 39 40 41]]
Add
 [[30 32 34 36]
 [38 40 42 44]
 [46 48 50 52]]
Sub
 [[-30 -30 -30 -30]
 [-30 -30 -30 -30]
 [-30 -30 -30 -30]]
Mul
 [[  0  31  64  99]
 [136 175 216 259]
 [304 351 400 451]]
Div
 [[0.         0.03225806 0.0625     0.09090909]
 [0.11764706 0.14285714 0.16666667 0.18918919]
 [0.21052632 0.23076923 0.25       0.26829268]]


In [180]:
np.median([25000, 10000, 15000, 20000, 100000000])

20000.0

In [182]:
np.std([25000, 10000, 15000, 20000, 100000000])

39993000.312554695

In [184]:
np.std([25000, 10000, 15000, 20000])

5590.169943749474

In [240]:
# np.random?

In [213]:
np.random.random?

[1;31mDocstring:[0m
random(size=None)

Return random floats in the half-open interval [0.0, 1.0). Alias for
`random_sample` to ease forward-porting to the new random API.
[1;31mType:[0m      builtin_function_or_method

In [215]:
np.random.random()

0.35771057775815185

### Pandas

In [244]:
import warnings
warnings.filterwarnings('ignore')

In [218]:
import numpy as np
import pandas as pd

In [220]:
a = np.array([10, 2, 50, 60])
s = pd.Series(a)
s

0    10
1     2
2    50
3    60
dtype: int32

In [222]:
a

array([10,  2, 50, 60])

In [224]:
a = np.array([10, 2, 50, 60])
s = pd.Series(a, index = [100, 101, 102, 103])
s

100    10
101     2
102    50
103    60
dtype: int32

In [232]:
var1= {'class':['a','b','c'],'age':[12,34,50]}
var2= pd.Series(var1)
var2

class       [a, b, c]
age      [12, 34, 50]
dtype: object

In [236]:
a = {'a':1, 'b':2,'c':3,'d':4,'e':5} 
s = pd.Series(a)
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [None]:
a = {'a':1, 'b':2,'c':3,'d':4,'e':5} 
s = pd.Series(a)
s

In [246]:
a = pd.Series([1, 2, 3, 4, 5], index= ['a', 'b', 'c', 'd', 'c'])
s[2]

3

In [248]:
s[:3]

a    1
b    2
c    3
dtype: int64

In [250]:
s['d']

4

In [262]:
data1 = {
    "school": 'SRMHSS',  #['SRMHSS', 'Prince Matric', 'Nehru High School'],
    "rank":  4  #[1,4,7]
}
df = pd.Series(data1, index = ["school", "rank"])  #index=[{"school": 100, "rank":101}]
print(df)

school    SRMHSS
rank           4
dtype: object


In [264]:
b={'class':['a','b','c'],'age':[12,34,50]}
c=pd.Series(b)
c

class       [a, b, c]
age      [12, 34, 50]
dtype: object

### Dataframe

In [267]:
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


### Creating df from list

In [270]:
l = list(range(5))
df = pd.DataFrame(l)
print(df)

   0
0  0
1  1
2  2
3  3
4  4


In [274]:
list1 = [['Col1', 10], ['Col2', 20], ['Col3', 30] ]
df = pd.DataFrame(list1, columns=['First column', 'Second column'])
df

Unnamed: 0,First column,Second column
0,Col1,10
1,Col2,20
2,Col3,30


In [278]:
list1 = [[1000, 10], [200, 20], ['5', 30] ]
df = pd.DataFrame(list1, columns=['First column', 'Second column'], dtype= float)
df

Unnamed: 0,First column,Second column
0,1000.0,10.0
1,200.0,20.0
2,5.0,30.0


In [280]:
list2 = [['Ram',40],['Raju',20],['Karthi',30]]
df = pd.DataFrame(list2,columns=['Name','Age'],index=[100,101,102])
df

Unnamed: 0,Name,Age
100,Ram,40
101,Raju,20
102,Karthi,30


In [314]:
b={'Name':['Ram', 'Raju', 'Karthi'],'Age':[12,34,50]}
df= pd.DataFrame(b)
df

Unnamed: 0,Name,Age
0,Ram,12
1,Raju,34
2,Karthi,50


### Column selection

In [288]:
df['Age']

0    12
1    34
2    50
Name: Age, dtype: int64

In [291]:
df # 'Height', -> create a series

Unnamed: 0,Name,Age
0,Ram,12
1,Raju,34
2,Karthi,50


In [316]:
df['Height'] =  pd.Series([150, 200, 175], index = (0, 1, 2))

In [297]:
df

Unnamed: 0,Name,Age,Height
0,Ram,12,150
1,Raju,34,200
2,Karthi,50,175


In [318]:
df['Add'] = df['Height'] + df['Age']
df

Unnamed: 0,Name,Age,Height,Add
0,Ram,12,150,162
1,Raju,34,200,234
2,Karthi,50,175,225


###  Deleting a column

In [302]:
del df['Add']
df

Unnamed: 0,Name,Age,Height
0,Ram,12,150
1,Raju,34,200
2,Karthi,50,175


In [322]:
df.drop('Add', axis = 1, inplace=True)

In [324]:
df

Unnamed: 0,Name,Age,Height
0,Ram,12,150
1,Raju,34,200
2,Karthi,50,175


In [326]:
df.index= ['a', 'b', 'c']

In [328]:
df

Unnamed: 0,Name,Age,Height
a,Ram,12,150
b,Raju,34,200
c,Karthi,50,175


In [330]:
df.loc['b']

Name      Raju
Age         34
Height     200
Name: b, dtype: object

In [332]:
df.iloc[1]

Name      Raju
Age         34
Height     200
Name: b, dtype: object

In [334]:
df.loc[-1] = ['John', 33, 123]
df.index = ['a', 'b', 'c', 'd']

In [336]:
df

Unnamed: 0,Name,Age,Height
a,Ram,12,150
b,Raju,34,200
c,Karthi,50,175
d,John,33,123


In [338]:
df.loc['e'] = ['David', 23, 183]

In [343]:
df

Unnamed: 0,Name,Age,Height
a,Ram,12,150
b,Raju,34,200
c,Karthi,50,175
d,John,33,123
e,David,23,183


### Slicing

In [345]:
df[1:3]

Unnamed: 0,Name,Age,Height
b,Raju,34,200
c,Karthi,50,175


### Row addition

###### Append

In [None]:
# df1 = pd.DataFrame([[1, 2], [3, 4]], columns= ['a', 'b'])
# df2 = pd.DataFrame([[5, 6], [7, 8]], columns= ['a', 'b'])

# # df3 = df1.append(df2)
# df3.index = [0, 1, 2, 3]
# df3

In [353]:
print(pd.__version__)

2.2.2


In [373]:
df1 =pd.DataFrame([[1,2], [3,4]], columns= ['a','b'])
df1 =pd.DataFrame([[5,6], [7,8]], columns= ['a','b'])

df3 = pd.concat([df1,df2], ignore_index=True)
print(df3)

   a  b
0  5  6
1  7  8
2  5  6
3  7  8


In [363]:
df3.drop(0, axis = 0, inplace = True)

In [365]:
df3

Unnamed: 0,a,b
1,7,8
1,7,8


In [3]:
dic = {'Name':[7, value], 'Age':[7 values], 'Rating': [7 values]}

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1391580551.py, line 1)

In [7]:
import numpy as np
import pandas as pd

In [9]:
dic = {'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
      'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
      'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8])}

df = pd.DataFrame(dic)
df

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8


### Basic pandas functionalities


In [11]:
df.head() # top 5 records

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2


In [13]:
df.tail()

Unnamed: 0,Name,Age,Rating
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8


In [15]:
df.T

Unnamed: 0,0,1,2,3,4,5,6
Name,Tom,James,Ricky,Vin,Steve,Smith,Jack
Age,25,26,25,23,30,29,23
Rating,4.23,3.24,3.98,2.56,3.2,4.6,3.8


In [17]:
dfT = df.copy()

In [19]:
dfT = df.T
dfT

Unnamed: 0,0,1,2,3,4,5,6
Name,Tom,James,Ricky,Vin,Steve,Smith,Jack
Age,25,26,25,23,30,29,23
Rating,4.23,3.24,3.98,2.56,3.2,4.6,3.8


In [21]:
dfT.T

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8


In [23]:
df.shape #Data dimension

# (7, 3)
# 7 -> # of observations (records)
# 3 -> # of cols (variables)

(7, 3)

In [25]:
df.axes

[RangeIndex(start=0, stop=7, step=1),
 Index(['Name', 'Age', 'Rating'], dtype='object')]

In [27]:
df.empty

False

In [29]:
df.size

21

In [31]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8


In [33]:
df.sum()

Name      TomJamesRickyVinSteveSmithJack
Age                                  181
Rating                             25.61
dtype: object

In [35]:
df['Rating'].sum()

25.610000000000003

In [37]:
df['Age'].mean()

25.857142857142858

In [39]:
df['Age'].median()

25.0

In [41]:
df['Age'].std()

2.734262327610589

In [43]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8


In [45]:
df['Age'].mode()

0    23
1    25
Name: Age, dtype: int64

In [47]:
df.count()

Name      7
Age       7
Rating    7
dtype: int64

In [49]:
df['Age'].min()

23

In [51]:
df['Age'].max()

30

In [53]:
df.describe(include = 'all')

Unnamed: 0,Name,Age,Rating
count,7,7.0,7.0
unique,7,,
top,Tom,,
freq,1,,
mean,,25.857143,3.658571
std,,2.734262,0.698628
min,,23.0,2.56
25%,,24.0,3.22
50%,,25.0,3.8
75%,,27.5,4.105


In [55]:
# df.describe?

In [57]:
with pd.option_context('display.max_columns', 40):
    print(df.describe(include='all'))

       Name        Age    Rating
count     7   7.000000  7.000000
unique    7        NaN       NaN
top     Tom        NaN       NaN
freq      1        NaN       NaN
mean    NaN  25.857143  3.658571
std     NaN   2.734262  0.698628
min     NaN  23.000000  2.560000
25%     NaN  24.000000  3.220000
50%     NaN  25.000000  3.800000
75%     NaN  27.500000  4.105000
max     NaN  30.000000  4.600000


In [59]:
df.dtypes

Name       object
Age         int64
Rating    float64
dtype: object

In [61]:
df1 = df.copy()

In [63]:
df2 = df

In [65]:
id(df1)

2318848745088

In [67]:
id(df)

2318855579568

In [69]:
id(df2)

2318855579568

In [71]:
dic = {'Col1': [20, 50, 40, 60, 80], 
      'Col2': [80, 60, 40, 20, 55],
      'Col3': [20, 80, 55, 66, 33]}

df = pd.DataFrame(dic)
df

Unnamed: 0,Col1,Col2,Col3
0,20,80,20
1,50,60,80
2,40,40,55
3,60,20,66
4,80,55,33


### apply function


In [74]:
df.apply(np.mean, axis = 0)

Col1    50.0
Col2    51.0
Col3    50.8
dtype: float64

In [76]:
df.apply(np.mean, axis = 1)

0    40.000000
1    63.333333
2    45.000000
3    48.666667
4    56.000000
dtype: float64

In [78]:
df.apply(lambda x: x.max() - x.min())

Col1    60
Col2    60
Col3    60
dtype: int64

In [80]:
df.apply(lambda x: x.max() - x.min(), axis = 1)

0    60
1    30
2    15
3    46
4    47
dtype: int64

### Renaming the column

In [83]:
df1

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8


In [85]:
df1.rename(columns={'Name': 'Col1', 'Age': 'Col2', 'Rating': 'Col3'})


Unnamed: 0,Col1,Col2,Col3
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8


In [87]:
np.random.random?

[1;31mDocstring:[0m
random(size=None)

Return random floats in the half-open interval [0.0, 1.0). Alias for
`random_sample` to ease forward-porting to the new random API.
[1;31mType:[0m      builtin_function_or_method

In [89]:
np.random.random()

0.7483496163805455

In [91]:
np.random.randn()

-0.6685944056311651

In [93]:
# np.random.randn?

In [95]:
df = pd.DataFrame(np.random.randn(5, 2), index = [1, 5, 3, 2, 4], columns= ['col2', 'col1'])
df

Unnamed: 0,col2,col1
1,-0.388776,0.910576
5,-0.238851,0.729435
3,-0.435259,-0.587792
2,0.5046,-2.238363
4,0.121668,-1.059043


### sorting the indeces

In [98]:
# df.sort_index?

In [100]:
df.sort_index(ascending=False)

Unnamed: 0,col2,col1
5,-0.238851,0.729435
4,0.121668,-1.059043
3,-0.435259,-0.587792
2,0.5046,-2.238363
1,-0.388776,0.910576


### Sort the cols

In [103]:
df.sort_index(axis = 1)

Unnamed: 0,col1,col2
1,0.910576,-0.388776
5,0.729435,-0.238851
3,-0.587792,-0.435259
2,-2.238363,0.5046
4,-1.059043,0.121668


### Sort by values

In [106]:
df.sort_values(by = 'col2')

Unnamed: 0,col2,col1
3,-0.435259,-0.587792
1,-0.388776,0.910576
5,-0.238851,0.729435
4,0.121668,-1.059043
2,0.5046,-2.238363


In [108]:
df1

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8


### loc

In [119]:
df1.loc[2:4 , ['Name', 'Age']]

Unnamed: 0,Name,Age
2,Ricky,25
3,Vin,23
4,Steve,30


In [121]:
df1.loc[:2]

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98


In [123]:
df1.loc[3:,['Age','Rating']]

Unnamed: 0,Age,Rating
3,23,2.56
4,30,3.2
5,29,4.6
6,23,3.8


In [129]:
df1.loc[-2:,['Name','Rating']]

Unnamed: 0,Name,Rating
0,Tom,4.23
1,James,3.24
2,Ricky,3.98
3,Vin,2.56
4,Steve,3.2
5,Smith,4.6
6,Jack,3.8


In [157]:
df1.iloc[-3:, 1:3]

Unnamed: 0,Age,Rating
4,30,3.2
5,29,4.6
6,23,3.8


In [133]:
# df.loc[ len(df)-6:-1,['Age','Rating']]

### iloc

In [160]:
df21 = df1.copy()

In [164]:
df1.index = ['a', 'b', 'c', 'd', 'e', 'f', 'g']

In [166]:
df1

Unnamed: 0,Name,Age,Rating
a,Tom,25,4.23
b,James,26,3.24
c,Ricky,25,3.98
d,Vin,23,2.56
e,Steve,30,3.2
f,Smith,29,4.6
g,Jack,23,3.8


In [168]:
df1.loc['a': 'e', ['Age', 'Name']]

Unnamed: 0,Age,Name
a,25,Tom
b,26,James
c,25,Ricky
d,23,Vin
e,30,Steve


In [170]:
df1.iloc[0:4, 0:2]

Unnamed: 0,Name,Age
a,Tom,25
b,James,26
c,Ricky,25
d,Vin,23


In [172]:
df1.iloc[0:4, [0, 2]]

Unnamed: 0,Name,Rating
a,Tom,4.23
b,James,3.24
c,Ricky,3.98
d,Vin,2.56


In [174]:
df1.iloc[:4]

Unnamed: 0,Name,Age,Rating
a,Tom,25,4.23
b,James,26,3.24
c,Ricky,25,3.98
d,Vin,23,2.56


In [176]:
df1.iloc[:, 1:]

Unnamed: 0,Age,Rating
a,25,4.23
b,26,3.24
c,25,3.98
d,23,2.56
e,30,3.2
f,29,4.6
g,23,3.8


### Handling missing records

In [189]:
df = pd.DataFrame(np.random.randn(5, 3), index = ['a', 'b', 'c', 'd', 'e'], columns = ['one', 'two', 'three'])

In [191]:
df

Unnamed: 0,one,two,three
a,1.679656,-1.105322,0.795376
b,1.309722,-1.243248,-0.388559
c,0.340181,2.713515,0.787253
d,0.44437,0.385681,-0.908193
e,1.347082,-0.579861,-0.091947


In [193]:
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

Unnamed: 0,one,two,three
a,1.679656,-1.105322,0.795376
b,1.309722,-1.243248,-0.388559
c,0.340181,2.713515,0.787253
d,0.44437,0.385681,-0.908193
e,1.347082,-0.579861,-0.091947
f,,,
g,,,
h,,,


In [197]:
df['one'].isnull()

a    False
b    False
c    False
d    False
e    False
f     True
g     True
h     True
Name: one, dtype: bool

In [199]:
df['two'].notnull()

a     True
b     True
c     True
d     True
e     True
f    False
g    False
h    False
Name: two, dtype: bool

In [201]:
df['three'].isnull().sum()

3

### Filling missing data

In [204]:
df['one'].fillna(0)

a    1.679656
b    1.309722
c    0.340181
d    0.444370
e    1.347082
f    0.000000
g    0.000000
h    0.000000
Name: one, dtype: float64

In [206]:
df

Unnamed: 0,one,two,three
a,1.679656,-1.105322,0.795376
b,1.309722,-1.243248,-0.388559
c,0.340181,2.713515,0.787253
d,0.44437,0.385681,-0.908193
e,1.347082,-0.579861,-0.091947
f,,,
g,,,
h,,,


In [208]:
df.fillna(1, inplace=True)

In [210]:
df

Unnamed: 0,one,two,three
a,1.679656,-1.105322,0.795376
b,1.309722,-1.243248,-0.388559
c,0.340181,2.713515,0.787253
d,0.44437,0.385681,-0.908193
e,1.347082,-0.579861,-0.091947
f,1.0,1.0,1.0
g,1.0,1.0,1.0
h,1.0,1.0,1.0


In [214]:
df.loc['d':'f'] = pd.Series({'one': np.nan, 'two':np.nan, 'three': np.nan})

In [216]:
df

Unnamed: 0,one,two,three
a,1.679656,-1.105322,0.795376
b,1.309722,-1.243248,-0.388559
c,0.340181,2.713515,0.787253
d,,,
e,,,
f,,,
g,1.0,1.0,1.0
h,1.0,1.0,1.0


In [224]:
df.drop(['three'], axis = 1)

Unnamed: 0,one,two
a,1.679656,-1.105322
b,1.309722,-1.243248
c,0.340181,2.713515
d,,
e,,
f,,
g,1.0,1.0
h,1.0,1.0


### Replace 

In [227]:
df = pd.DataFrame({'one': [10, 20, 30, 40, 50, 2000], 'two': [1000, 0, 30, 40, 50, 60]})
df

Unnamed: 0,one,two
0,10,1000
1,20,0
2,30,30
3,40,40
4,50,50
5,2000,60


In [229]:
df.replace({2000: 60, 1000: 10}, inplace=True)

In [231]:
df

Unnamed: 0,one,two
0,10,10
1,20,0
2,30,30
3,40,40
4,50,50
5,60,60


### Aggregate - group by

In [240]:
df1['Score'] = pd.Series([70, 70, 80, 85, 90, 95, 90], index = ['a', 'b', 'c', 'd', 'e', 'f', 'g'])

In [242]:
df1

Unnamed: 0,Name,Age,Rating,Score
a,Tom,25,4.23,70
b,James,26,3.24,70
c,Ricky,25,3.98,80
d,Vin,23,2.56,85
e,Steve,30,3.2,90
f,Smith,29,4.6,95
g,Jack,23,3.8,90


In [244]:
df1.groupby(['Age'])['Score'].sum()

Age
23    175
25    150
26     70
29     95
30     90
Name: Score, dtype: int64

In [252]:
df1.groupby(['Age'])[['Rating', 'Score']].mean().reset_index()

Unnamed: 0,Age,Rating,Score
0,23,3.18,87.5
1,25,4.105,75.0
2,26,3.24,70.0
3,29,4.6,95.0
4,30,3.2,90.0


In [262]:
df1.groupby(['Age'])[['Rating', 'Score']].agg(['mean', 'median', 'std']).reset_index()

Unnamed: 0_level_0,Age,Rating,Rating,Rating,Score,Score,Score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,std,mean,median,std
0,23,3.18,3.18,0.876812,87.5,87.5,3.535534
1,25,4.105,4.105,0.176777,75.0,75.0,7.071068
2,26,3.24,3.24,,70.0,70.0,
3,29,4.6,4.6,,95.0,95.0,
4,30,3.2,3.2,,90.0,90.0,


### Merge operations

In [266]:
df_left = pd.DataFrame({'id': [1, 2, 3, 4, 5],
                       'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
                       'subject_id': ['sub1', 'sub2', 'sub4', 'sub6', 'sub5']})

df_right = pd.DataFrame({'id': [1, 2, 3, 4, 5],
                       'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
                       'subject_id': ['sub2', 'sub4', 'sub3', 'sub6', 'sub5']})

In [268]:
df_left

Unnamed: 0,id,Name,subject_id
0,1,Alex,sub1
1,2,Amy,sub2
2,3,Allen,sub4
3,4,Alice,sub6
4,5,Ayoung,sub5


In [270]:
df_right

Unnamed: 0,id,Name,subject_id
0,1,Billy,sub2
1,2,Brian,sub4
2,3,Bran,sub3
3,4,Bryce,sub6
4,5,Betty,sub5


### Merge using one key

In [273]:
pd.merge(df_left, df_right, on='id')

Unnamed: 0,id,Name_x,subject_id_x,Name_y,subject_id_y
0,1,Alex,sub1,Billy,sub2
1,2,Amy,sub2,Brian,sub4
2,3,Allen,sub4,Bran,sub3
3,4,Alice,sub6,Bryce,sub6
4,5,Ayoung,sub5,Betty,sub5


In [275]:
pd.merge(df_left, df_right, on= ['id', 'subject_id'])

Unnamed: 0,id,Name_x,subject_id,Name_y
0,4,Alice,sub6,Bryce
1,5,Ayoung,sub5,Betty


### Merge using 'how' argument

In [278]:
df_left.merge(df_right, on= 'subject_id', how='left')

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,1,Alex,sub1,,
1,2,Amy,sub2,1.0,Billy
2,3,Allen,sub4,2.0,Brian
3,4,Alice,sub6,4.0,Bryce
4,5,Ayoung,sub5,5.0,Betty


In [280]:
df_left.merge(df_right, on= 'subject_id', how='right')

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,2.0,Amy,sub2,1,Billy
1,3.0,Allen,sub4,2,Brian
2,,,sub3,3,Bran
3,4.0,Alice,sub6,4,Bryce
4,5.0,Ayoung,sub5,5,Betty


### concat

In [283]:
pd.concat([df_left, df_right])

Unnamed: 0,id,Name,subject_id
0,1,Alex,sub1
1,2,Amy,sub2
2,3,Allen,sub4
3,4,Alice,sub6
4,5,Ayoung,sub5
0,1,Billy,sub2
1,2,Brian,sub4
2,3,Bran,sub3
3,4,Bryce,sub6
4,5,Betty,sub5


In [287]:
pd.concat([df_left, df_right], axis=1, keys=['tab1', 'tab2'])

Unnamed: 0_level_0,tab1,tab1,tab1,tab2,tab2,tab2
Unnamed: 0_level_1,id,Name,subject_id,id,Name,subject_id
0,1,Alex,sub1,1,Billy,sub2
1,2,Amy,sub2,2,Brian,sub4
2,3,Allen,sub4,3,Bran,sub3
3,4,Alice,sub6,4,Bryce,sub6
4,5,Ayoung,sub5,5,Betty,sub5


### concat using append

In [292]:
# df_left.append(df_right)

In [296]:
print(pd.Timestamp.now())

2024-11-17 07:56:47.441394


In [310]:
text = pd.Series(['Data analyst ', '@nalytics', '  Data Science', np.nan, 'Exploratory Data Analysis   ', 'Model', '1234'])
text

0                   Data analyst 
1                       @nalytics
2                    Data Science
3                             NaN
4    Exploratory Data Analysis   
5                           Model
6                            1234
dtype: object

In [302]:
type(text)

pandas.core.series.Series

In [300]:
text.str.lower()

0                 data analyst
1                    @nalytics
2                 data science
3                          NaN
4    exploratory data analysis
5                        model
6                         1234
dtype: object

In [304]:
text.str.upper()

0                 DATA ANALYST
1                    @NALYTICS
2                 DATA SCIENCE
3                          NaN
4    EXPLORATORY DATA ANALYSIS
5                        MODEL
6                         1234
dtype: object

In [306]:
text.str.len()

0    12.0
1     9.0
2    12.0
3     NaN
4    25.0
5     5.0
6     4.0
dtype: float64

In [312]:
text.str.strip()

0                 Data analyst
1                    @nalytics
2                 Data Science
3                          NaN
4    Exploratory Data Analysis
5                        Model
6                         1234
dtype: object

In [314]:
text.str.cat(sep = '_')

'Data analyst _@nalytics_  Data Science_Exploratory Data Analysis   _Model_1234'

In [316]:
text.str.contains('a')

0     True
1     True
2     True
3      NaN
4     True
5    False
6    False
dtype: object

In [318]:
text.str.replace('@', '$')

0                   Data analyst 
1                       $nalytics
2                    Data Science
3                             NaN
4    Exploratory Data Analysis   
5                           Model
6                            1234
dtype: object

In [320]:
text.str.count('s')

0    1.0
1    1.0
2    0.0
3    NaN
4    2.0
5    0.0
6    0.0
dtype: float64

In [322]:
text.str.find('n')

0     6.0
1     1.0
2    11.0
3     NaN
4    18.0
5    -1.0
6    -1.0
dtype: float64

In [324]:
text.str.isnumeric()

0    False
1    False
2    False
3      NaN
4    False
5    False
6     True
dtype: object

In [326]:
df_left

Unnamed: 0,id,Name,subject_id
0,1,Alex,sub1
1,2,Amy,sub2
2,3,Allen,sub4
3,4,Alice,sub6
4,5,Ayoung,sub5


In [353]:
df_left[df_left['Name'] == 'Alex']

Unnamed: 0,id,Name,subject_id
0,1,Alex,sub1
