In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
data = {'A':[1,2,np.nan,4,5],
        'B':[np.nan,2,3,4,np.nan],
        'C':[10,20,30,40,50]}
df = pd.DataFrame(data)

print("DataFrame with NaNs:")
df

DataFrame with NaNs:


Unnamed: 0,A,B,C
0,1.0,,10
1,2.0,2.0,20
2,,3.0,30
3,4.0,4.0,40
4,5.0,,50


In [4]:
#Data with null values
df.isnull()

Unnamed: 0,A,B,C
0,False,True,False
1,False,False,False
2,True,False,False
3,False,False,False
4,False,True,False


In [5]:
#Data count null values
df.isnull().sum()

A    1
B    2
C    0
dtype: int64

In [6]:
#To know the percentage of null values 
df.isnull().sum() / len(df)*100

A    20.0
B    40.0
C     0.0
dtype: float64

In [7]:
#Updating a dataFrame (it won't show raws that include null values)
df_dropped = df.dropna()
df_dropped 

Unnamed: 0,A,B,C
1,2.0,2.0,20
3,4.0,4.0,40


In [8]:
#To delete any column that includes null values
df_dropped = df.dropna(axis = 1)
df_dropped 

Unnamed: 0,C
0,10
1,20
2,30
3,40
4,50


In [9]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'A': [None, None, None],
    'B': [None, None, None]
})

df_dropped = df.dropna(axis=1)
print(df_dropped)

Empty DataFrame
Columns: []
Index: [0, 1, 2]


In [10]:
data = {'A':[1,2,np.nan,4,5,np.nan],
        'B':[np.nan,2,3,4,np.nan,np.nan],
        'C':[10,20,30,40,50,np.nan]}
df = pd.DataFrame(data)

print("DataFrame with NaNs:")
df

DataFrame with NaNs:


Unnamed: 0,A,B,C
0,1.0,,10.0
1,2.0,2.0,20.0
2,,3.0,30.0
3,4.0,4.0,40.0
4,5.0,,50.0
5,,,


In [11]:
df_dropped = df.dropna(axis = 1, how='all', inplace = False)
df_dropped 

Unnamed: 0,A,B,C
0,1.0,,10.0
1,2.0,2.0,20.0
2,,3.0,30.0
3,4.0,4.0,40.0
4,5.0,,50.0
5,,,


In [12]:
df_dropped = df.dropna(thresh = 1)
df_dropped 

Unnamed: 0,A,B,C
0,1.0,,10.0
1,2.0,2.0,20.0
2,,3.0,30.0
3,4.0,4.0,40.0
4,5.0,,50.0


In [13]:
df_dropped = df.dropna(thresh = 4, axis = 1)
df_dropped 
#It will remove (B column) as it has the most number of null values

Unnamed: 0,A,C
0,1.0,10.0
1,2.0,20.0
2,,30.0
3,4.0,40.0
4,5.0,50.0
5,,


In [14]:
#To fill all null values with zeroes, but that will affects the accuracy as (e.g.avg)
df.fillna(0)

Unnamed: 0,A,B,C
0,1.0,0.0,10.0
1,2.0,2.0,20.0
2,0.0,3.0,30.0
3,4.0,4.0,40.0
4,5.0,0.0,50.0
5,0.0,0.0,0.0


In [15]:
#Instead of filling using zeroes, we use mean to fill value, to avoid affecting mean
#We can use the mode as will
#We can use avg
df.fillna(df.mean())

Unnamed: 0,A,B,C
0,1.0,3.0,10.0
1,2.0,2.0,20.0
2,3.0,3.0,30.0
3,4.0,4.0,40.0
4,5.0,3.0,50.0
5,3.0,3.0,30.0


In [16]:
data = {'A':[1,2,np.nan,4,5,np.nan],
        'B':[np.nan,2,3,4,np.nan,np.nan],
        'C':[10,20,30,40,50,np.nan],
       'D':['S','S',np.nan,'A','C',50]}
df = pd.DataFrame(data)

print("DataFrame with NaNs:")
df

DataFrame with NaNs:


Unnamed: 0,A,B,C,D
0,1.0,,10.0,S
1,2.0,2.0,20.0,S
2,,3.0,30.0,
3,4.0,4.0,40.0,A
4,5.0,,50.0,C
5,,,,50


In [17]:
#We can't use mean in this case
df.fillna(df.mean(numeric_only = True))

Unnamed: 0,A,B,C,D
0,1.0,3.0,10.0,S
1,2.0,2.0,20.0,S
2,3.0,3.0,30.0,
3,4.0,4.0,40.0,A
4,5.0,3.0,50.0,C
5,3.0,3.0,30.0,50


In [18]:
#We use it with sequnece (e.g.high prices)
df.fillna(df.median(numeric_only = True))

Unnamed: 0,A,B,C,D
0,1.0,3.0,10.0,S
1,2.0,2.0,20.0,S
2,3.0,3.0,30.0,
3,4.0,4.0,40.0,A
4,5.0,3.0,50.0,C
5,3.0,3.0,30.0,50


In [19]:
df.fillna(df.mode().iloc[0])

Unnamed: 0,A,B,C,D
0,1.0,2.0,10.0,S
1,2.0,2.0,20.0,S
2,1.0,3.0,30.0,S
3,4.0,4.0,40.0,A
4,5.0,2.0,50.0,C
5,1.0,2.0,10.0,50


In [20]:
#Updating for categorical column
df[['D']].fillna(df.mode().iloc[0])

Unnamed: 0,D
0,S
1,S
2,S
3,A
4,C
5,50


In [21]:
df['D']=df[['D']].fillna(df.mode().iloc[0])

In [22]:
df

Unnamed: 0,A,B,C,D
0,1.0,,10.0,S
1,2.0,2.0,20.0,S
2,,3.0,30.0,S
3,4.0,4.0,40.0,A
4,5.0,,50.0,C
5,,,,50


In [23]:
#Backward fill
df.fillna(method='bfill', inplace=True)

  df.fillna(method='bfill', inplace=True)


In [24]:
#Foreard fill
df.fillna(method='ffill', inplace=True)

  df.fillna(method='ffill', inplace=True)


In [25]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,10.0,S
1,2.0,2.0,20.0,S
2,4.0,3.0,30.0,S
3,4.0,4.0,40.0,A
4,5.0,4.0,50.0,C
5,5.0,4.0,50.0,50


In [26]:
data_dup = {'A': [1, 2, 3, 2, 4],
            'B': ['X', 'Y', 'Z', 'Y', 'W'],
            'C': [10, 20, 30, 20, 40]}
df_dup = pd.DataFrame(data_dup)
df_dup

Unnamed: 0,A,B,C
0,1,X,10
1,2,Y,20
2,3,Z,30
3,2,Y,20
4,4,W,40


In [27]:
df_dup[df_dup.duplicated(keep = 'first')]

Unnamed: 0,A,B,C
3,2,Y,20


In [28]:
df_dup.drop_duplicates(subset = ['A','B'])

Unnamed: 0,A,B,C
0,1,X,10
1,2,Y,20
2,3,Z,30
4,4,W,40


In [29]:
! pip install -U scikit-learn



In [46]:
from sklearn.preprocessing import StandardScaler

In [47]:
scaler_standard = StandardScaler()
df_standard_scaled = pd.DataFrame(scaler_standard.fit_transform(df_scale), columns=df_scale.columns)
print("\nDataFrame after Standardization:")
print(df_standard_scaled)

NameError: name 'df_scale' is not defined

In [48]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [56]:
data_dup = {'A': [1, 2, 3, 2, 4,5],
            'B': ['X', 'Y', 'Z', 'Y', 'W','W'],
            'C': [10, 20, 30, 20, 40,20]}
df_dup = pd.DataFrame(data_dup)
df_dup

Unnamed: 0,A,B,C
0,1,X,10
1,2,Y,20
2,3,Z,30
3,2,Y,20
4,4,W,40
5,5,W,20


In [57]:
df_dup

Unnamed: 0,A,B,C
0,1,X,10
1,2,Y,20
2,3,Z,30
3,2,Y,20
4,4,W,40
5,5,W,20


In [58]:
le=LabelEncoder() 

In [59]:
data_dup['B_label']=le.fit_transform(data_dup['B'])
data_dup['B_label']

array([1, 2, 3, 2, 0, 0])

In [67]:
df_dup['B_label1']=le.fit_transform(df_dup['B'])
df_dup['B_label1']

0    1
1    2
2    3
3    2
4    0
5    0
Name: B_label1, dtype: int64

In [70]:
df_dup

Unnamed: 0,A,B,C,B_label1
0,1,X,10,1
1,2,Y,20,2
2,3,Z,30,3
3,2,Y,20,2
4,4,W,40,0
5,5,W,20,0


In [71]:
from sklearn.preprocessing import OneHotEncoder

In [73]:
ohe = OneHotEncoder(sparse_output = False)

In [74]:
one_hot_B = ohe.fit_transform(df_dup[['B']])
one_hot_B

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [78]:
ohe_columns = ohe.get_feature_names_out(['B'])
ohe_columns

array(['B_W', 'B_X', 'B_Y', 'B_Z'], dtype=object)

In [79]:
ohe_df = pd.DataFrame(one_hot_B, columns = ohe_columns)
ohe_df

Unnamed: 0,B_W,B_X,B_Y,B_Z
0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0


In [83]:
combined_df = pd.concat((df_dup, ohe_df), axis =1)
combined_df

Unnamed: 0,A,B,C,B_label1,B_W,B_X,B_Y,B_Z
0,1,X,10,1,0.0,1.0,0.0,0.0
1,2,Y,20,2,0.0,0.0,1.0,0.0
2,3,Z,30,3,0.0,0.0,0.0,1.0
3,2,Y,20,2,0.0,0.0,1.0,0.0
4,4,W,40,0,1.0,0.0,0.0,0.0
5,5,W,20,0,1.0,0.0,0.0,0.0
