In [33]:
import pandas as pd
import numpy as np

In [34]:
data = {
    "Name": ['Iyynes', 'Adithya', 'Deepa', 'Muthu', 'Sri', 'Gobika', np.nan, 'Iyynes'],
    "Age": [20, 15, 18, np.nan, 19, 18, np.nan, 20],
    "Department": ['Full-stack', np.nan, 'UI/UX', 'Backend', 'Frontend', 'AI', np.nan, 'Full-stack'],
    "Salary": [90000, 70000, 80000, 90000, np.nan, 55000, np.nan, 90000]
}

In [35]:
df = pd.DataFrame(data)
df['Promoted Salary'] = df['Salary'] + 15000
print(df)

      Name   Age  Department   Salary  Promoted Salary
0   Iyynes  20.0  Full-stack  90000.0         105000.0
1  Adithya  15.0         NaN  70000.0          85000.0
2    Deepa  18.0       UI/UX  80000.0          95000.0
3    Muthu   NaN     Backend  90000.0         105000.0
4      Sri  19.0    Frontend      NaN              NaN
5   Gobika  18.0          AI  55000.0          70000.0
6      NaN   NaN         NaN      NaN              NaN
7   Iyynes  20.0  Full-stack  90000.0         105000.0


In [36]:
# to check the number of null values
print('\n True for null and vice versa \n', df.isnull(), sep='\n') #returns true of false


 True for null and vice versa 

    Name    Age  Department  Salary  Promoted Salary
0  False  False       False   False            False
1  False  False        True   False            False
2  False  False       False   False            False
3  False   True       False   False            False
4  False  False       False    True             True
5  False  False       False   False            False
6   True   True        True    True             True
7  False  False       False   False            False


In [37]:
# to count the null values
print('\n Count of null values in each column \n', df.isnull().sum(),sep='\n')


 Count of null values in each column 

Name               1
Age                2
Department         2
Salary             2
Promoted Salary    2
dtype: int64


In [38]:
df.dropna()
# or 
df.dropna(how='any') #drops the row with any null value, I mean if there's a single null value then the row is eliminated.

print("\n Dataset without null values \n", df.dropna(how='any'))


 Dataset without null values 
      Name   Age  Department   Salary  Promoted Salary
0  Iyynes  20.0  Full-stack  90000.0         105000.0
2   Deepa  18.0       UI/UX  80000.0          95000.0
5  Gobika  18.0          AI  55000.0          70000.0
7  Iyynes  20.0  Full-stack  90000.0         105000.0


In [39]:
# to drop the rows with all values as null:
df.dropna(how='all')
print('\n dataset without all values in a row as null', df.dropna(how='all', inplace=True), sep='\n')


 dataset without all values in a row as null
None


In [40]:
# filling empty values 
df.fillna(0)

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Iyynes,20.0,Full-stack,90000.0,105000.0
1,Adithya,15.0,0,70000.0,85000.0
2,Deepa,18.0,UI/UX,80000.0,95000.0
3,Muthu,0.0,Backend,90000.0,105000.0
4,Sri,19.0,Frontend,0.0,0.0
5,Gobika,18.0,AI,55000.0,70000.0
7,Iyynes,20.0,Full-stack,90000.0,105000.0


In [41]:
# fill the age NaN with mean values
df['Age'].fillna(df['Age'].mean())
print(df['Age'].fillna(df['Age'].mean()))

0    20.000000
1    15.000000
2    18.000000
3    18.333333
4    19.000000
5    18.000000
7    20.000000
Name: Age, dtype: float64


In [42]:
# fill the Department and salary NaN with most repeated department and salary respectively
df[['Department', 'Salary']] = df[['Department', 'Salary']].fillna(df.mode().iloc[0])
print(df[['Department', 'Salary']].fillna(df.mode().iloc[0]))

   Department   Salary
0  Full-stack  90000.0
1  Full-stack  70000.0
2       UI/UX  80000.0
3     Backend  90000.0
4    Frontend  90000.0
5          AI  55000.0
7  Full-stack  90000.0


In [43]:
# forward and backware fill 
# forward fill [ffill] - fill the above value to the null
print(df['Age'].fillna(method = 'ffill'))
# use anyone but below is preferred
print(df['Age'].ffill())

0    20.0
1    15.0
2    18.0
3    18.0
4    19.0
5    18.0
7    20.0
Name: Age, dtype: float64
0    20.0
1    15.0
2    18.0
3    18.0
4    19.0
5    18.0
7    20.0
Name: Age, dtype: float64


  print(df['Age'].fillna(method = 'ffill'))


In [44]:
# backward fill 
# backward fill [bfill] - fill the below value to the null

print(df['Age'].bfill())
# or 
print(df['Age'].fillna(method = 'bfill'))

0    20.0
1    15.0
2    18.0
3    19.0
4    19.0
5    18.0
7    20.0
Name: Age, dtype: float64
0    20.0
1    15.0
2    18.0
3    19.0
4    19.0
5    18.0
7    20.0
Name: Age, dtype: float64


  print(df['Age'].fillna(method = 'bfill'))


In [45]:
# if first and last value is null don't use forward and backward fill respectively
# instead use statistical analysis like mean, median and mode 

In [46]:
# replace a particular value with another

print("Original dataset\n", df['Name'], sep='\n')
print("\n dataset with replaced value \n",df['Name'].replace("Sri", "Srindhi"))

Original dataset

0     Iyynes
1    Adithya
2      Deepa
3      Muthu
4        Sri
5     Gobika
7     Iyynes
Name: Name, dtype: object

 dataset with replaced value 
 0     Iyynes
1    Adithya
2      Deepa
3      Muthu
4    Srindhi
5     Gobika
7     Iyynes
Name: Name, dtype: object


In [None]:
# to make the change permanent:
df['Name'] = df['Name'].replace("Sri", "unknown")
df

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Iyynes,20.0,Full-stack,90000.0,105000.0
1,Adithya,15.0,Full-stack,70000.0,85000.0
2,Deepa,18.0,UI/UX,80000.0,95000.0
3,Muthu,,Backend,90000.0,105000.0
4,unknown,19.0,Frontend,90000.0,
5,Gobika,18.0,AI,55000.0,70000.0
7,Iyynes,20.0,Full-stack,90000.0,105000.0


In [56]:
# df_dup = df[df.duplicated()]

# first : Mark duplicates as True except for the first occurrence.
df_dup = df[df.duplicated(keep='first')]

# last : Mark duplicates as True except for the last occurrence.
df_dup = df[df.duplicated(keep='last')]

# False : Mark all duplicates as True.
df_dup = df[df.duplicated(keep=False)]
df_dup

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Iyynes,20.0,Full-stack,90000.0,105000.0
7,Iyynes,20.0,Full-stack,90000.0,105000.0


In [58]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Iyynes,20.0,Full-stack,90000.0,105000.0
1,Adithya,15.0,Full-stack,70000.0,85000.0
2,Deepa,18.0,UI/UX,80000.0,95000.0
3,Muthu,,Backend,90000.0,105000.0
4,unknown,19.0,Frontend,90000.0,
5,Gobika,18.0,AI,55000.0,70000.0
