Checking for missing values.

In [1]:
import pandas as pd

In [2]:
# Small demo dataset: every column contains exactly one missing entry (None).
data = {
    "Name": ["Alice", "Bob", "Charlie", "James", None],
    "Age": [25, None, 35, 36, 28],
    "Salary": [50000, 60000, None, 80000, 60000],
}

# Build the frame from the column mapping and display it.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Salary
0,Alice,25.0,50000.0
1,Bob,,60000.0
2,Charlie,35.0,
3,James,36.0,80000.0
4,,28.0,60000.0


In [3]:
# Boolean mask of the frame: True marks a missing cell.
print(df.isna())

    Name    Age  Salary
0  False  False   False
1  False   True   False
2  False  False    True
3  False  False   False
4   True  False   False


In [4]:
# Summing the boolean mask column-wise gives the number of missing values per column.
print(df.isna().sum())

Name      1
Age       1
Salary    1
dtype: int64


In [5]:
# Keep only the rows in which no column is missing; df itself is left untouched.
df_cleaned = df.dropna(axis=0, how="any")
df_cleaned

Unnamed: 0,Name,Age,Salary
0,Alice,25.0,50000.0
3,James,36.0,80000.0


In [6]:
# Fill missing values, using a different replacement for each column:
#   Name   -> a placeholder string
#   Age    -> the mean of the ages that are present
#   Salary -> zero
df_filled = df.fillna(
    value={
        "Name": "Sara",
        "Age": df["Age"].mean(),
        "Salary": 0,
    }
)
df_filled

Unnamed: 0,Name,Age,Salary
0,Alice,25.0,50000.0
1,Bob,31.0,60000.0
2,Charlie,35.0,0.0
3,James,36.0,80000.0
4,Sara,28.0,60000.0


In [7]:
# Flag duplicated rows: the first occurrence stays False, later repeats are True.
print(df.duplicated(keep="first"))

0    False
1    False
2    False
3    False
4    False
dtype: bool


In [8]:
# Drop repeated rows, keeping the first occurrence of each; df is not modified.
df_unique = df.drop_duplicates(keep="first")
df_unique

Unnamed: 0,Name,Age,Salary
0,Alice,25.0,50000.0
1,Bob,,60000.0
2,Charlie,35.0,
3,James,36.0,80000.0
4,,28.0,60000.0


In [9]:
# Converting data types
# Cast the Age column to 64-bit floats and write it back onto df.
df["Age"] = df["Age"].astype("float64")

In [10]:
# Display df after the Age column was cast to float.
df

Unnamed: 0,Name,Age,Salary
0,Alice,25.0,50000.0
1,Bob,,60000.0
2,Charlie,35.0,
3,James,36.0,80000.0
4,,28.0,60000.0


In [11]:
# Convert dates: parse a date string with pd.to_datetime and store the result
# in a new "Date" column; the single parsed value is assigned to every row.
# Note that this adds the column to df itself.
df["Date"] = pd.to_datetime("2024-04-12")  # Fixed date example
print(df)

      Name   Age   Salary       Date
0    Alice  25.0  50000.0 2024-04-12
1      Bob   NaN  60000.0 2024-04-12
2  Charlie  35.0      NaN 2024-04-12
3    James  36.0  80000.0 2024-04-12
4     None  28.0  60000.0 2024-04-12


In [12]:
# Display df, which now includes the Date column.
df

Unnamed: 0,Name,Age,Salary,Date
0,Alice,25.0,50000.0,2024-04-12
1,Bob,,60000.0,2024-04-12
2,Charlie,35.0,,2024-04-12
3,James,36.0,80000.0,2024-04-12
4,,28.0,60000.0,2024-04-12


Exercise

In [13]:
# Load the exercise dataset from sample_data.csv (path is relative to the
# notebook's working directory).
# The raw header contains a padded label (the last column is read as " City"
# with a leading space), so df["City"] would raise KeyError. Strip surrounding
# whitespace from every column label right after loading.
df = pd.read_csv("sample_data.csv").rename(columns=str.strip)
df

Unnamed: 0,Name,Age,Salary,Join Date,City
0,Alice,25.0,50000.0,1/10/2023,New York
1,Bob,30.0,60000.0,,Los Angeles
2,Charlie,,55000.0,7/25/2022,Chicago
3,David,40.0,,5/15/2021,Miami
4,Alice,25.0,50000.0,1/10/2023,New York
5,Eve,29.0,70000.0,12/1/2023,San Francisco
6,Frank,35.0,75000.0,9/30/2022,Boston


In [14]:
# Boolean mask of the exercise frame: True marks a missing cell.
print(df.isna())

    Name    Age  Salary  Join Date   City
0  False  False   False      False  False
1  False  False   False       True  False
2  False   True   False      False  False
3  False  False    True      False  False
4  False  False   False      False  False
5  False  False   False      False  False
6  False  False   False      False  False


In [15]:
# Number of missing values in each column of the exercise frame.
print(df.isna().sum())

Name         0
Age          1
Salary       1
Join Date    1
 City        0
dtype: int64


In [16]:
# Keep only the fully populated rows; the original index labels are preserved.
df_filtered = df.dropna(axis=0, how="any")
df_filtered

Unnamed: 0,Name,Age,Salary,Join Date,City
0,Alice,25.0,50000.0,1/10/2023,New York
4,Alice,25.0,50000.0,1/10/2023,New York
5,Eve,29.0,70000.0,12/1/2023,San Francisco
6,Frank,35.0,75000.0,9/30/2022,Boston


In [17]:
# Fill the gaps column by column:
#   Age       -> mean of the ages that are present
#   Salary    -> median of the salaries that are present
#   Join Date -> a fixed date string
df_filled = df.fillna(
    value={
        "Age": df["Age"].mean(),
        "Salary": df["Salary"].median(),
        "Join Date": "1/10/2023",
    }
)
df_filled

Unnamed: 0,Name,Age,Salary,Join Date,City
0,Alice,25.0,50000.0,1/10/2023,New York
1,Bob,30.0,60000.0,1/10/2023,Los Angeles
2,Charlie,30.666667,55000.0,7/25/2022,Chicago
3,David,40.0,57500.0,5/15/2021,Miami
4,Alice,25.0,50000.0,1/10/2023,New York
5,Eve,29.0,70000.0,12/1/2023,San Francisco
6,Frank,35.0,75000.0,9/30/2022,Boston


In [18]:
# Flag rows that repeat an earlier row exactly (the first occurrence is not flagged).
df.duplicated(keep="first")

0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [19]:
# Remove repeated rows from the filled frame, keeping the first occurrence of each.
df_unique = df_filled.drop_duplicates(keep="first")
df_unique

Unnamed: 0,Name,Age,Salary,Join Date,City
0,Alice,25.0,50000.0,1/10/2023,New York
1,Bob,30.0,60000.0,1/10/2023,Los Angeles
2,Charlie,30.666667,55000.0,7/25/2022,Chicago
3,David,40.0,57500.0,5/15/2021,Miami
5,Eve,29.0,70000.0,12/1/2023,San Francisco
6,Frank,35.0,75000.0,9/30/2022,Boston


In [20]:
# Cast Age to integers, writing the result back onto df_filled.
# astype(int) drops the fractional part, so the mean-imputed age 30.666667
# is stored as 30.
# Note: this modifies df_filled, not the de-duplicated df_unique.
df_filled["Age"] = df_filled["Age"].astype(int)

In [21]:
# Display df_filled with Age now stored as integers.
df_filled

Unnamed: 0,Name,Age,Salary,Join Date,City
0,Alice,25,50000.0,1/10/2023,New York
1,Bob,30,60000.0,1/10/2023,Los Angeles
2,Charlie,30,55000.0,7/25/2022,Chicago
3,David,40,57500.0,5/15/2021,Miami
4,Alice,25,50000.0,1/10/2023,New York
5,Eve,29,70000.0,12/1/2023,San Francisco
6,Frank,35,75000.0,9/30/2022,Boston
