In [1]:
import pandas as pd

In [2]:
data = {
    'Name':['Anna', 'Brian', None, 'Diana', 'Eli'],
    'Age':[28, None, 34, 40, 22],
    'Salary':[50000, 60000, None, 80000, 45000]
}

df = pd.DataFrame(data)
df.isnull()
df.isnull().sum()

Name      1
Age       1
Salary    1
dtype: int64

In [3]:
df_cleaned = df.dropna() 
print(df_cleaned)

    Name   Age   Salary
0   Anna  28.0  50000.0
3  Diana  40.0  80000.0
4    Eli  22.0  45000.0


In [7]:
average_age = df['Age'].mean() 
df['Age'] = df['Age'].fillna(average_age) 
print(df)

    Name  Age   Salary
0   Anna  NaN  50000.0
1  Brian  NaN  60000.0
2   None  NaN      NaN
3  Diana  NaN  80000.0
4    Eli  NaN  45000.0


In [12]:
data_with_duplicates = {
    'Name': ['Anna', 'Brian', 'Brian', 'Diana', 'Eli'], 
    'Age': [28, 35, 35, 40, 22] 
} 
df_dup = pd.DataFrame(data_with_duplicates) 
df_dup.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [15]:
df_no_dup = df_dup.drop_duplicates()
print(df_no_dup) 
df_no_dup = df_dup.drop_duplicates(subset=['Name'], keep='first')
print(df_no_dup) 

    Name  Age
0   Anna   28
1  Brian   35
3  Diana   40
4    Eli   22
    Name  Age
0   Anna   28
1  Brian   35
3  Diana   40
4    Eli   22


In [21]:
data = {
    'Age': ['25', '30', '35', '40'],
    'Salary': ['50000', '60000', '70000', '80000'],
    'JoinDate': ['2021-01-15', '2020-07-30', '2019-05-20', '2022-03-10']
} 
df = pd.DataFrame(data) 
print(df.dtypes)

Age         object
Salary      object
JoinDate    object
dtype: object


In [22]:
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')
print(df.dtypes)
print(df)

Age          int64
Salary       int64
JoinDate    object
dtype: object
   Age  Salary    JoinDate
0   25   50000  2021-01-15
1   30   60000  2020-07-30
2   35   70000  2019-05-20
3   40   80000  2022-03-10


In [26]:
#Converting to DateTime
df['JoinDate'] = pd.to_datetime(df['JoinDate'])
print(df)
print(df.dtypes)

   Age  Salary   JoinDate
0   25   50000 2021-01-15
1   30   60000 2020-07-30
2   35   70000 2019-05-20
3   40   80000 2022-03-10
Age                  int64
Salary               int64
JoinDate    datetime64[ns]
dtype: object


In [28]:
df['Age'] = df['Age'].astype('int') #astype is inflexible if data contains problematic values. Will raise an error instead of coercing.

In [30]:
# Sample data with incorrect types
data = {
    'Age': ['25', '30', '35', 'forty'],  # note the 'forty' string
    'Salary': ['50000', '60000', 'seventy thousand', '80000'],  # 'seventy thousand' is invalid
    'JoinDate': ['2021-01-15', '2020-07-30', '2019-05-20', 'not a date']  # last is invalid
}

df = pd.DataFrame(data)
# Inspect original dtypes
print("Original data types:\n", df.dtypes)

# Convert Age and Salary to numeric, coercing errors to NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

# Convert JoinDate to datetime, coercing errors
df['JoinDate'] = pd.to_datetime(df['JoinDate'], errors='coerce')

print("\nData types after conversion:\n", df.dtypes)
print("\nDataFrame after conversions:\n", df)

Original data types:
 Age         object
Salary      object
JoinDate    object
dtype: object

Data types after conversion:
 Age                float64
Salary             float64
JoinDate    datetime64[ns]
dtype: object

DataFrame after conversions:
     Age   Salary   JoinDate
0  25.0  50000.0 2021-01-15
1  30.0  60000.0 2020-07-30
2  35.0      NaN 2019-05-20
3   NaN  80000.0        NaT
