In [1]:
import pandas as pd
import numpy as np

# Create a DataFrame with synthetic data and missing values
df = pd.DataFrame({'A': [1, 2, np.nan, 4],
                   'B': [5, np.nan, np.nan, 8],
                   'C': [9, 10, 11, np.nan]})
print(df)

# Replace missing values with 0
df.fillna(0, inplace=True)
print(df)

# Replace missing values with the mean of the column
df = pd.DataFrame({'A': [1, 2, np.nan, 4],
                   'B': [5, np.nan, np.nan, 8],
                   'C': [9, 10, 11, np.nan]})
df = df.apply(lambda x: x.fillna(x.mean()),axis=0)
print(df)

# Drop rows with missing values
df.dropna(inplace=True)
print(df)


     A    B     C
0  1.0  5.0   9.0
1  2.0  NaN  10.0
2  NaN  NaN  11.0
3  4.0  8.0   NaN
     A    B     C
0  1.0  5.0   9.0
1  2.0  0.0  10.0
2  0.0  0.0  11.0
3  4.0  8.0   0.0
          A    B     C
0  1.000000  5.0   9.0
1  2.000000  6.5  10.0
2  2.333333  6.5  11.0
3  4.000000  8.0  10.0
          A    B     C
0  1.000000  5.0   9.0
1  2.000000  6.5  10.0
2  2.333333  6.5  11.0
3  4.000000  8.0  10.0


In [10]:
import pandas as pd
import numpy as np
from scipy import stats
# Create a DataFrame with synthetic data and outliers
df = pd.DataFrame({'A': [1, 2, 30, 4, 50],
                   'B': [5, 10, 15, 8, 20],
                   'C': [9, 10, 11, 12, 30]})
print(df)

# Identify and remove outliers based on z-score
z = np.abs(stats.zscore(df))
df = df[(z < 3).all(axis=1)]
print(df)

# Identify and remove outliers based on quantile
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
print(df)


    A   B   C
0   1   5   9
1   2  10  10
2  30  15  11
3   4   8  12
4  50  20  30
    A   B   C
0   1   5   9
1   2  10  10
2  30  15  11
3   4   8  12
4  50  20  30
    A   B   C
0   1   5   9
1   2  10  10
2  30  15  11
3   4   8  12


In [7]:
import pandas as pd

# Create a DataFrame with synthetic data and duplicate rows
df = pd.DataFrame({'A': [1, 2, 2, 4],
                   'B': [5, 6, 6, 8],
                   'C': [9, 10, 10, 12]})
print(df)

# Remove duplicate rows
df.drop_duplicates(inplace=True)
print(df)


   A  B   C
0  1  5   9
1  2  6  10
2  2  6  10
3  4  8  12
   A  B   C
0  1  5   9
1  2  6  10
3  4  8  12


In [11]:
import pandas as pd

# Create a DataFrame with synthetic data and wrong data types
df = pd.DataFrame({'A': [1, '2', '3', 4],
                   'B': [5, '6', '7', 8],
                   'C': [9, 10, '11', 12]})
print(df)

# Change data type of column 'A' from string to int
df['A'] = df['A'].astype(int)
print(df)

#Change data type of column 'B' and 'C' from string to int
df[['B','C']] = df[['B','C']].apply(pd.to_numeric, errors='coerce')
print(df)



   A  B   C
0  1  5   9
1  2  6  10
2  3  7  11
3  4  8  12
   A  B   C
0  1  5   9
1  2  6  10
2  3  7  11
3  4  8  12
   A  B   C
0  1  5   9
1  2  6  10
2  3  7  11
3  4  8  12


In [12]:
import pandas as pd

# Create a DataFrame with synthetic data and wrong data types
df = pd.DataFrame({'A': [1, '2', '3', 4],
                   'B': [5, '6', '7', 8],
                   'C': [9, 10, '11', 12]})
print(df)

# Change data type of column 'A' from string to int
df['A'] = df['A'].astype(int)
print(df)

#Change data type of column 'B' and 'C' from string to int
df[['B','C']] = df[['B','C']].apply(pd.to_numeric, errors='coerce')
print(df)



   A  B   C
0  1  5   9
1  2  6  10
2  3  7  11
3  4  8  12
   A  B   C
0  1  5   9
1  2  6  10
2  3  7  11
3  4  8  12
   A  B   C
0  1  5   9
1  2  6  10
2  3  7  11
3  4  8  12
