# Dealing with missing data

In [112]:
import pandas as pd
import numpy as np

# Synthetic sample for the missing-data examples: three numeric columns,
# each of which contains at least one NaN.
df = pd.DataFrame(
    dict(
        A=[1, 2, np.nan, 4],
        B=[5, np.nan, np.nan, 8],
        C=[9, 10, 11, np.nan],
    )
)
df


Unnamed: 0,A,B,C
0,1.0,5.0,9.0
1,2.0,,10.0
2,,,11.0
3,4.0,8.0,


In [111]:

# Compare three ways of handling NaNs, all starting from the same frame.
# The frame is built once here and never modified, so the cell prints the
# same three results however often (or in whatever order) it is run.
df = pd.DataFrame({'A': [1, 2, np.nan, 4],
                   'B': [5, np.nan, np.nan, 8],
                   'C': [9, 10, 11, np.nan]})

# Replace missing values with 0
df_zero_filled = df.fillna(0)
print(df_zero_filled)

# Replace missing values with the mean of the column
# (fillna aligns the Series returned by df.mean() on column labels, so each
# column is filled with its own mean)
df_mean_filled = df.fillna(df.mean())
print(df_mean_filled)

# Drop rows with missing values
df_dropped = df.dropna()
print(df_dropped)


     A    B    C
0  1.0  5.0  9.0
          A    B     C
0  1.000000  5.0   9.0
1  2.000000  6.5  10.0
2  2.333333  6.5  11.0
3  4.000000  8.0  10.0
     A    B    C
0  1.0  5.0  9.0


# Dealing with outliers

In [114]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
# Synthetic sample for the outlier examples: three integer columns, with an
# extreme value (the 300 in column 'A') among otherwise small numbers.
df = pd.DataFrame(
    dict(
        A=[1, 2, 300, 4, 15],
        B=[5, 10, 15, 8, 16],
        C=[9, 10, 11, 12, 11],
    )
)

df

Unnamed: 0,A,B,C
0,1,5,9
1,2,10,10
2,300,15,11
3,4,8,12
4,15,16,11


In [None]:
# Detect and drop outlier rows with two methods (IQR fences, z-score).
# The sample frame is rebuilt here and never modified, so both methods see
# the same input and the cell can be re-run safely.
df = pd.DataFrame({'A': [1, 2, 300, 4, 15],
                   'B': [5, 10, 15, 8, 16],
                   'C': [9, 10, 11, 12, 11]})

# Identify and remove outliers based on quantile
# A value is flagged when it lies more than 1.5 * IQR below Q1 or above Q3
# of its own column.
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
# One boolean per row, computed once: True when any column is flagged.
iqr_outlier_rows = ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)
outliers = df[iqr_outlier_rows]
print('Data with outliers: \n',outliers)

df_iqr_clean = df[~iqr_outlier_rows]
print('\n Data without outliers: \n',df_iqr_clean)

# Identify and remove outliers based on z-score
# The notebook imports the function itself (`from scipy.stats import zscore`),
# so it is called by that name. Working on the plain ndarray keeps `z` a
# 2-D array regardless of how scipy treats DataFrame input.
z = np.abs(zscore(df.to_numpy(), axis=0))
# Same row-level convention as the IQR method: a row is an outlier when any
# of its columns exceeds the threshold; the clean frame is its complement.
z_outlier_rows = (z > 1.8).any(axis=1)
outliers = df[z_outlier_rows]
print('Data with outliers: \n',outliers)

df_z_clean = df[~z_outlier_rows]
print('\n Data without outliers: \n',df_z_clean)


# Dealing with duplicate and incorrect data

In [116]:
# Sample records containing one exact duplicate row (the second 'David')
# and one impossible value (a negative age).
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David','David'],
        'Age': [25, -3, 30, 40, 40],
        'Gender': ['F', 'M', 'M', 'M', 'M']}
df_raw = pd.DataFrame(data)

# Remove duplicate rows
# (returns a new frame; df_raw is left untouched so the cell can be re-run)
df_unique = df_raw.drop_duplicates()
print(df_unique)

# Replace incorrect data with NaN
# Series.mask swaps every negative age for NaN and upcasts the integer
# column to float in one step, instead of assigning NaN into an int64
# column through .loc (a dtype-incompatible write that recent pandas
# versions deprecate).
df = df_unique.assign(Age=df_unique['Age'].mask(df_unique['Age'] < 0))
print(df)

      Name  Age Gender
0    Alice   25      F
1      Bob   -3      M
2  Charlie   30      M
3    David   40      M
      Name   Age Gender
0    Alice  25.0      F
1      Bob   NaN      M
2  Charlie  30.0      M
3    David  40.0      M


In [115]:
# Display whatever `df` is currently bound to.
# NOTE(review): the saved output below is the A/B/C outlier frame, whereas the
# cell directly above binds `df` to the Name/Age/Gender frame. The output was
# presumably captured in a different execution order (In [115] vs In [116]);
# restart the kernel and run all cells to refresh it.
df

Unnamed: 0,A,B,C
0,1,5,9
1,2,10,10
2,300,15,11
3,4,8,12
4,15,16,11
