In [1]:
import pandas as pd
import numpy as np

In [3]:
data = {
    'age': [25, 28, np.nan, 35, 45, 45, 52, 80],
    'salary': [50000, 54000, 50000, 60000, 200000, 54000, 75000, 80000],
    'city': ['New York', 'Los Angeles', 'New York', 'Chicago', 'Chicago', 'Los Angeles', 'New York', 'Boston']
}
df = pd.DataFrame(data)
print (df)

    age  salary         city
0  25.0   50000     New York
1  28.0   54000  Los Angeles
2   NaN   50000     New York
3  35.0   60000      Chicago
4  45.0  200000      Chicago
5  45.0   54000  Los Angeles
6  52.0   75000     New York
7  80.0   80000       Boston


In [5]:
duplicate_row = pd.DataFrame({'age': [28], 'salary': [54000], 'city': ['Los Angeles']})
df = pd.concat([df, duplicate_row], ignore_index=True)
print(df)

    age  salary         city
0  25.0   50000     New York
1  28.0   54000  Los Angeles
2   NaN   50000     New York
3  35.0   60000      Chicago
4  45.0  200000      Chicago
5  45.0   54000  Los Angeles
6  52.0   75000     New York
7  80.0   80000       Boston
8  28.0   54000  Los Angeles
9  28.0   54000  Los Angeles


In [7]:
median_age = df['age'].median()
df['age'].fillna(median_age, inplace=True)
print(df)

    age  salary         city
0  25.0   50000     New York
1  28.0   54000  Los Angeles
2  35.0   50000     New York
3  35.0   60000      Chicago
4  45.0  200000      Chicago
5  45.0   54000  Los Angeles
6  52.0   75000     New York
7  80.0   80000       Boston
8  28.0   54000  Los Angeles
9  28.0   54000  Los Angeles


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(median_age, inplace=True)


In [8]:
df.drop_duplicates(inplace=True)
print(df)

    age  salary         city
0  25.0   50000     New York
1  28.0   54000  Los Angeles
2  35.0   50000     New York
3  35.0   60000      Chicago
4  45.0  200000      Chicago
5  45.0   54000  Los Angeles
6  52.0   75000     New York
7  80.0   80000       Boston


In [10]:
Q1 = df['salary'].quantile(0.25)
Q3 = df['salary'].quantile(0.75)
IQR = Q3 - Q1
print(IQR)

23250.0


In [12]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_cleaned = df[(df['salary'] >= lower_bound) & (df['salary'] <= upper_bound)]
print (df_cleaned)

    age  salary         city
0  25.0   50000     New York
1  28.0   54000  Los Angeles
2  35.0   50000     New York
3  35.0   60000      Chicago
5  45.0   54000  Los Angeles
6  52.0   75000     New York
7  80.0   80000       Boston
