In [1]:
import pandas as pd

data = {
    'Name': ['Alice', 'Bobby', 'Charlie', 'Davyd', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df.dropna(subset=['Name'], inplace=True)
print ('After cleaning:/n', df)

After cleaning:/n       Name    Age   Salary
0    Alice  24.00  48000.0
1    Bobby  30.00  57000.0
2  Charlie  27.75  57000.0
3    Davyd  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [2]:
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadget', 'gadget']
}
df = pd.DataFrame(data)

df['Category'] = df['Category'].str.capitalize()
print('Standardized Data:/n', df)

Standardized Data:/n    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet       Gadget
4   tablet       Gadget


In [3]:
import pandas as pd

df = pd.read_csv("/home/hakkan/Downloads/datasets/titanic.csv")

if "Age" in df.columns:
    df["Age"] = df["Age"].fillna(df["Age"].mean())

if "Fare" in df.columns:
    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())

numeric_cols = ["Age", "Fare"]
numeric_cols = [col for col in numeric_cols if col in df.columns]  # cek kolom yang ada

for col in numeric_cols:
    min_val = df[col].min()
    max_val = df[col].max()
    df[col] = (df[col] - min_val) / (max_val - min_val)

categorical_cols = ["Sex", "Embarked", "Pclass"]
categorical_cols = [col for col in categorical_cols if col in df.columns]

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

df = df.drop_duplicates()

print(df.head())
print(df.isnull().sum())
print(df.shape)


   PassengerId  Survived                                               Name  \
0            1         0                            Braund, Mr. Owen Harris   
1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3         1                             Heikkinen, Miss. Laina   
3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5         0                           Allen, Mr. William Henry   

        Age  SibSp  Parch            Ticket      Fare Cabin  Sex_male  \
0  0.271174      1      0         A/5 21171  0.014151   NaN      True   
1  0.472229      1      0          PC 17599  0.139136   C85     False   
2  0.321438      0      0  STON/O2. 3101282  0.015469   NaN     False   
3  0.434531      1      0            113803  0.103644  C123     False   
4  0.434531      0      0            373450  0.015713   NaN      True   

   Embarked_Q  Embarked_S  Pclass_2  Pclass_3  
0       False        True     False   

In [5]:
import pandas as pd

df = pd.read_csv("/home/hakkan/Downloads/datasets/netflix_titles.csv")

df = df.drop_duplicates().reset_index(drop=True)

num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

df[num_cols] = df[num_cols].fillna(df[num_cols].mean())
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

for col in num_cols:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    df = df[(df[col] >= Q1 - 1.5 * IQR) & (df[col] <= Q3 + 1.5 * IQR)]

for col in num_cols:
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print(df.head())
print("\nFinal shape:", df.shape)


   release_year  show_id_s10  show_id_s100  show_id_s1000  show_id_s1001  \
0      0.941176        False         False          False          False   
1      1.000000        False         False          False          False   
2      1.000000        False         False          False          False   
3      1.000000        False         False          False          False   
4      1.000000        False         False          False          False   

   show_id_s1002  show_id_s1003  show_id_s1004  show_id_s1005  show_id_s1006  \
0          False          False          False          False          False   
1          False          False          False          False          False   
2          False          False          False          False          False   
3          False          False          False          False          False   
4          False          False          False          False          False   

   ...  \
0  ...   
1  ...   
2  ...   
3  ...   
4  ...   

 