## Prac1: Data Cleaning

In [None]:
'''
    Apply data cleaning techniques on any dataset (e.g., Paper Reviews dataset in UCI repository).
    Techniques may include handling missing values, outliers and inconsistent values. 
    A set of validation rules can be prepared based on the dataset and validations can be performed.
'''

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('fruit_classification_dataset.csv')
print(df.head())

   size (cm)  shape  weight (g)  avg_price (â‚¹)   color  taste     fruit_name
0       25.4  round      3089.2          137.1   green  sweet     watermelon
1       24.6  round      3283.9          163.8   green  sweet     watermelon
2        7.8  round       319.0           91.3   green  sweet  custard apple
3       20.0   oval      1607.0           85.7  orange  sweet         papaya
4       10.2   long       131.5           37.8  yellow  sweet         banana


### 1. Handling Missing Value

In [4]:
# for numeric columns -> replace missing values with median
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

In [5]:
# for categorical columns -> replace missing values with mode
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

### 2. Handling Outliers (z-score method)

In [6]:
# handling outliers z-score method
for col in num_cols:
    mean = df[col].mean()
    std = df[col].std()
    z_scores = (df[col] - mean) / std

In [7]:
# replace outliner > 3 and < -3 with median
# df.loc[z_scores > 3, num_cols] = df[num_cols].median()
# df.loc[z_scores < -3, num_cols] = df[num_cols].median()

df.loc[z_scores > 3, col] = df[col].median()
df.loc[z_scores < -3, col] = df[col].median()

### 3. Fixing Inconsistent Values

In [10]:
# fruit_name column has inconsistent words
if 'fruit_name' in df.columns:
    df['fruit_name'] = df['fruit_name'].str.lower()
# size column might contain text variations
if 'size' in df.columns:
    df['size'] = df['size'].str.lower()

### 4. Validation Rules

In [22]:
rules = {}
# Rule 1: Weight must be positive
if 'weight' in df.columns:
    rules['weight_valid'] = (df['weight'] > 0).sum()

# Rule 2: Fruit name must be one of known fruits
if 'fruit_name' in df.columns:
    # valid_fruit = ['apple', 'banana', 'mango', 'orange', 'grape']
    valid_fruits = ['watermelon','custard apple','papaya','banana','pomegranate','grape','plum','guava','blueberry','coconut','cherry','mango','pineapple','pear','lychee','apple','orange','dragon fruit','kiwi','strawberry']
    rules['fruit_name_valid'] = df['fruit_name'].isin(valid_fruits).sum()
# Rule 3: Size must be small/medium/large
if 'size' in df.columns:
    rules['size_valid'] = df['size'].isin(['small', 'medium', 'large']).sum()

print("\nValidation Summary: ")
print(rules)


Validation Summary: 
{'fruit_name_valid': np.int64(10000)}


In [23]:
# Save cleaned data
df.to_csv('fruit_classification_dataset_cleaned.csv', index=False)
print("\nData cleaning completed!")


Data cleaning completed!
