In [None]:
# Data Cleaning, Munging, and Transformation

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
## 2. Load Dataset

In [None]:
# Read the crop-recommendation CSV (path is relative to the working directory)
# into the notebook-wide DataFrame `df`.
csv_path = 'Crop_recommendationV2.csv'
df = pd.read_csv(csv_path)
print("Dataset loaded successfully.")

In [None]:
## 3. Initial Data Exploration

In [None]:
# Quick look at the freshly loaded frame: sample rows, schema, summary
# statistics and overall dimensions, in that order.
print("\nFirst 5 rows of the dataset:")
head_rows = df.head()
display(head_rows)

# DataFrame.info() writes its report straight to stdout, so no display() here.
print("\nDataset Information:")
df.info()

print("\nDescriptive Statistics:")
summary_stats = df.describe()
display(summary_stats)

n_rows_cols = df.shape
print("\nDataset Shape:", n_rows_cols)

In [None]:
## 4. Data Cleaning: Handling Missing Values and Duplicate Rows

In [None]:
# Count nulls per column, then either confirm the data is complete or report
# the affected columns and drop every row that contains a null.
print("\nChecking for missing values:")
missing_values = df.isnull().sum()

if missing_values.sum() == 0:
    # Nothing to report: skip the per-column listing, which would otherwise
    # print an empty Series ("Series([], dtype: int64)") as noise.
    print("No missing values found in the dataset.")
else:
    # Show only the columns that actually contain nulls, with their counts.
    print(missing_values[missing_values > 0])

    print("\nHandling missing values (example: dropping rows with any missing values):")
    # dropna() with default arguments removes a row if ANY column is null.
    df_cleaned = df.dropna()
    print("Shape after dropping missing values:", df_cleaned.shape)
    print("Missing values after dropping:", df_cleaned.isnull().sum().sum())
    df = df_cleaned  # Update df to the cleaned version

# Detect fully duplicated rows (all columns equal to an earlier row) and, when
# any are present, keep only the first occurrence of each.
print("\nChecking for duplicate rows:")
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicates}")

if duplicates > 0:
    print("Dropping duplicate rows...")
    df = df.drop_duplicates()
    print("Shape after dropping duplicates:", df.shape)

In [None]:
## 5. Data Transformation: Data Type Conversion

In [None]:
# List the dtype pandas inferred for every column so that any column needing
# conversion can be spotted before feature engineering.
print("\nCurrent data types:")
column_dtypes = df.dtypes
print(column_dtypes)

# Conversion template (left disabled; nothing is converted in this cell).
# If, say, 'N' had been read as object and should be numeric:
# df['N'] = pd.to_numeric(df['N'], errors='coerce')
# print("\nData types after potential conversion:")
# print(df.dtypes)

In [None]:
## 6. Data Munging: Basic Feature Engineering (Example)

In [None]:
# Example feature: add 'NPK_Ratio' when the N, P and K columns are all present.
# Note: despite its name, the value is the arithmetic mean of the three columns.
npk_columns = ['N', 'P', 'K']
absent_npk = [name for name in npk_columns if name not in df.columns]

if absent_npk:
    print("\nN, P, or K columns not found for 'NPK_Ratio' feature engineering example.")
else:
    df['NPK_Ratio'] = (df['N'] + df['P'] + df['K']) / 3
    print("\n'NPK_Ratio' feature created. First 5 rows with new feature:")
    display(df[npk_columns + ['NPK_Ratio']].head())

# Example feature: bucket the continuous 'temperature' column into three
# labelled bands. With an integer `bins`, pd.cut splits the observed value
# range into equal-width intervals.
if 'temperature' not in df.columns:
    print("\n'temperature' column not found for binning example.")
else:
    band_labels = ['low', 'medium', 'high']
    df['temperature_band'] = pd.cut(
        df['temperature'],
        bins=len(band_labels),
        labels=band_labels,
    )
    print("\n'temperature_band' feature created. Value counts:")
    band_counts = df['temperature_band'].value_counts()
    print(band_counts)

In [None]:
## 7. Final Data Overview after Initial Phases

In [None]:
# Closing snapshot of `df` after the cleaning and feature-engineering cells:
# a sample of rows followed by the schema report.
print("\nFinal dataset head after initial cleaning and transformation:")
final_preview = df.head()
display(final_preview)

# DataFrame.info() prints directly, so it is called bare rather than displayed.
print("\nFinal dataset info:")
df.info()