In [None]:
import pandas as pd

# Read the individual CSV files.
# The four inputs live in the same folder, so the folder is named once here;
# pointing the notebook at another copy of the data is a one-line change.
DATA_DIR = 'C:/Users/harsh/OneDrive/Desktop/INTERSHIP1'

df_auto = pd.read_csv(f'{DATA_DIR}/auto_irrigation.csv')
df_smart = pd.read_csv(f'{DATA_DIR}/smart_irrigation.csv')
df_machine = pd.read_csv(f'{DATA_DIR}/irrigation_machine.csv')
df_crop = pd.read_csv(f'{DATA_DIR}/crop_irrigation.csv')

In [None]:
# List every source frame's column names under its own heading so the raw
# headers can be compared before they are harmonised.
labelled_frames = [
    ("Auto irrigation columns:", df_auto),
    ("\nSmart irrigation columns:", df_smart),
    ("\nMachine dataset columns:", df_machine),
    ("\nCrop irrigation columns:", df_crop),
]

for heading, frame in labelled_frames:
    print(heading)
    print(frame.columns.tolist())


In [None]:
# Lookup tables from each source's raw header to the shared column name.
# Keys that are not present in a frame are simply ignored by DataFrame.rename.
auto_column_names = {
    'temp': 'Temperature',
    'humidity': 'Humidity',
    'soil_moisture': 'SoilMoisture',
    'rain': 'Rainfall',
    'time': 'DateTime',
}

smart_column_names = {
    'temp': 'Temperature',
    'soil_moisture': 'SoilMoisture',
    'pump_voltage': 'PumpVoltage',
    'pump_current': 'PumpCurrent',
    'irrigation_time': 'IrrigationDuration',
    'timestamp': 'DateTime',
}

machine_column_names = {
    'voltage': 'PumpVoltage',
    'current': 'PumpCurrent',
    'duration': 'IrrigationDuration',
    'energy': 'Energy_Consumption',
    'time': 'DateTime',
}

crop_column_names = {
    'crop': 'CropType',
    'soil': 'SoilType',
    'temp': 'Temperature',
    'humidity': 'Humidity',
    'irrigation_time': 'IrrigationDuration',
}

# Apply the tables; each call returns a new frame bound to the same name.
df_auto = df_auto.rename(columns=auto_column_names)        # Auto irrigation
df_smart = df_smart.rename(columns=smart_column_names)     # Smart irrigation
df_machine = df_machine.rename(columns=machine_column_names)  # Machine dataset
df_crop = df_crop.rename(columns=crop_column_names)        # Crop irrigation


In [None]:
# Stack the four harmonised frames row-wise under a fresh 0..n-1 index.
source_frames = [df_auto, df_smart, df_machine, df_crop]
df = pd.concat(source_frames, ignore_index=True)

combined_shape = df.shape
print("Combined dataset shape:", combined_shape)
print(df.head())

In [None]:
# Report the frame size on either side of removing fully duplicated rows.
shape_before = df.shape
print("Before removing duplicates:", shape_before)

df = df.drop_duplicates()

shape_after = df.shape
print("After removing duplicates:", shape_after)
print(df.columns)

In [None]:
# Handle missing values safely
num_cols = ['Temperature', 'Humidity', 'SoilMoisture', 'PumpVoltage', 'PumpCurrent', 'IrrigationDuration']
cat_cols = ['SoilType', 'CropType']

# Fill numeric columns with the column mean.
for col in num_cols:
    if col in df.columns:
        # Coerce before averaging: the concatenated sources can leave a column
        # as object dtype (e.g. stray text), and mean() is not reliable on
        # that. Unparseable entries become NaN and are imputed with the rest.
        numeric_values = pd.to_numeric(df[col], errors='coerce')
        column_mean = numeric_values.mean()
        # A column with no valid numbers has a NaN mean; there is nothing
        # meaningful to fill with, so it is left as all-NaN.
        if pd.notna(column_mean):
            numeric_values = numeric_values.fillna(column_mean)
        df[col] = numeric_values

# Fill categorical columns with the most frequent value.
for col in cat_cols:
    if col in df.columns:
        most_common = df[col].mode()
        # mode() is empty when every value is missing; skip instead of
        # failing on the first-element lookup.
        if not most_common.empty:
            df[col] = df[col].fillna(most_common.iloc[0])

print("Missing values handled successfully.")
print(df.columns)


In [None]:
# Example: Convert temperature to Celsius if recorded in Fahrenheit.
# The column checks keep this cell from raising KeyError when a source did
# not provide the column, matching the guards used by the neighbouring cells.
# NOTE(review): the test looks at the column maximum, so a single reading
# above 60 converts every row, including rows that may already be in Celsius.
if 'Temperature' in df.columns and df['Temperature'].max() > 60:  # Assuming values >60 means Fahrenheit
    df['Temperature'] = (df['Temperature'] - 32) * 5 / 9

# Ensure SoilMoisture is in percentage (0–100): a column whose maximum is at
# most 1 is treated as a 0–1 fraction and scaled up.
if 'SoilMoisture' in df.columns and df['SoilMoisture'].max() <= 1:
    df['SoilMoisture'] = df['SoilMoisture'] * 100


In [None]:
# Take the first column whose name mentions "date" or "time" (any casing);
# None when no column qualifies.
datetime_col = next(
    (
        name
        for name in df.columns
        if 'date' in name.lower() or 'time' in name.lower()
    ),
    None,
)

if not datetime_col:
    print(" No datetime-like column found. Skipping datetime conversion.")
else:
    # Values that cannot be parsed become NaT rather than raising.
    df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce')
    print(f" Converted '{datetime_col}' to datetime format.")
print(df.columns)

In [None]:
import pandas as pd

# Parse the 'date' column when the frame has one; bad values become NaT.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Candidate measurement columns that should hold numbers.
possible_numeric = [
    'Temperature',
    'Humidity',
    'SoilMoisture',
    'PumpVoltage',
    'PumpCurrent',
    'IrrigationDuration',
]

# Restrict the candidates to the ones this frame actually contains.
numeric_cols = [name for name in possible_numeric if name in df.columns]

# Coerce each selected column to a numeric dtype; unparseable cells become NaN.
df[numeric_cols] = df[numeric_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

print(df.head())


In [None]:
from sklearn.preprocessing import LabelEncoder

# `encoder` is kept as a name for backward compatibility; after the loop it
# refers to the encoder fitted on the last encoded column (if any).
encoder = LabelEncoder()

# One fitted encoder per column. Refitting a single shared encoder would
# discard the label mapping of every column but the last, making it
# impossible to decode the integers back to the original categories.
label_encoders = {}

# Identify categorical columns to encode
label_cols = ['CropType', 'crop', 'class']

for col in label_cols:
    if col in df.columns and df[col].dtype == 'object':
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        label_encoders[col] = encoder
        print(f" Encoded column: {col}")
    else:
        print(f" Skipped: {col} (not found or already numeric)")

print("All applicable categorical columns encoded successfully!")


In [None]:
# Build the 'time' column that the calendar features below are derived from.
# The rename cell maps the sources' raw time/timestamp headers to 'DateTime',
# so 'time' may be missing from the combined frame or populated only for some
# rows. Prefer 'time' where it has a value and fall back to 'DateTime'.
if 'time' in df.columns:
    event_time = pd.to_datetime(df['time'], errors='coerce')
    if 'DateTime' in df.columns:
        event_time = event_time.fillna(pd.to_datetime(df['DateTime'], errors='coerce'))
elif 'DateTime' in df.columns:
    event_time = pd.to_datetime(df['DateTime'], errors='coerce')
else:
    raise KeyError("Expected a 'time' or 'DateTime' column to derive date features")

df['time'] = event_time
df['Hour'] = df['time'].dt.hour
df['Day'] = df['time'].dt.day
df['Month'] = df['time'].dt.month
df['Year'] = df['time'].dt.year
print(df[['time', 'Hour', 'Day', 'Month', 'Year']].head())


In [None]:
# Populate 'DateTime' from 'time' as before, but keep any timestamps already
# stored in 'DateTime' for rows where 'time' is missing, and do not fail when
# the frame has no 'time' column at all.
if 'time' in df.columns:
    timestamps = pd.to_datetime(df['time'], errors='coerce')
    if 'DateTime' in df.columns:
        timestamps = timestamps.fillna(pd.to_datetime(df['DateTime'], errors='coerce'))
else:
    timestamps = pd.to_datetime(df['DateTime'], errors='coerce')
df['DateTime'] = timestamps

df['DayOfWeek'] = df['DateTime'].dt.day_name()
# NOTE: 'Month' holds month *names* from here on; a numeric 'Month' written by
# an earlier cell is replaced.
df['Month'] = df['DateTime'].dt.month_name()

def get_season(month):
    """Map a calendar month number to a season name.

    Months are grouped as Dec-Feb 'Winter', Mar-May 'Spring', Jun-Aug 'Summer'
    and Sep-Nov 'Autumn'.

    Parameters
    ----------
    month : int or float
        Month number 1-12. Float values such as 7.0 (as produced by
        ``Series.dt.month`` on a column containing NaT) are accepted.

    Returns
    -------
    str or None
        The season name, or None when ``month`` is missing (None/NaN/NaT) or
        outside 1-12, so rows without a valid date are not mislabelled as
        'Autumn'.
    """
    if pd.isna(month):
        return None
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Autumn'
    return None

# Label each row with a season derived from the month number of 'DateTime'.
month_numbers = df['DateTime'].dt.month
df['Season'] = month_numbers.apply(get_season)

preview_columns = ['DateTime', 'DayOfWeek', 'Month', 'Season']
print(df[preview_columns].head())



In [None]:
# Nominal pump settings used only where a reading is unavailable, so real
# measurements loaded from the source files are not overwritten.
# NOTE(review): 220 / 5 / 30 are the constants from the original cell;
# presumably volts, amps and minutes — confirm the units.
pump_defaults = {
    'PumpVoltage': 220,
    'PumpCurrent': 5,
    'IrrigationDuration': 30,
}

for col, default_value in pump_defaults.items():
    if col in df.columns:
        # Keep existing readings; only gaps (or unparseable cells) get the default.
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(default_value)
    else:
        df[col] = default_value

# Estimate: voltage * current * duration / 60.
estimated_energy = df['PumpVoltage'] * df['PumpCurrent'] * df['IrrigationDuration'] / 60

if 'Energy_Consumption' in df.columns:
    # Keep energy values supplied by a source file; estimate only the missing ones.
    df['Energy_Consumption'] = pd.to_numeric(
        df['Energy_Consumption'], errors='coerce'
    ).fillna(estimated_energy)
else:
    df['Energy_Consumption'] = estimated_energy
print(df[['Energy_Consumption']].head())


In [None]:
import pandas as pd
# Sanity check: show what kind of object is about to be written out.
frame_type = type(df)
print(frame_type)

# Write the cleaned frame to CSV in the working directory, without the index.
output_file = 'clean_irrigation_data.csv'
df.to_csv(output_file, index=False)
print(" Clean dataset saved as 'clean_irrigation_data.csv'")
