# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load data and print a quick textual overview: shape, head, dtypes,
# per-column missing-value counts and summary statistics.
df = pd.read_csv('../data/plant_watering.csv')
print('Dataset shape:', df.shape)
print('\nFirst 5 rows:')
print(df.head())
print('\nDataset info:')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print('\nMissing values:')
print(df.isnull().sum())
print('\nDescriptive statistics:')
print(df.describe())

# Target variable analysis: print the class counts, then draw three
# side-by-side panels (bar chart, per-plant-type count plot, pie chart).
target_counts = df['needs_watering'].value_counts()
print('\nTarget variable distribution:')
print(target_counts)

target_fig, (ax_bar, ax_type, ax_pie) = plt.subplots(1, 3, figsize=(12, 4))

# Panel 1: absolute class counts.
target_counts.plot(kind='bar', ax=ax_bar)
ax_bar.set_title('Target Distribution')

# Panel 2: class counts broken down by plant type; tick labels are tilted.
sns.countplot(data=df, x='plant_type', hue='needs_watering', ax=ax_type)
plt.setp(ax_type.get_xticklabels(), rotation=45)
ax_type.set_title('Plant Type vs Watering Need')

# Panel 3: the same counts expressed as percentages.
target_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax_pie)
ax_pie.set_title('Target Percentage')

plt.tight_layout()
plt.show()

# Numerical features analysis: one box plot per feature, grouped by the
# target class, laid out on a 2x2 grid.
numerical_cols = ['soil_moisture', 'temperature', 'humidity', 'time_since_last_watering']
box_fig, box_axes = plt.subplots(2, 2, figsize=(15, 10))
for panel, feature in zip(box_axes.ravel(), numerical_cols):
    sns.boxplot(data=df, x='needs_watering', y=feature, ax=panel)
    panel.set_title(f'{feature} by Watering Need')
plt.tight_layout()
plt.show()

# Correlation analysis: pairwise correlations between the numerical
# features and the target, rendered as an annotated heatmap.
corr_columns = [*numerical_cols, 'needs_watering']
correlation_matrix = df[corr_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.show()

# Pairplot: scatter matrix of the numerical features, coloured by target class.
sns.pairplot(data=df, vars=numerical_cols, hue='needs_watering')
plt.show()

# Encode categorical data: one-hot encode plant_type, then separate the
# feature matrix from the needs_watering target.
df_encoded = pd.get_dummies(df, columns=['plant_type'])
X = df_encoded.drop('needs_watering', axis=1)
y = df_encoded['needs_watering']

# Split dataset (80% train / 20% test, fixed seed for reproducibility).
# Stratifying on the target keeps the class ratio the same in both parts, so
# the held-out metrics are not skewed by an unlucky split of a possibly
# imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model 1: Random Forest with hyperparameter tuning
# Grid-search over forest size, tree depth and split threshold with 3-fold
# cross-validation, selecting by ROC AUC, then evaluate on the held-out set.
print('\n=== Random Forest Model ===')
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
)
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='roc_auc', cv=3)
rf_grid.fit(X_train, y_train)
print(f'Best RF params: {rf_grid.best_params_}')

# Hard labels feed the classification report; the second predict_proba
# column feeds the AUC computation.
rf_pred = rf_grid.predict(X_test)
rf_proba = rf_grid.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_proba)
print(f'RF AUC: {rf_auc:.3f}')
print(classification_report(y_test, rf_pred))

# Model 2: Logistic Regression with hyperparameter tuning
# The scaler lives inside a Pipeline so that, during cross-validation, it is
# re-fitted on each training fold only. Scaling the whole training set up
# front would let every validation fold influence the scaling statistics.
print('\n=== Logistic Regression Model ===')
from sklearn.pipeline import Pipeline

lr = LogisticRegression(random_state=42, max_iter=1000)
lr_pipeline = Pipeline([('scaler', StandardScaler()), ('lr', lr)])

# Pipeline parameters are addressed as '<step name>__<parameter>'.
lr_params = {
    'lr__C': [0.1, 1, 10, 100],
    'lr__penalty': ['l1', 'l2'],
    'lr__solver': ['liblinear']
}
lr_grid = GridSearchCV(lr_pipeline, lr_params, cv=3, scoring='roc_auc')
lr_grid.fit(X_train, y_train)

# Strip the step prefix so the report shows plain LogisticRegression names.
best_lr_params = {
    name.split('__', 1)[-1]: value
    for name, value in lr_grid.best_params_.items()
}
print(f'Best LR params: {best_lr_params}')

# The fitted pipeline scales internally, so it is given the raw features.
lr_pred = lr_grid.predict(X_test)
print(f'LR AUC: {roc_auc_score(y_test, lr_grid.predict_proba(X_test)[:, 1]):.3f}')
print(classification_report(y_test, lr_pred))

# Keep the standalone scaler and scaled matrices available under their
# original names; the refitted pipeline's scaler was fitted on all of X_train.
scaler = lr_grid.best_estimator_.named_steps['scaler']
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature importance (Random Forest)
# Pair every feature column with the importance reported by the tuned
# forest and order the table from most to least important.
rf_importances = rf_grid.best_estimator_.feature_importances_
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf_importances})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Horizontal bars, one per feature, in the sorted order.
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance (Random Forest)')
plt.tight_layout()
plt.show()

# Save best model
# NOTE(review): the tuned Random Forest is persisted unconditionally here;
# nothing in this step compares it against the logistic-regression results.
import pickle

model_path = '../model.pkl'
best_model = rf_grid.best_estimator_
with open(model_path, mode='wb') as model_file:
    pickle.dump(best_model, model_file)
print('\nBest model saved as model.pkl')