ModuleNotFoundError: No module named 'skopt'

In [5]:
pip install scikit-optimize


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import os
from typing import Tuple
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from optimization import cost_function, optimize_model, plot_error_distribution
from multilingual_support import assistant
from chat_interface import RealEstateChatbot


def load_data(file_path):
    """Read the cleaned dataset from a CSV file.

    Args:
        file_path: Location of the CSV file, handed to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. Failures are
        reported on stdout instead of being raised.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        print(f"Error loading data: {err}")
        return None
    else:
        print(f"Data loaded successfully. Shape: {frame.shape}")
        return frame

def preprocess_data(df, target_column):
    """
    Prepare the dataset for modelling.

    Steps:
    1. Add the derived ``prix_per_surface`` feature (only when it does not
       leak the target).
    2. Separate features and target.
    3. Drop redundant room-count columns.
    4. Build the preprocessing ``ColumnTransformer``.
    5. Split into train/test sets (80%/20%, ``random_state=42``).

    Args:
        df: Input DataFrame. It is copied, so the caller's frame is never
            modified.
        target_column: Name of the column to predict.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, preprocessor)`` where
        ``preprocessor`` is an unfitted ``ColumnTransformer`` that scales the
        int64/float64 columns and one-hot encodes the object/category columns.

    Raises:
        KeyError: If ``target_column`` is missing, or if ``prix``/``surface``
            are missing when the derived feature is computed.
    """
    # Work on a copy so adding the derived column does not mutate the
    # caller's DataFrame.
    df = df.copy()

    # 'prix_per_surface' is computed from 'prix'. When 'prix' is itself the
    # target, this feature multiplied by 'surface' reproduces the target
    # exactly (target leakage) and it cannot be computed at prediction time,
    # so it is only added for other targets.
    if target_column != 'prix':
        df['prix_per_surface'] = df['prix'] / df['surface']

    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Drop redundant columns if needed. Only columns that actually exist are
    # dropped, so a dataset without 'salles_de_bains' no longer raises.
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    if 'chambres' in numerical_cols and 'bedrooms' in numerical_cols:
        redundant = [col for col in ('chambres', 'salles_de_bains') if col in X.columns]
        X = X.drop(columns=redundant)

    # Identify numerical and categorical columns on the final feature frame,
    # so neither list can reference a column that was just dropped.
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns

    # Create preprocessing transformers
    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # Bundle preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols),
        ])

    # Split data into training and test sets (80%/20%)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, preprocessor

def build_model(preprocessor):
    """Assemble the modelling pipeline.

    Args:
        preprocessor: Transformer used as the first pipeline step.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'preprocessor'`` and
        ``'regressor'`` (a 100-tree ``RandomForestRegressor``).
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    stages = [
        ('preprocessor', preprocessor),
        ('regressor', forest),
    ]
    return Pipeline(steps=stages)

def train_and_evaluate(model, X_train, X_test, y_train, y_test,
                       large_error_threshold=200000):
    """Train the model and evaluate its performance.

    Fits ``model`` on the training data, prints RMSE and R² for both the
    training and the test set, and reports how many test predictions miss
    the actual value by more than ``large_error_threshold``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target values.
        y_test: Test target values.
        large_error_threshold: Absolute residual (in target units) above
            which a prediction counts as a large error. Defaults to the
            previously hard-coded value of 200000.

    Returns:
        A tuple ``(model, test_preds, residuals)`` where ``residuals`` is
        ``test_preds - y_test``.
    """
    print("\nTraining the model...")
    model.fit(X_train, y_train)

    # Make predictions
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    # Evaluate performance
    train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    train_r2 = r2_score(y_train, train_preds)
    test_r2 = r2_score(y_test, test_preds)

    print("\nModel Performance:")
    print(f"Training RMSE: {train_rmse:.2f}")
    print(f"Testing RMSE: {test_rmse:.2f}")
    print(f"Training R²: {train_r2:.4f}")
    print(f"Testing R²: {test_r2:.4f}")

    # Analyze residuals (predicted minus actual)
    residuals = test_preds - y_test
    large_errors = residuals[np.abs(residuals) > large_error_threshold]
    if len(large_errors) > 0:
        print(f"\nLarge prediction errors found: {len(large_errors)} samples")

    return model, test_preds, residuals

def save_model(model, file_path):
    """Save the trained model to disk with joblib.

    The parent directory of ``file_path`` is created when it does not exist.
    Errors are printed rather than raised, so a failed save does not abort
    the caller.

    Args:
        model: Object to serialize.
        file_path: Destination path. May be a bare filename, in which case
            the file is written to the current working directory.
    """
    try:
        # os.path.dirname returns '' for a bare filename and os.makedirs('')
        # raises, so only create the directory when there is one to create.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        joblib.dump(model, file_path)
        print(f"\nModel saved to {file_path}")
    except Exception as e:
        print(f"Error saving model: {e}")

def generate_conversation(predicted_price: float, features: list) -> str:
    """Run one question/answer turn about a prediction.

    Prompts the user on stdin. Typing ``exit`` (any letter case) ends the
    conversation; any other input is forwarded to the assistant together
    with the predicted price and the first three entries of ``features``.

    Args:
        predicted_price: Price passed to the assistant as context.
        features: Feature names; only the first three are forwarded.

    Returns:
        ``"Conversation ended"`` when the user typed ``exit``, otherwise
        whatever ``assistant.generate_response`` returns.
    """
    print("\nAvailable languages: English (en), French (fr), Arabic (ar)")
    question = input("Ask about the prediction (type 'exit' to end): ")

    if question.lower() != 'exit':
        prediction_context = {
            'predicted_price': predicted_price,
            'important_features': features[:3],  # Top 3 features
        }
        return assistant.generate_response(question, prediction_context)

    return "Conversation ended"
def chat_interface(model_path: str):
    """Run the interactive price-prediction chat loop.

    Builds a ``RealEstateChatbot`` from ``model_path`` and keeps reading
    lines from stdin until the user types ``quit`` or ``exit``. Each message
    that yields a truthy price is answered through the assistant; messages
    without a usable price are skipped silently.

    Args:
        model_path: Path passed to the ``RealEstateChatbot`` constructor.
    """
    chatbot = RealEstateChatbot(model_path)

    print("\n🏠 Real Estate Chatbot (type 'quit' to exit)")
    print("Example input: '3 bedroom apartment with 120m² in Casablanca'")

    stop_words = ('quit', 'exit')
    while True:
        message = input("\nYou: ")
        if message.lower() in stop_words:
            return

        estimate = chatbot.predict_from_text(message)
        if not estimate:
            # No usable price for this message: ask again.
            continue

        reply = assistant.generate_response(message, {
            'predicted_price': estimate,
            'input_features': chatbot.extract_features(message),
        })
        print(f"AI: {reply}")

def main():
    """Run the full workflow: load, split, tune, evaluate, save and report.

    Steps:
    1. Load the cleaned CSV; stop if it cannot be read.
    2. Split the data and build the preprocessor.
    3. Obtain a model from ``optimize_model`` (30 iterations) and fit and
       evaluate it with ``train_and_evaluate``.
    4. Print training/testing cost, save the model, and write the
       per-sample prediction results next to the model file.
    5. Plot the error distribution and, when the regressor exposes
       ``feature_importances_``, the ten largest importances.
    6. Optionally start the interactive chat interface.
    """
    # Configuration
    data_path = r'C:\Users\hp\Documents\INOCOD\Advanced EvalioIA\data\data_cleaned_no_outliers.csv'
    model_path = r'C:\Users\hp\Documents\INOCOD\Advanced EvalioIA\models\real_estate_model.joblib'
    target_column = 'prix'

    # Load data
    df = load_data(data_path)
    if df is None:
        return

    # Preprocess data and split into train/test
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(df, target_column)

    print("\nData Split Summary:")
    print(f"Training set size: {len(X_train)} samples")
    print(f"Test set size: {len(X_test)} samples")

    # Build and optimize model
    print("\nOptimizing model hyperparameters...")
    optimized_model = optimize_model(X_train, y_train, preprocessor, n_iter=30)  # Increased iterations

    # Train and evaluate final model
    trained_model, test_predictions, residuals = train_and_evaluate(
        optimized_model, X_train, X_test, y_train, y_test)

    # Calculate and print cost
    train_cost = cost_function(trained_model, X_train, y_train)
    test_cost = cost_function(trained_model, X_test, y_test)
    print(f"\nTraining Cost: {train_cost:.2f}")
    print(f"Testing Cost: {test_cost:.2f}")

    # Save the trained model
    save_model(trained_model, model_path)

    # Create a DataFrame with actual and predicted values
    results_df = pd.DataFrame({
        'Actual': y_test,
        'Predicted': test_predictions,
        'Difference': residuals,
        'Percentage_Difference': (residuals / y_test) * 100,
    })

    print("\nSample Predictions vs Actual:")
    print(results_df.head(10).to_string())

    # Save results for analysis, next to the model file
    results_path = os.path.join(os.path.dirname(model_path), 'prediction_results.csv')
    results_df.to_csv(results_path, index=False)
    print(f"\nPrediction results saved to {results_path}")

    # Plot error distribution
    plot_error_distribution(y_test, test_predictions)

    # Plot feature importances if the regressor provides them
    regressor = trained_model.named_steps['regressor']
    if hasattr(regressor, 'feature_importances_'):
        try:
            # Names are listed numeric first, then one-hot encoded, which is
            # the transformer order used by preprocess_data.
            feature_names = []
            feature_names.extend(X_train.select_dtypes(include=['int64', 'float64']).columns)
            ohe = trained_model.named_steps['preprocessor'].named_transformers_['cat']
            cat_features = ohe.get_feature_names_out(
                X_train.select_dtypes(include=['object', 'category']).columns)
            feature_names.extend(cat_features)

            importances = regressor.feature_importances_
            indices = np.argsort(importances)[-10:]  # Top 10 features

            plt.figure(figsize=(10, 6))
            plt.title('Top 10 Feature Importances')
            plt.barh(range(len(indices)), importances[indices], align='center')
            plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
            plt.xlabel('Relative Importance')
            plt.tight_layout()
            plt.show()
        except Exception as e:
            # Plotting is best-effort; a failure must not abort the run.
            print(f"\nCould not plot feature importances: {e}")

    # Offer the interactive chat once the model has been saved.
    # chat_interface accepts only the model path; passing the column list as
    # a second argument raised TypeError.
    if input("\nStart chat interface? (y/n): ").lower() == 'y':
        chat_interface(model_path)

# Entry point: run the workflow only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'skopt'

In [12]:
pip install scikit-optimize


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [13]:
from skopt import BayesSearchCV


ModuleNotFoundError: No module named 'skopt'

In [14]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from optimization import cost_function, optimize_model, plot_error_distribution 

def load_data(file_path):
    """Load the cleaned dataset from ``file_path``.

    Args:
        file_path: CSV file to read with ``pandas.read_csv``.

    Returns:
        The DataFrame on success; ``None`` when loading fails, in which case
        the error is printed instead of being raised.
    """
    dataset = None
    try:
        dataset = pd.read_csv(file_path)
        rows_and_columns = dataset.shape
        print(f"Data loaded successfully. Shape: {rows_and_columns}")
    except Exception as problem:
        print(f"Error loading data: {problem}")
        dataset = None
    return dataset

def preprocess_data(df, target_column):
    """
    Split the dataset and build its preprocessing transformer.

    The function:
    1. Separates the features from ``target_column``.
    2. Builds a ``ColumnTransformer`` that standard-scales the int64/float64
       columns and one-hot encodes the object/category columns (unknown
       categories are ignored at transform time).
    3. Splits features and target 80%/20% with ``random_state=42``.

    Returns:
        ``(X_train, X_test, y_train, y_test, preprocessor)``; the
        preprocessor is returned unfitted.
    """
    features = df.drop(columns=[target_column])
    target = df[target_column]

    # Column groups are chosen by dtype.
    numeric_names = features.select_dtypes(include=['int64', 'float64']).columns
    categorical_names = features.select_dtypes(include=['object', 'category']).columns

    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_names),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_names),
    ])

    # train_test_split yields X_train, X_test, y_train, y_test in that order.
    split_parts = train_test_split(features, target, test_size=0.2, random_state=42)
    return (*split_parts, preprocessor)

def build_model(preprocessor):
    """Create the two-step pipeline: preprocessing, then a random forest.

    Args:
        preprocessor: Transformer placed in front of the regressor.

    Returns:
        An unfitted ``Pipeline`` whose steps are named ``'preprocessor'``
        and ``'regressor'``.
    """
    forest_settings = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
    regressor = RandomForestRegressor(**forest_settings)
    return Pipeline(steps=[('preprocessor', preprocessor), ('regressor', regressor)])

def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` and print RMSE and R² for the train and test sets.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target values.
        y_test: Test target values.

    Returns:
        A tuple ``(model, test_preds)`` with the fitted model and its
        predictions for ``X_test``.
    """
    print("\nTraining the model...")
    model.fit(X_train, y_train)

    fitted_train = model.predict(X_train)
    fitted_test = model.predict(X_test)

    rmse_train = np.sqrt(mean_squared_error(y_train, fitted_train))
    rmse_test = np.sqrt(mean_squared_error(y_test, fitted_test))
    r2_train = r2_score(y_train, fitted_train)
    r2_test = r2_score(y_test, fitted_test)

    report_lines = (
        "\nModel Performance:",
        f"Training RMSE: {rmse_train:.2f}",
        f"Testing RMSE: {rmse_test:.2f}",
        f"Training R²: {r2_train:.4f}",
        f"Testing R²: {r2_test:.4f}",
    )
    for line in report_lines:
        print(line)

    return model, fitted_test

def save_model(model, file_path):
    """Save the trained model to disk with joblib.

    The parent directory of ``file_path`` is created when it does not exist.
    Errors are printed rather than raised, so a failed save does not abort
    the caller.

    Args:
        model: Object to serialize.
        file_path: Destination path. May be a bare filename, in which case
            the file is written to the current working directory.
    """
    try:
        # os.path.dirname returns '' for a bare filename and os.makedirs('')
        # raises, so only create the directory when there is one to create.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        joblib.dump(model, file_path)
        print(f"\nModel saved to {file_path}")
    except Exception as e:
        print(f"Error saving model: {e}")

def main():
    """Train, evaluate and save the price model, then report the results.

    The run stops early when the dataset cannot be loaded. Otherwise it
    splits the data, obtains a model from ``optimize_model``, fits and
    evaluates it, prints the training/testing cost, saves the model, prints
    the first ten predictions next to the actual values, plots the error
    distribution and, if the regressor exposes ``feature_importances_``,
    plots the ten largest importances.
    """
    # Configuration
    target_column = 'prix'
    data_path = r'C:\Users\hp\Documents\INOCOD\Advanced EvalioIA\data\data_cleaned_no_outliers.csv'
    model_path = r'C:\Users\hp\Documents\INOCOD\Advanced EvalioIA\models\real_estate_model.joblib'

    df = load_data(data_path)
    if df is None:
        return

    # Train/test split plus the (unfitted) preprocessing transformer
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(df, target_column)

    print("\nData Split Summary:")
    print(f"Training set size: {len(X_train)} samples")
    print(f"Test set size: {len(X_test)} samples")

    # Hyperparameter search, then the final fit and evaluation
    print("\nOptimizing model hyperparameters...")
    tuned_model = optimize_model(X_train, y_train, preprocessor)
    trained_model, test_predictions = train_and_evaluate(
        tuned_model, X_train, X_test, y_train, y_test)

    # Cost on both splits
    train_cost = cost_function(trained_model, X_train, y_train)
    test_cost = cost_function(trained_model, X_test, y_test)
    print(f"\nTraining Cost: {train_cost:.2f}")
    print(f"Testing Cost: {test_cost:.2f}")

    save_model(trained_model, model_path)

    # Per-sample comparison of actual and predicted values, alongside the
    # descriptive listing columns
    prediction_error = test_predictions - y_test
    report_columns = {
        name: X_test[name] for name in ('titre', 'surface', 'location', 'city')
    }
    report_columns['Actual'] = y_test
    report_columns['Predicted'] = test_predictions
    report_columns['Difference'] = prediction_error
    report_columns['Percentage_Difference'] = (prediction_error / y_test) * 100
    results_df = pd.DataFrame(report_columns)

    print("\nSample Predictions vs Actual:")
    print(results_df.head(10))

    plot_error_distribution(y_test, test_predictions)

    # Feature importances are plotted only when the regressor provides them
    regressor = trained_model.named_steps['regressor']
    if not hasattr(regressor, 'feature_importances_'):
        return

    try:
        # Feature names after preprocessing: numeric columns first, then the
        # one-hot encoded categorical columns
        numeric_names = X_train.select_dtypes(include=['int64', 'float64']).columns
        encoder = trained_model.named_steps['preprocessor'].named_transformers_['cat']
        categorical_names = X_train.select_dtypes(include=['object', 'category']).columns
        feature_names = [*numeric_names, *encoder.get_feature_names_out(categorical_names)]

        importances = regressor.feature_importances_
        top_indices = np.argsort(importances)[-10:]  # Top 10 features
        bar_positions = range(len(top_indices))
        bar_labels = [feature_names[i] for i in top_indices]

        plt.figure(figsize=(10, 6))
        plt.title('Top 10 Feature Importances')
        plt.barh(bar_positions, importances[top_indices], align='center')
        plt.yticks(bar_positions, bar_labels)
        plt.xlabel('Relative Importance')
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"\nCould not plot feature importances: {e}")

# Entry point: run the workflow only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'skopt'