# In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
import requests
import warnings
warnings.filterwarnings('ignore')

class UrbanLandCoverClassifier:
    """
    Urban Land Cover Classification using multiple ML algorithms
    """

    def __init__(self):
        """
        Set up an empty classification workflow.

        All data attributes start as ``None`` and are populated later:
        ``self.data`` when a dataset is loaded or generated, and the
        feature/target/partition attributes by ``preprocess_data``.
        ``self.models`` and ``self.results`` start empty and are filled by
        ``train_models``.
        """
        # Full dataset (feature columns plus the target column).
        self.data = None

        # Features, raw target, and integer-encoded target.
        self.X = None
        self.y = None
        self.y_encoded = None

        # Train/test partitions of the features and encoded target.
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        # Standardized versions of the partitions; declared here so the
        # attribute exists (as None) even before preprocess_data() has run.
        self.X_train_scaled = None
        self.X_test_scaled = None

        # Fitted during preprocess_data().
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

        # name -> estimator, and name -> dict of evaluation results.
        self.models = {}
        self.results = {}

    def load_data(self):
        """
        Load the Urban Land Cover dataset into ``self.data``.

        First tries to read the CSV from the UCI ML Repository URL. If that
        read fails, the reason is printed and a synthetic dataset is
        generated with ``create_sample_data`` instead. On success a summary
        is printed via ``display_dataset_info``.

        Any other error raised along the way is reported and also answered
        by generating the synthetic dataset, so the method itself does not
        raise ordinary exceptions.
        """
        try:
            # UCI ML Repository URL for Urban Land Cover dataset
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00295/urban_land_cover.csv"

            print("Loading Urban Land Cover dataset...")
            try:
                self.data = pd.read_csv(url)
            except Exception as load_error:
                # `Exception` rather than a bare `except` so that Ctrl-C and
                # SystemExit are not swallowed while the download is running.
                print(f"Could not read dataset from URL: {load_error}")
                print("Creating sample dataset with similar structure...")
                self.create_sample_data()

            print(f"Dataset loaded successfully! Shape: {self.data.shape}")
            self.display_dataset_info()

        except Exception as e:
            # Best-effort fallback: whatever failed above (including the
            # summary printout), end up with a usable synthetic dataset.
            print(f"Error loading data: {e}")
            print("Creating sample dataset...")
            self.create_sample_data()

    def create_sample_data(self):
        """
        Create a sample dataset similar to Urban Land Cover dataset
        Features typically include spectral, spatial, and textural information
        """
        np.random.seed(42)
        n_samples = 675  # Similar to original dataset size

        # Generate synthetic features similar to urban land cover data
        # Spectral features (like satellite imagery bands)
        spectral_features = np.random.normal(0, 1, (n_samples, 9))

        # Size and shape features
        size_shape_features = np.random.exponential(2, (n_samples, 3))

        # Texture features
        texture_features = np.random.gamma(2, 1, (n_samples, 36))

        # Combine all features
        features = np.hstack([spectral_features, size_shape_features, texture_features])

        # Create feature names
        feature_names = (
            [f'spectral_{i}' for i in range(9)] +
            ['size_feature_1', 'size_feature_2', 'shape_compactness'] +
            [f'texture_{i}' for i in range(36)]
        )

        # Generate target classes (9 classes as in original dataset)
        class_names = ['asphalt', 'concrete', 'glass', 'grass', 'painted_metal',
                      'red_roof', 'shadow', 'soil', 'tree']

        # Create realistic class distribution
        class_probs = [0.15, 0.12, 0.08, 0.18, 0.10, 0.12, 0.05, 0.10, 0.10]
        y = np.random.choice(class_names, n_samples, p=class_probs)

        # Create DataFrame
        self.data = pd.DataFrame(features, columns=feature_names)
        self.data['class'] = y

        print("Sample dataset created with 48 features and 9 classes")

    def display_dataset_info(self):
        """
        Display information about the dataset
        """
        print("\n" + "="*50)
        print("DATASET INFORMATION")
        print("="*50)
        print(f"Dataset shape: {self.data.shape}")
        print(f"Features: {self.data.shape[1] - 1}")
        print(f"Samples: {self.data.shape[0]}")

        if 'class' in self.data.columns:
            print(f"\nClass distribution:")
            print(self.data['class'].value_counts().sort_index())

        print(f"\nFirst few rows:")
        print(self.data.head())

        print(f"\nDataset info:")
        print(self.data.info())

    def preprocess_data(self):
        """
        Preprocess the data: encode labels, split data, handle missing
        values, and scale features.

        Reads ``self.data`` and sets ``self.X``, ``self.y``,
        ``self.y_encoded``, the train/test partitions, and their
        standardized versions ``self.X_train_scaled`` / ``self.X_test_scaled``.

        The target is the 'class' column when present, otherwise the last
        column. The split is 80/20, stratified on the encoded labels, with a
        fixed random state. Missing feature values are replaced by column
        means computed on the training partition only, so no statistics from
        the test partition leak into training.
        """
        print("\n" + "="*50)
        print("DATA PREPROCESSING")
        print("="*50)

        # Separate features and target
        if 'class' in self.data.columns:
            self.X = self.data.drop('class', axis=1)
            self.y = self.data['class']
        else:
            # Assume last column is target
            self.X = self.data.iloc[:, :-1]
            self.y = self.data.iloc[:, -1]

        # Encode labels
        self.y_encoded = self.label_encoder.fit_transform(self.y)

        # Split the data before imputing so that the imputation statistics
        # can be derived from the training partition alone.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y_encoded, test_size=0.2, random_state=42,
            stratify=self.y_encoded
        )

        # Handle missing values
        missing_values = self.X.isnull().sum().sum()
        if missing_values > 0:
            print(f"Handling {missing_values} missing values...")
            # numeric_only avoids a failure when a non-numeric column is present.
            train_means = self.X_train.mean(numeric_only=True)
            self.X_train = self.X_train.fillna(train_means)
            self.X_test = self.X_test.fillna(train_means)
            # Keep the full feature frame free of gaps as well, using the
            # same training-derived values.
            self.X = self.X.fillna(train_means)

        # Scale the features (fit on train only, then apply to test)
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        print(f"Training set size: {self.X_train.shape}")
        print(f"Test set size: {self.X_test.shape}")
        print(f"Number of classes: {len(np.unique(self.y_encoded))}")
        print("Data preprocessing completed!")

    def perform_eda(self):
        """
        Perform Exploratory Data Analysis

        Shows two figures. The first is a 2x3 grid with the class
        distribution, a correlation heatmap of the first (up to) ten
        features, and histograms of the first (up to) four features. The
        second shows the cumulative explained variance of a PCA fitted on
        the scaled training features, and a scatter of the first two
        principal components colored by encoded class.

        Reads ``self.X``, ``self.y``, ``self.X_train_scaled`` and
        ``self.y_train``, which ``preprocess_data`` sets.
        """
        print("\n" + "="*50)
        print("EXPLORATORY DATA ANALYSIS")
        print("="*50)

        plt.figure(figsize=(15, 10))

        # Class distribution
        plt.subplot(2, 3, 1)
        self.y.value_counts().plot(kind='bar')
        plt.title('Class Distribution')
        plt.xticks(rotation=45)

        # Feature correlation heatmap (subset of features)
        plt.subplot(2, 3, 2)
        # Select first 10 features for visualization
        corr_matrix = self.X.iloc[:, :10].corr()
        sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0)
        plt.title('Feature Correlation (First 10 Features)')

        # Feature distributions: grid slots 3-6, one per feature. Capped at
        # the number of available columns so narrow datasets do not raise
        # an IndexError.
        histogram_count = min(4, self.X.shape[1])
        for i in range(histogram_count):
            plt.subplot(2, 3, i+3)
            self.X.iloc[:, i].hist(bins=20)
            plt.title(f'Distribution: {self.X.columns[i]}')

        plt.tight_layout()
        plt.show()

        # PCA Analysis
        pca = PCA()
        X_pca = pca.fit_transform(self.X_train_scaled)

        plt.figure(figsize=(12, 4))

        plt.subplot(1, 2, 1)
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.xlabel('Number of Components')
        plt.ylabel('Cumulative Explained Variance Ratio')
        plt.title('PCA - Explained Variance')
        plt.grid(True)

        plt.subplot(1, 2, 2)
        # A second component only exists when PCA produced at least two.
        if X_pca.shape[1] >= 2:
            plt.scatter(X_pca[:, 0], X_pca[:, 1], c=self.y_train, alpha=0.6)
            plt.colorbar()
        plt.xlabel('First Principal Component')
        plt.ylabel('Second Principal Component')
        plt.title('PCA - First Two Components')

        plt.tight_layout()
        plt.show()

    def train_models(self):
        """
        Fit and score every candidate classifier.

        Rebuilds ``self.models`` with five estimators, fits each on the
        scaled training data, predicts on the scaled test data, and runs
        5-fold cross-validation on the training data. For each model a
        record with the fitted estimator, test accuracy, cross-validation
        mean and standard deviation, and the test predictions is stored in
        ``self.results`` under the model's name.
        """
        banner = "=" * 50
        print("\n" + banner)
        print("MODEL TRAINING")
        print(banner)

        # Candidate estimators, in evaluation order.
        candidates = [
            ('Random Forest',
             RandomForestClassifier(n_estimators=100, random_state=42)),
            ('SVM',
             SVC(kernel='rbf', random_state=42)),
            ('Logistic Regression',
             LogisticRegression(random_state=42, max_iter=1000)),
            ('Gradient Boosting',
             GradientBoostingClassifier(random_state=42)),
            ('Neural Network',
             MLPClassifier(hidden_layer_sizes=(100, 50), random_state=42, max_iter=500)),
        ]
        self.models = dict(candidates)

        for model_name, estimator in self.models.items():
            print(f"\nTraining {model_name}...")

            # Fit on the training partition, then score on the held-out one.
            estimator.fit(self.X_train_scaled, self.y_train)
            test_predictions = estimator.predict(self.X_test_scaled)
            test_accuracy = accuracy_score(self.y_test, test_predictions)

            # 5-fold cross-validation on the training partition.
            fold_scores = cross_val_score(
                estimator, self.X_train_scaled, self.y_train, cv=5
            )
            fold_mean = fold_scores.mean()
            fold_std = fold_scores.std()

            self.results[model_name] = dict(
                model=estimator,
                accuracy=test_accuracy,
                cv_mean=fold_mean,
                cv_std=fold_std,
                predictions=test_predictions,
            )

            print(f"Accuracy: {test_accuracy:.4f}")
            print(f"CV Mean: {fold_mean:.4f} (+/- {fold_std * 2:.4f})")

    def hyperparameter_tuning(self):
        """
        Perform hyperparameter tuning for the best model

        Picks the entry of ``self.results`` with the highest test accuracy.
        When that entry is 'Random Forest', a 5-fold grid search over
        ``n_estimators``, ``max_depth`` and ``min_samples_split`` is run on
        the scaled training data, and the refitted best estimator is stored
        in ``self.results['Best Tuned Model']`` with the same keys the other
        result entries carry. If no models have been trained yet, a message
        is printed and nothing else happens.
        """
        print("\n" + "="*50)
        print("HYPERPARAMETER TUNING")
        print("="*50)

        if not self.results:
            # max() over an empty mapping would raise an unhelpful ValueError.
            print("No trained models found. Run train_models() first.")
            return

        # Find the best performing model
        # NOTE(review): selection is based on held-out test accuracy; using
        # 'cv_mean' instead would avoid tuning toward the test set - confirm
        # which is intended.
        best_model_name = max(self.results, key=lambda k: self.results[k]['accuracy'])
        print(f"Best model: {best_model_name}")

        # Define hyperparameters for Random Forest (as an example)
        if best_model_name == 'Random Forest':
            param_grid = {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            }

            grid_search = GridSearchCV(
                RandomForestClassifier(random_state=42),
                param_grid, cv=5, scoring='accuracy', n_jobs=-1
            )

            print("Performing Grid Search...")
            grid_search.fit(self.X_train_scaled, self.y_train)

            print(f"Best parameters: {grid_search.best_params_}")
            print(f"Best CV score: {grid_search.best_score_:.4f}")

            # Predict once and reuse the result for both accuracy and storage.
            tuned_predictions = grid_search.predict(self.X_test_scaled)
            best_index = grid_search.best_index_

            # Update the best model; cv_mean/cv_std are included so this
            # entry has the same keys as the ones written by train_models().
            self.results['Best Tuned Model'] = {
                'model': grid_search.best_estimator_,
                'accuracy': accuracy_score(self.y_test, tuned_predictions),
                'cv_mean': grid_search.best_score_,
                'cv_std': grid_search.cv_results_['std_test_score'][best_index],
                'predictions': tuned_predictions
            }