# Land Cover Classification using NDVI Time-Series Data

This notebook implements a solution for the Summer Analytics 2025 Hackathon challenge to classify land cover types using NDVI time-series data from satellite imagery.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from scipy import stats
from sklearn.utils import class_weight

## 1. Data Loading and Exploration

In [None]:
# Read the training and test tables from the competition input directory
traindf, testdf = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv",
        "/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv",
    )
)

# Report the (rows, columns) size of each table
print("Training data shape:", traindf.shape)
print("Test data shape:", testdf.shape)

# Preview the first training rows (last expression -> rich display)
traindf.head()

In [None]:
# Report how many values are missing in each training column
print("Missing values in training data:")
print(traindf.isnull().sum())

# Count the samples available for each land cover class
class_counts = traindf['class'].value_counts()
print("\nClass distribution:")
print(class_counts)

# Bar chart of the class counts, using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 6))
class_counts.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Land Cover Class Distribution')
ax.set_xlabel('Land Cover Type')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Drop the unnecessary 'Unnamed: 0' column (presumably a row index that was
# written into the CSV files -- confirm against the raw data).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time, when the column is
# already gone, no longer raises a KeyError.
traindf = traindf.drop(columns=['Unnamed: 0'], errors='ignore')
testdf = testdf.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Feature engineering - create statistical features from NDVI time series
def create_features(df):
    """Return a copy of ``df`` extended with summary features of its NDVI columns.

    NDVI columns are all columns whose name contains ``'_N'``. The input frame
    is not modified.

    Added columns (all computed row-wise):
      * ``ndvi_mean``, ``ndvi_std``, ``ndvi_min``, ``ndvi_max``, ``ndvi_range``
        over every NDVI column;
      * ``first_half_mean``, ``second_half_mean`` and their difference
        ``half_diff`` (second minus first), where the NDVI columns are split
        in column order at ``len // 2``;
      * ``q1_mean`` .. ``q4_mean`` when at least four NDVI columns exist; the
        first three quarters hold ``len // 4`` columns each and the fourth
        takes all remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the NDVI columns (other columns are carried over
        unchanged).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the new features.
    """
    ndvi_cols = [name for name in df.columns if '_N' in name]
    ndvi_values = df[ndvi_cols]

    # Work on a copy so the caller's frame stays untouched
    enriched = df.copy()

    # Whole-series statistics
    enriched['ndvi_mean'] = ndvi_values.mean(axis=1)
    enriched['ndvi_std'] = ndvi_values.std(axis=1)
    enriched['ndvi_min'] = ndvi_values.min(axis=1)
    enriched['ndvi_max'] = ndvi_values.max(axis=1)
    enriched['ndvi_range'] = enriched['ndvi_max'] - enriched['ndvi_min']

    # Temporal change: mean of the later half minus mean of the earlier half
    n_cols = len(ndvi_cols)
    midpoint = n_cols // 2
    enriched['first_half_mean'] = df[ndvi_cols[:midpoint]].mean(axis=1)
    enriched['second_half_mean'] = df[ndvi_cols[midpoint:]].mean(axis=1)
    enriched['half_diff'] = enriched['second_half_mean'] - enriched['first_half_mean']

    # Quarterly means, only when there are enough time points
    if n_cols >= 4:
        step = n_cols // 4
        quarters = {
            'q1_mean': ndvi_cols[:step],
            'q2_mean': ndvi_cols[step:2 * step],
            'q3_mean': ndvi_cols[2 * step:3 * step],
            'q4_mean': ndvi_cols[3 * step:],  # last quarter absorbs the remainder
        }
        for feature_name, quarter_cols in quarters.items():
            if quarter_cols:
                enriched[feature_name] = df[quarter_cols].mean(axis=1)

    return enriched

# Apply feature engineering
train_features = create_features(traindf)
test_features = create_features(testdf)

# Display the new features
print("New features added:")
new_features = [col for col in train_features.columns if col not in traindf.columns]
print(new_features)

# Handle missing values with median imputation.
# The medians are computed from the training data only and reused for the
# test set, so a given feature is filled with the same value in both sets
# (filling the test set with its own medians would make the two sets
# inconsistent, and would leave NaNs behind for any test column that is
# entirely missing). Entries of `train_medians` for columns that do not exist
# in the test frame are simply ignored by fillna.
train_medians = train_features.median(numeric_only=True)
train_features = train_features.fillna(train_medians)
test_features = test_features.fillna(train_medians)

# Verify no missing values remain
print("\nMissing values after imputation:")
print(train_features.isnull().sum().sum())

## 3. Visualize NDVI Patterns by Land Cover Class

In [None]:
# Columns holding the NDVI observations (their names contain '_N')
ndvi_cols = [col for col in traindf.columns if '_N' in col]

# One line per class: the mean NDVI value at each time point
fig, ax = plt.subplots(figsize=(15, 10))
time_points = range(len(ndvi_cols))
for class_name in traindf['class'].unique():
    in_class = traindf['class'] == class_name
    class_data = traindf.loc[in_class, ndvi_cols].mean()
    ax.plot(time_points, class_data, label=class_name, linewidth=2)

ax.set_title('Average NDVI Time Series by Land Cover Class')
ax.set_xlabel('Time Point')
ax.set_ylabel('NDVI Value')
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## 4. Model Building with Logistic Regression

In [None]:
# Prepare data for modeling: every column except the target and the ID
X = train_features.drop(columns=['class', 'ID'])
y = train_features['class']

# Split data into 75% training / 25% validation.
# The split is stratified so that each class keeps the same proportion in both
# parts; with an unstratified split a rare class can end up missing from the
# validation set. Stratification needs at least two samples per class, so fall
# back to a plain random split if that does not hold.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=stratify_labels
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")

In [None]:
# Create a pipeline with preprocessing and model: median imputation, then
# standardization, then logistic regression.
# `multi_class='multinomial'` is intentionally not passed: the parameter is
# deprecated in recent scikit-learn releases, and with the 'newton-cg' solver
# a target with three or more classes is already fitted as a multinomial model
# by default.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000, solver='newton-cg'))
])

# Define parameters for grid search (keys use the '<step>__<param>' convention)
param_grid = {
    'classifier__C': [1, 1.5, 2, 3, 4],
    'classifier__tol': [0.0001, 0.00001],
    'classifier__class_weight': [None, 'balanced']
}

# Create grid search: 5-fold cross-validation scored by accuracy, using all
# available CPU cores
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameter combination and its mean CV score
for caption, value in (
    ("Best parameters:", grid_search.best_params_),
    ("Best cross-validation score:", grid_search.best_score_),
):
    print(caption, value)

## 5. Model Evaluation

In [None]:
# Evaluate on validation set
y_pred = grid_search.predict(X_val)
print("Classification Report on Validation Set:")
print(classification_report(y_val, y_pred))

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Plot confusion matrix.
# The same label list is used both to build the matrix and to label its axes.
# Without `labels=`, confusion_matrix only covers classes that occur in y_val
# or y_pred, so a class absent from both would shift the matrix relative to
# tick labels taken from the full target.
class_labels = np.unique(y)
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_val, y_pred, labels=class_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

## 6. Feature Importance Analysis

In [None]:
# Analyze feature importance using the coefficients of the best classifier
best_model = grid_search.best_estimator_.named_steps['classifier']
feature_names = X.columns
coefficients = best_model.coef_
classes = best_model.classes_

# coef_ normally has one row per class. For a two-class problem it has a single
# row, which describes the second entry of classes_.
coef_labels = classes if coefficients.shape[0] == len(classes) else classes[1:]

# Size the subplot grid from the number of panels instead of assuming six
# classes; a fixed 3x2 grid fails as soon as there are more than six.
n_plot_cols = 2
n_plot_rows = int(np.ceil(len(coef_labels) / n_plot_cols))

# Plot feature importance for each class: the ten coefficients with the
# largest absolute value, drawn with their sign
plt.figure(figsize=(15, 5 * n_plot_rows))
for i, (class_name, class_coefs) in enumerate(zip(coef_labels, coefficients)):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    sorted_idx = np.argsort(np.abs(class_coefs))
    top_features = sorted_idx[-10:]
    plt.barh(np.array(feature_names)[top_features], class_coefs[top_features])
    plt.title(f'Top 10 Features for {class_name}')
# Lay out once, after every panel exists
plt.tight_layout()
plt.show()

## 7. Final Model and Predictions

In [None]:
# Train final model on all training data.
# The best pipeline is cloned (same hyperparameters, unfitted) before fitting.
# Fitting `grid_search.best_estimator_` directly would overwrite the model that
# was validated above, so re-running the evaluation or feature-importance cells
# afterwards would silently use a model that has already seen the validation
# rows.
final_model = clone(grid_search.best_estimator_)
final_model.fit(X, y)

# Prepare test data: select exactly the training feature columns, in the same
# order, so the test matrix always matches what the model was fitted on
X_test = test_features[X.columns]

# Make predictions
test_predictions = final_model.predict(X_test)

# Display prediction distribution
print("Prediction distribution:")
pred_counts = pd.Series(test_predictions).value_counts()
print(pred_counts)

# Visualize prediction distribution
plt.figure(figsize=(10, 6))
pred_counts.plot(kind='bar')
plt.title('Predicted Land Cover Class Distribution')
plt.xlabel('Land Cover Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Assemble the submission table: the test IDs alongside the predicted class
submission = testdf[['ID']].assign(**{'class': test_predictions})

# Preview the first submission rows (last expression -> rich display)
submission.head()

In [None]:
# Write the submission table to disk without the row index, then confirm
submission.to_csv(path_or_buf='submission.csv', index=False)
print("Submission file created: submission.csv")

## 8. Summary and Conclusion

In this notebook, we've built a Logistic Regression model to classify land cover types using NDVI time-series data. Our approach included:

1. **Feature Engineering**: Created statistical features from NDVI time series to capture temporal patterns
2. **Data Preprocessing**: Handled missing values with median imputation and standardized features
3. **Model Optimization**: Used GridSearchCV to find optimal hyperparameters
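4. **Model Evaluation**: Evaluated the tuned model on a held-out validation set (25% of the training data) using a classification report, overall accuracy, and a confusion matrix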
5. **Feature Analysis**: Identified the most important features for each land cover class