# In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import cross_val_score

# Load training and test data from CSV files
train_df = pd.read_csv('/kaggle/input/hackathon-ndvi/hacktrain.csv')
test_df = pd.read_csv('/kaggle/input/hackathon-ndvi/hacktest.csv')

# Drop the unnamed index column if present; errors='ignore' makes the call a
# no-op when the column does not exist.
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
test_df = test_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Identify NDVI columns (those ending with '_N').
# NOTE(review): sorted() orders the names lexicographically; that is also
# chronological only if each name starts with a zero-padded date
# (e.g. YYYYMMDD) -- confirm against the CSV header.
ndvi_cols = sorted(c for c in train_df.columns if c.endswith('_N'))

# Per-column medians of the raw *training* NDVI values. They are the fallback
# for anything row-wise interpolation cannot fill (i.e. rows with no valid
# value at all). The same training statistics are applied to the test set so
# that preprocessing is learned from training data only, mirroring how the
# scaler further down is fitted on train and merely applied to test.
train_ndvi = train_df[ndvi_cols].astype(float)
ndvi_fallback = train_ndvi.median(axis=0)

# Interpolate missing NDVI values row-wise (along the time axis); with
# limit_direction='both' leading and trailing gaps are filled as well.
train_ndvi_interp = (
    train_ndvi.interpolate(axis=1, limit_direction='both')
    .fillna(ndvi_fallback)
)

# Same treatment for the test set (should have no missing values, but for safety)
test_ndvi = test_df[ndvi_cols].astype(float)
test_ndvi_interp = (
    test_ndvi.interpolate(axis=1, limit_direction='both')
    .fillna(ndvi_fallback)
)

# Function to extract features from NDVI time series
def _row_slopes(values):
    """Return the least-squares slope of each row of *values* against its column index.

    NaN entries are ignored. The mean and variance of the x positions are
    computed per row over the observed positions only, so rows with gaps
    still get a true least-squares slope. Rows with fewer than two observed
    points get a slope of 0.0.
    """
    n_cols = values.shape[1]
    x = np.arange(n_cols, dtype=float)

    valid = ~np.isnan(values)
    counts = valid.sum(axis=1)
    # Avoid 0/0 for rows without any valid point; their slope is forced to 0 below.
    safe_counts = np.maximum(counts, 1)

    y = np.where(valid, values, 0.0)
    x_mean = (valid * x).sum(axis=1) / safe_counts
    y_mean = y.sum(axis=1) / safe_counts

    # Deviations of x are zeroed at missing positions, which also removes
    # those positions from the covariance sum.
    x_dev = np.where(valid, x - x_mean[:, None], 0.0)
    cov = (x_dev * (y - y_mean[:, None])).sum(axis=1)
    var = (x_dev ** 2).sum(axis=1)

    # var is positive exactly when a row has at least two observed points.
    has_trend = var > 0
    return np.where(has_trend, cov / np.where(has_trend, var, 1.0), 0.0)


def extract_features(df_ndvi):
    """Summarise each NDVI time-series row as a fixed set of features.

    Parameters
    ----------
    df_ndvi : pandas.DataFrame
        One row per series, one column per time step. The column position
        (0, 1, 2, ...) is used as the time coordinate for the trend.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df_ndvi`` with the columns ``ndvi_mean``, ``ndvi_std``,
        ``ndvi_min``, ``ndvi_max``, ``ndvi_range``, ``ndvi_median``,
        ``ndvi_first``, ``ndvi_last`` and ``ndvi_slope``.
    """
    features = pd.DataFrame(index=df_ndvi.index)
    # Basic statistics for each time-series row
    features['ndvi_mean'] = df_ndvi.mean(axis=1)
    features['ndvi_std'] = df_ndvi.std(axis=1)
    features['ndvi_min'] = df_ndvi.min(axis=1)
    features['ndvi_max'] = df_ndvi.max(axis=1)
    features['ndvi_range'] = features['ndvi_max'] - features['ndvi_min']
    features['ndvi_median'] = df_ndvi.median(axis=1)
    # First and last NDVI values in column order
    features['ndvi_first'] = df_ndvi.iloc[:, 0]
    features['ndvi_last'] = df_ndvi.iloc[:, -1]
    # Linear trend (slope = Cov(x, y) / Var(x)) of NDVI over the column index
    features['ndvi_slope'] = _row_slopes(df_ndvi.to_numpy(dtype=float))
    return features

# Extract features for training and test sets
train_features = extract_features(train_ndvi_interp)
test_features = extract_features(test_ndvi_interp)

# Encode the class labels as integers for the classifier
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['class'])

# Standardize features for logistic regression; the scaler is fitted on the
# training features only and then applied unchanged to the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features)
X_test = scaler.transform(test_features)

# Logistic Regression with balanced class weights.
# The explicit multi_class='multinomial' argument is intentionally omitted: it
# is deprecated in recent scikit-learn releases, and with solver='lbfgs' the
# default already fits a multinomial model when there are 3+ classes.
clf = LogisticRegression(solver='lbfgs', class_weight='balanced',
                         max_iter=1000, C=1.0)

# Optional: check cross-validated accuracy (e.g., 5-fold)
# cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
# print(f"CV Accuracy: {cv_scores.mean():.3f}")

# Train on all training data
clf.fit(X_train, y_train)

# Predict on the test set and map the integer predictions back to the
# original class labels
y_pred = clf.predict(X_test)
pred_labels = label_encoder.inverse_transform(y_pred)

# Prepare the submission DataFrame
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'class': pred_labels,
})

# Save to CSV
submission.to_csv('submission.csv', index=False)
