In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in file_names)
    for full_path in full_paths:
        print(full_path)
import matplotlib.pyplot as plt

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the hackathon training CSV into `df` and show it as the cell output.
train_csv_path = "/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv"
df = pd.read_csv(train_csv_path)
df

In [None]:

# Step 2: Identify the feature columns (every column whose name ends in '_N').
date_cols = [col for col in df.columns if col.endswith('_N')]

# Step 3: Coerce those columns to numeric; entries that cannot be parsed become NaN.
df[date_cols] = df[date_cols].apply(pd.to_numeric, errors='coerce')

# Step 4: One overall mean per class, pooled over all feature columns.
# The result is a Series indexed by class label, e.g.
#   class_means['forest']     -> single float for forest
#   class_means['impervious'] -> single float for impervious
class_means = df.groupby('class')[date_cols].apply(lambda x: x.stack().mean())


# Step 5: Row-level helper.
def fill_with_class_mean(row):
    """Fill the NaNs in one row's feature columns with that row's class mean.

    Args:
        row: A Series for a single record; it must hold a 'class' entry and
            the `date_cols` entries.

    Returns:
        The same Series object, modified in place, so it can be passed to
        ``DataFrame.apply(..., axis=1)``.

    Raises:
        KeyError: If the row's class has no entry in `class_means`.
    """
    class_type = row['class']
    mean_value = class_means[class_type]
    row[date_cols] = row[date_cols].fillna(mean_value).infer_objects(copy=False)
    return row


# Step 6: Fill every NaN with the pooled mean of its row's class.
# This is done column-wise on the whole frame (one aligned fillna per feature
# column) rather than by calling the helper on each row, which is much slower.
row_fill_values = df['class'].map(class_means)  # one fill value per row, aligned on the index
df[date_cols] = df[date_cols].apply(lambda col: col.fillna(row_fill_values))

In [None]:
# Sanity check: for the 'forest' rows, count the distinct feature values in each
# row and print the largest of those counts.
forest_features = df.loc[df['class'] == 'forest', date_cols]
print(forest_features.nunique(axis=1).max())


In [None]:
# Display the training frame as it stands after the class-mean fill.
df

In [None]:
!pip install imbalanced-learn==0.11.0 --quiet

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd

# Assumes `df` is the cleaned training frame built in the earlier cells.

# Feature columns: every column whose name ends in '_N'.
date_cols = [col for col in df.columns if col.endswith('_N')]

# SMOTE cannot work with NaN, so any value that is still missing is filled
# with its column mean before resampling.
X = df[date_cols].fillna(df[date_cols].mean())
y = df['class']

# Resample with SMOTE (default settings, fixed seed for reproducibility).
# NOTE(review): this runs before the train/test split done in a later cell, so
# the held-out rows there can include synthetic samples built from training
# rows; validation scores are probably optimistic. Confirm whether that is
# acceptable.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Assemble the balanced frame. It is named `df_balanced` because the print
# below and the following cells read it under that name.
df_balanced = pd.DataFrame(X_resampled, columns=date_cols)
df_balanced['class'] = y_resampled
df_resampled = df_balanced  # same object under its other name

print(df_balanced['class'].value_counts())

In [None]:
# Count the rows per class label in `df` (the frame before resampling) and print the tally.
label_column = df['class']
class_counts = label_column.value_counts()
print(class_counts)

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the balanced frame into class labels and the feature matrix.
y = df_balanced['class']
X = df_balanced[date_cols]

# Fit the scaler on the features, then standardise them with it.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Put the scaled array back into a DataFrame with the original column names.
X_scaled_df = pd.DataFrame(X_scaled, columns=date_cols)

# Build the labelled, scaled frame (labels attached positionally).
df_scaled = X_scaled_df.assign(**{'class': y.values})

# Show the final result.
df_scaled

In [None]:
# Step 1: Separate features and class labels.
X = df_scaled[date_cols]
y = df_scaled['class']

# Step 2: Interquartile range of each feature column.
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1

# Step 3: Boolean frame marking the cells that fall outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for their column.
outlier_mask = (X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))

# Step 4: Mean of every feature column within each class (one row per class).
# The means are taken over all rows, flagged cells included.
class_means = df_scaled.groupby('class')[date_cols].mean()

# Step 5: Replace each flagged cell with the mean of its column within the
# row's class. Each row's class means are looked up once and DataFrame.mask
# swaps in the replacements, instead of visiting the cells one at a time.
row_class_means = pd.DataFrame(
    class_means.loc[y.to_numpy()].to_numpy(),
    index=X.index,
    columns=date_cols,
)
X_corrected = X.mask(outlier_mask, row_class_means)

# Step 6: Re-attach the labels (positionally) to the corrected features.
df_corrected = X_corrected.copy()
df_corrected['class'] = y.values

# Step 7: Report how many cells were replaced.
total_outliers = outlier_mask.sum().sum()
print(f"Total outlier cells replaced: {total_outliers}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Features (scaled, outlier cells replaced) and their class labels.
X_final, y_final = X_corrected, y

# Hold out 20% of the rows, stratified by class, with a fixed seed.
# NOTE(review): X_corrected derives from the resampled frame, so this held-out
# split may contain synthetic rows; treat the scores below as optimistic until
# confirmed on untouched data.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    stratify=y_final,
    random_state=42,
)

# Fit the multinomial logistic-regression classifier on the training split.
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Predict the held-out split.
y_pred = model.predict(X_test)

# Compute the evaluation metrics, then print them.
report_text = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("🔍 Classification Report:\n", report_text)
print("✅ Accuracy Score:", accuracy)
print("📉 Confusion Matrix:\n", matrix)

In [None]:
# Read the hackathon test CSV into `test_data` and show its (rows, columns) shape.
test_csv_path = "/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv"
test_data = pd.read_csv(test_csv_path)
test_data.shape

In [None]:
# Step 1: Prepare test data.
test_ids = test_data['ID']  # keep IDs for the final output

# Select exactly the columns the scaler was fitted on, in the same order,
# instead of relying on which other columns the test file happens to contain.
# Values are coerced to numeric the same way the training features were.
test_features = test_data[date_cols].apply(pd.to_numeric, errors='coerce')

# Step 2: Scale with the scaler fitted on the training features.
# StandardScaler passes NaN through unchanged. After standardising, 0 is the
# training mean of a column, so filling with 0 imputes missing test values with
# that mean (class-specific means cannot be used here: the class is unknown).
test_scaled_df = pd.DataFrame(
    scaler.transform(test_features),
    index=test_features.index,
    columns=date_cols,
).fillna(0.0)
test_scaled = test_scaled_df.to_numpy()  # plain-array form of the scaled features

# Step 3: Predict with the trained model. A DataFrame is passed so the feature
# names match the ones the model was fitted with.
test_preds = model.predict(test_scaled_df)

# Step 4: Create the final output DataFrame.
output_df = pd.DataFrame({
    'ID': test_ids,
    'Predicted_Class': test_preds
})
output_df

In [None]:
# Write the predictions to submission.csv, leaving out the DataFrame index.
submission_path = 'submission.csv'
output_df.to_csv(submission_path, index=False)