In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, name) for name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv
/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv
/kaggle/input/hacktest/hacktest.csv
/kaggle/input/hacktrain-csv/hacktrain.csv


In [55]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

def load_data(train_path, test_path):
    """Load the training and test CSV files and standardize their key column names.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training CSV. It must contain a target column whose
        name equals ``class`` ignoring case.
    test_path : str or path-like
        Location of the test CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` where the target column is named ``'class'`` and the
        identifier column is named ``'ID'``. When a frame has no ``'ID'``
        column, its first column is renamed to ``'ID'``.

    Raises
    ------
    ValueError
        If the training data has no ``class`` column (case-insensitive).
    """
    # Read from the paths the caller supplied instead of hard-coded locations
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Check for target column (case insensitive)
    target_col = next((col for col in train.columns if col.lower() == 'class'), None)
    if target_col is None:
        raise ValueError("Could not find target 'class' column in training data")

    # Standardize column names
    train = train.rename(columns={target_col: 'class'})
    id_col = 'ID' if 'ID' in train.columns else train.columns[0]
    train = train.rename(columns={id_col: 'ID'})

    # Same for test data
    id_col_test = 'ID' if 'ID' in test.columns else test.columns[0]
    test = test.rename(columns={id_col_test: 'ID'})

    return train, test

# Load data
TRAIN_PATH = "/kaggle/input/hacktrain-csv/hacktrain.csv"
TEST_PATH = "/kaggle/input/hacktest/hacktest.csv"

train_data, test_data = load_data(TRAIN_PATH, TEST_PATH)
print("Training data columns:", train_data.columns.tolist())
print("Test data columns:", test_data.columns.tolist())

Training data columns: ['Unnamed: 0', 'ID', 'class', '20150720_N', '20150602_N', '20150517_N', '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N', '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N', '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N', '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N', '20140218_N', '20140202_N', '20140117_N', '20140101_N']
Test data columns: ['Unnamed: 0', 'ID', '20150720_N', '20150602_N', '20150517_N', '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N', '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N', '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N', '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N', '20140218_N', '20140202_N', '20140117_N', '20140101_N']


In [56]:
def preprocess_data(df, is_train=True):
    """Turn a table of dated NDVI readings into per-row summary features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with an identifier column, NDVI columns and, for training
        data, a ``class`` column (matched case-insensitively).
    is_train : bool, default True
        When True the ``class`` column is required and copied to the output.

    Returns
    -------
    pandas.DataFrame
        One row per input row, on a fresh ``RangeIndex``, with the columns
        ``mean_ndvi``, ``std_ndvi``, ``min_ndvi``, ``max_ndvi``,
        ``range_ndvi``, ``median_ndvi``, ``ID`` and (training only) ``class``.

    Raises
    ------
    ValueError
        If ``is_train`` is True and no ``class`` column exists.
    """
    # Work on a copy so the caller's frame is never modified
    df = df.copy()

    # Locate the ID column: exact match (any case), then any name containing
    # "id", then fall back to the first column.
    id_candidates = [col for col in df.columns if col.upper() == 'ID']
    if not id_candidates:
        id_candidates = [col for col in df.columns if 'id' in col.lower()]
    if not id_candidates:
        id_candidates = [df.columns[0]]
    id_name = id_candidates[0]

    # Columns that must never be treated as NDVI readings
    excluded = [id_name]
    class_name = None
    if is_train:
        class_candidates = [col for col in df.columns if col.lower() == 'class']
        if not class_candidates:
            raise ValueError("Could not find 'class' column in training data")
        class_name = class_candidates[0]
        excluded.append(class_name)

    ndvi_cols = [col for col in df.columns
                 if col not in excluded and ('N' in col or 'NDVI' in col.upper())]
    if not ndvi_cols:
        # No NDVI-looking names: fall back to every remaining column
        ndvi_cols = [col for col in df.columns if col not in excluded]

    # Parse the date prefix of each column name once. Only parsing failures
    # are tolerated; anything else propagates instead of being swallowed.
    parsed_dates = {}
    for col in ndvi_cols:
        try:
            parsed_dates[col] = pd.to_datetime(col.split('_')[0])
        except (ValueError, TypeError, OverflowError):
            continue

    if parsed_dates:
        # Chronological order; columns without a parsable date are left out
        ordered_cols = sorted(parsed_dates, key=parsed_dates.get)
    else:
        # If no dates found, just use original order
        ordered_cols = ndvi_cols

    ndvi = df[ordered_cols].astype(float)

    # Fill gaps along each row's time axis first, then fill rows that had no
    # readings at all with the column mean. Columns that are still empty
    # (every row was empty) become 0.0 so the feature table keeps a fixed shape.
    ndvi = ndvi.interpolate(axis=1, limit_direction='both')
    ndvi = ndvi.fillna(ndvi.mean()).fillna(0.0).reset_index(drop=True)

    # Feature engineering: row-wise summary statistics
    features = pd.DataFrame({
        'mean_ndvi': ndvi.mean(axis=1),
        'std_ndvi': ndvi.std(axis=1),
        'min_ndvi': ndvi.min(axis=1),
        'max_ndvi': ndvi.max(axis=1),
    })
    features['range_ndvi'] = features['max_ndvi'] - features['min_ndvi']
    features['median_ndvi'] = ndvi.median(axis=1)

    # Add ID (and the label for training data) back by position
    features['ID'] = df[id_name].values
    if is_train:
        features['class'] = df[class_name].values

    return features

# Preprocess data. On failure, print diagnostics and re-raise so later cells
# never run against missing or stale `train_processed` / `test_processed`.
try:
    train_processed = preprocess_data(train_data)
    test_processed = preprocess_data(test_data, is_train=False)
except Exception as e:
    print(f"Error during preprocessing: {str(e)}")
    print("Available columns in train data:", train_data.columns.tolist())
    raise
else:
    print("Preprocessing completed successfully!")
    print("Processed training data shape:", train_processed.shape)

Preprocessing completed successfully!
Processed training data shape: (8000, 8)


In [57]:
def train_model(train_df):
    """Fit a scaled, class-balanced logistic regression on the feature table.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Feature table containing an ``ID`` column, a ``class`` label column
        and one or more feature columns.

    Returns
    -------
    sklearn.pipeline.Pipeline or None
        The fitted ``StandardScaler`` + ``LogisticRegression`` pipeline, or
        ``None`` if anything went wrong. The error is printed rather than
        raised, so callers must check for ``None``.
    """
    try:
        X = train_df.drop(columns=['ID', 'class'])
        y = train_df['class']

        # Basic validation: reject both "no rows" and "no feature columns"
        if X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError("No features available for training")
        # nunique() ignores missing labels, which are not a real class
        if y.nunique() < 2:
            raise ValueError("Need at least 2 classes for classification")

        # `multi_class` is deliberately not passed: it is deprecated in recent
        # scikit-learn releases, and lbfgs already fits a multinomial model
        # for targets with three or more classes.
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegression(
                solver='lbfgs',
                max_iter=1000,
                class_weight='balanced',
                random_state=42
            ))
        ])

        pipeline.fit(X, y)
        return pipeline

    except Exception as e:
        print(f"Error during model training: {str(e)}")
        return None

# Fit the classifier on the engineered features; a None result means training failed
model = train_model(train_processed)
trained_ok = model is not None
if trained_ok:
    print("Model trained successfully!")

Model trained successfully!


In [58]:
def make_predictions(model, test_df):
    """Predict a class for every test row and return a submission table.

    Parameters
    ----------
    model : object or None
        Fitted estimator exposing ``predict``; ``None`` means no model exists.
    test_df : pandas.DataFrame
        Feature table with an ``ID`` column; every other column is passed to
        the model.

    Returns
    -------
    pandas.DataFrame or None
        Frame with the columns ``ID`` and ``class``, or ``None`` when no model
        was supplied or prediction failed (the reason is printed).
    """
    # Guard clause: nothing to do without a trained model
    if model is None:
        print("No model available for predictions")
        return None

    try:
        feature_frame = test_df.drop(columns=['ID'])
        predicted_labels = model.predict(feature_frame)
        return pd.DataFrame({'ID': test_df['ID'], 'class': predicted_labels})
    except Exception as err:
        print(f"Error during prediction: {err}")
        return None

# Build the submission table and write it to disk only when prediction succeeded
submission = make_predictions(model, test_processed)
if submission is None:
    print("Failed to create submission file")
else:
    submission.to_csv('submission.csv', index=False)
    print("Submission file created successfully!")
    print(submission.head())

Submission file created successfully!
   ID    class
0   1   forest
1   2  orchard
2   3  orchard
3   4  orchard
4   5  orchard
