In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 


# Input data files are available in the read-only "../input/" directory.
# Running this cell (by clicking Run or pressing Shift+Enter) lists every file under the current directory ('./').
from sklearn.impute import KNNImputer

import os
# Walk the current directory tree and print the path of every file found.
for root, _, names in os.walk('./'):
    for file_path in (os.path.join(root, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

%matplotlib inline

In [None]:
# Read the training file and discard the unnamed column that came along with the CSV.
df = pd.read_csv("./hacktrain.csv").drop(columns=['Unnamed: 0'])
df

In [None]:
# Print each column's dtype and non-null count, which shows which columns have missing values.
df.info()

In [None]:
# Numeric feature columns to impute. 'ID' is an identifier, not a measurement,
# so it is left out (select_dtypes alone would include it). drop() returns a new
# frame, so `df` itself is not touched here.
df_numeric = df.select_dtypes(include='number').drop(columns=['ID'], errors='ignore')

# For every row, the mean of each column within that row's class.
# Rows whose 'class' is missing get NaN here and therefore keep their own values below.
class_means = df.groupby('class')[df_numeric.columns].transform('mean')

# Fill gaps with the class mean first; if a class has no observed value at all
# for a column (its mean is NaN), fall back to the overall column mean.
df_numeric_imputed = df_numeric.fillna(class_means).fillna(df_numeric.mean())

# Assign imputed numeric columns back to original df
df[df_numeric.columns] = df_numeric_imputed

# NOTE(review): this imputation uses the class label and runs before the later
# hold-out split, so it cannot be repeated on unlabeled data — confirm this is intended.
df.head()


In [None]:
# Boolean mask of missing cells, reused for both the per-column counts and the heatmap.
missing_mask = df.isnull()
print(missing_mask.sum(), '\n\n')

# The original author's note records that a duplicate-row check, df[df.duplicated()], found none.
plt.figure(figsize=(17, 12))
sns.heatmap(missing_mask)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Feature columns only: the label and the identifier are set aside and re-attached below.
data = df.drop(columns=['class', 'ID'])

# Pairwise correlations between the feature columns.
corr_matrix = data.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.show()

# Absolute correlation above which a column is treated as redundant.
threshold = 0.9

# A column is dropped when its absolute correlation with ANY column that comes
# before it in the matrix exceeds the threshold (only the part of each row left
# of the diagonal is inspected, so each pair is checked once).
drop_columns = {
    later_col
    for pos, later_col in enumerate(corr_matrix.columns)
    if (corr_matrix.iloc[pos, :pos].abs() > threshold).any()
}

# Remove the redundant columns from the feature frame.
data_cleaned = data.drop(columns=drop_columns)

# Put 'class' and 'ID' back in front of the remaining features.
label_and_id = df[['class', 'ID']]
df_cleaned = pd.concat([label_and_id, data_cleaned], axis=1)

# Report what survived the pruning.
print("Remaining columns after removing collinear ones:")
print(df_cleaned.columns)

# Later cells keep working with the pruned frame under the name `df`.
df = df_cleaned


In [None]:
# Step 1: Reload the raw (non-imputed) file so it can be compared with `df`,
# the imputed frame already in memory.
df_original = pd.read_csv('hacktrain.csv')  # Replace with your actual path

# Step 2: Pick the columns to compare. Only numeric columns present in BOTH
# frames are kept, and identifier/index columns are skipped: they are not
# features, and plotting them on the shared axis would distort the comparison.
non_feature_cols = {'Unnamed: 0', 'ID'}
numeric_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in non_feature_cols and col in df_original.columns
]

# Step 3: Mean / standard deviation before and after imputation.
# Each statistic is computed once and reused for the difference columns.
original_mean = df_original[numeric_cols].mean(numeric_only=True)
imputed_mean = df[numeric_cols].mean(numeric_only=True)
original_std = df_original[numeric_cols].std(numeric_only=True)
imputed_std = df[numeric_cols].std(numeric_only=True)

summary = pd.DataFrame({
    'Original_Mean': original_mean,
    'Imputed_Mean': imputed_mean,
    'Mean_Diff': imputed_mean - original_mean,
    'Original_Std': original_std,
    'Imputed_Std': imputed_std,
    'Std_Diff': imputed_std - original_std
}).round(2)

print("\n=== Summary of Statistical Differences (Original vs Imputed) ===\n")
print(summary)

# Step 4: KDE plots, `batch_size` columns per figure. Within a figure each
# column gets one colour: dashed line = original, solid line = imputed.
batch_size = 10

# Colour palettes, cycled from one figure to the next
palettes = ['Set1', 'Set2', 'Paired']

for batch_idx, i in enumerate(range(0, len(numeric_cols), batch_size)):
    batch_cols = numeric_cols[i:i + batch_size]
    palette = sns.color_palette(palettes[batch_idx % len(palettes)], n_colors=len(batch_cols))

    plt.figure(figsize=(20, 10))
    for col, color in zip(batch_cols, palette):
        # Original with dashed line
        sns.kdeplot(df_original[col], label=f'{col} - Original', linestyle='--', color=color)
        # Imputed with solid line
        sns.kdeplot(df[col], label=f'{col} - Imputed', linestyle='-', color=color)

    plt.title(f'KDE Comparison (Original vs Imputed) — Columns {i+1} to {i+len(batch_cols)}', fontsize=14)
    plt.legend(ncol=2, fontsize=9)
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ————————————————————
# 1. Load & Prep
# ————————————————————
# Identifier/index columns are not features; errors='ignore' keeps this cell re-runnable.
df = df.drop(columns=['Unnamed: 0', 'ID'], errors='ignore')

# Encode target: class names -> integer codes (`le` is reused later to decode predictions)
le = LabelEncoder()
y = le.fit_transform(df['class'])
X = df.drop(columns=['class'])

# Train/test split: 20% hold-out, stratified so class proportions are preserved
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# ————————————————————
# 2. Build Pipeline with Degree 2 PolynomialFeatures
# ————————————————————
pipe = Pipeline([
    # Mean imputation learned from the training split. It leaves values that are
    # already present untouched, and lets the fitted pipeline accept frames that
    # still contain missing values at predict time (the later steps reject NaN).
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('clf', LogisticRegression(
        solver='saga',
        penalty='l2',
        C=15,
        class_weight=None,
        max_iter=1000,
        # NOTE(review): `multi_class` is reported as deprecated in recent
        # scikit-learn releases; kept to preserve the original configuration.
        multi_class='multinomial',
        n_jobs=-1,
        # 'saga' shuffles the data, so fix the seed to make runs reproducible.
        random_state=42
    ))
])

# ————————————————————
# 3. Train & Evaluate
# ————————————————————
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Report the settings straight from the fitted pipeline so the printout cannot
# drift away from what was actually trained.
print("Parameters:")
pipe_params = pipe.get_params()
print({
    key: pipe_params[key]
    for key in ("clf__solver", "clf__penalty", "clf__class_weight", "clf__C")
})

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


In [None]:
# Read the test file, discard its unnamed column, and show the resulting shape.
test_data = pd.read_csv("hacktest.csv").drop(columns=['Unnamed: 0'])
test_data.shape
 

In [None]:
# Detach the identifier column: keep it in `ID` and remove it from `test_data` in place.
ID = test_data.pop('ID')

In [None]:
# Keep only the features that survived training-side cleaning, and order them
# the way the training frame orders them: scikit-learn estimators fitted on a
# DataFrame expect the same feature names in the same order at predict time.
feature_cols = [col for col in df.columns if col != 'class' and col in test_data.columns]
test_data = test_data[feature_cols]
print(test_data.columns)

In [None]:
# Predict encoded class labels for the rows of `test_data` with the fitted pipeline.
# NOTE(review): this reuses the name `y_test`, replacing the hold-out labels produced
# by the earlier train/test split; re-running the evaluation afterwards would use these
# predictions instead.
y_test = pipe.predict(test_data)

In [None]:
# Map the integer predictions back to the original class names with the fitted LabelEncoder.
y_decoded = le.inverse_transform(y_test)
y_decoded

In [None]:
# Pair each saved test identifier with its decoded class prediction.
submission_columns = {'ID': ID, 'class': y_decoded}
result = pd.DataFrame(submission_columns)

In [None]:
# Write the submission CSV without the index column. On Kaggle this file appears under
# the output section of the right-hand navbar; it is the file to submit.
result.to_csv("submission_mystuff2v1.csv", index=False)