In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

Loading the data

In [None]:
# Read the competition train/test CSVs; the last line displays both shapes so the
# row/column counts can be checked at a glance.
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ecosphere-forecasting/train.csv',
        '/kaggle/input/ecosphere-forecasting/test.csv',
    )
)
train_raw.shape, test_raw.shape

Exploring the data

In [None]:
# Preview the first ten training rows to sanity-check column names and values.
train_raw.iloc[:10]

In [None]:
# Preview the first five test rows (the default preview size, written out explicitly).
test_raw.head(n=5)

In [None]:
# Count missing values in each training column.
train_raw.isna().sum()

In [None]:
# Count missing values in each test column.
test_raw.isna().sum()

In [None]:
# Class balance: number of training rows per 'Air Quality' label.
air_quality_labels = train_raw['Air Quality']
air_quality_labels.value_counts()

In [None]:
# Summary statistics for the training columns.
train_summary = train_raw.describe()
train_summary

In [None]:
# Summary statistics for the test columns, for comparison with the training set.
test_summary = test_raw.describe()
test_summary

Data visualization

In [None]:
# Feature columns are every test-set column except 'Id'.
# This must run while `test_raw` is still the loaded DataFrame; a later cell
# rebinds that name to the scaled array.
features = test_raw.columns.drop('Id')

In [None]:
# Draw one histogram (with a KDE overlay) per feature, coloured by 'Air Quality',
# each on its own figure.
for feature in features:
    fig, ax = plt.subplots()
    sns.histplot(data=train_raw, x=feature, hue='Air Quality', kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature} by Air Quality')
    plt.show()

In [None]:
# Pairwise scatter plots for a hand-picked subset of columns, coloured by the
# 'Air Quality' label.
subset_features = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO']
pairplot_frame = train_raw.loc[:, [*subset_features, 'Air Quality']]
sns.pairplot(data=pairplot_frame, hue='Air Quality')
plt.show()

Prepare features and target

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

In [None]:
# Separate the target column from the model inputs: `y` is the 'Air Quality'
# column, `X` is every remaining column except 'Id'.
y = train_raw.loc[:, 'Air Quality']
X = train_raw.drop(['Id', 'Air Quality'], axis=1)
X.shape, y.shape

In [None]:
# Encode the target labels as integer class ids.
# The mapping is applied to the raw column rather than to `y` itself so that
# re-running this cell is safe: mapping an already-encoded `y` would turn every
# label into NaN.
label_to_id = {'Good': 0, 'Moderate': 1, 'Poor': 2, 'Hazardous': 3}
y = train_raw['Air Quality'].map(label_to_id)
# Series.map yields NaN for any label missing from the mapping; fail loudly here
# instead of passing NaN targets on to the model.
assert y.notna().all(), "Unmapped 'Air Quality' labels found in train_raw"

In [None]:
# Keep the test ids for the submission file before `test_raw` is replaced below.
submission = pd.DataFrame({'Id': test_raw['Id']})

# Fit the scaler on the training features, then apply the same fitted scaling to
# the test features.
# NOTE(review): the scaler is fitted on all training rows before the hold-out
# split, so validation rows contribute to the scaling statistics — confirm this
# is acceptable.
# NOTE(review): `X` and `test_raw` are rebound to the scaled arrays, so this cell
# cannot be re-run without first re-running the loading and feature cells.
scaler = RobustScaler().fit(X)
X = scaler.transform(X)
test_raw = scaler.transform(test_raw.drop(columns='Id'))

Split data

In [None]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Train best model

In [None]:
# Keyword arguments later unpacked into CatBoostClassifier(**catboost_params).
# NOTE(review): the values look like the result of a hyper-parameter search —
# their origin is not shown in this notebook.
catboost_params = dict(
    learning_rate=0.03,
    l2_leaf_reg=0.001,
    iterations=600,
    depth=4,
    bagging_temperature=0.5555555555555556,
    verbose=False,
)

In [None]:
# Build the classifier from the parameter dict and train it on the training split.
model = CatBoostClassifier(**catboost_params)
model.fit(X=X_train, y=y_train)

In [None]:
# Score the trained model on the hold-out split and report its accuracy.
predict_val = model.predict(X_test)
accuracy_val = accuracy_score(y_test, predict_val)
print(f"Accuracy: {accuracy_val:.4f}")


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
# Plot the confusion matrix of the trained model on the hold-out split.
ConfusionMatrixDisplay.from_estimator(estimator=model, X=X_test, y=y_test)


In [None]:
# Predict class ids for the competition test set. At this point `test_raw` is the
# scaled feature array produced by `scaler.transform`, not the loaded DataFrame.
predict = model.predict(test_raw)

Submission

In [None]:
# Attach the predicted class ids to the saved test ids, translate them back to
# their text labels (the inverse of the encoding applied to `y`), and write the
# submission file.
# NOTE(review): the output column is 'Air_Quality_Level' while the training
# target is 'Air Quality' — confirm the name against the competition's sample
# submission.
id_to_label = {0: 'Good', 1: 'Moderate', 2: 'Poor', 3: 'Hazardous'}
submission['Air_Quality_Level'] = predict
submission['Air_Quality_Level'] = submission['Air_Quality_Level'].map(id_to_label)
submission.to_csv('submission.csv', index=False)
submission.head()