In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

np.random.seed(0)

In [2]:
train_val = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/train.csv")

In [3]:
train_val["target"].value_counts()

Bacteroides_fragilis        20139
Streptococcus_pyogenes      20076
Streptococcus_pneumoniae    20074
Campylobacter_jejuni        20063
Salmonella_enterica         20030
Escherichia_coli            19958
Enterococcus_hirae          19947
Escherichia_fergusonii      19937
Staphylococcus_aureus       19929
Klebsiella_pneumoniae       19847
Name: target, dtype: int64

In [4]:
features = [
    column for column in train_val
    if column not in ("row_id", "target")
]

In [5]:
train_val.describe()    

Unnamed: 0,row_id,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,...,A8T0G0C2,A8T0G1C1,A8T0G2C0,A8T1G0C1,A8T1G1C0,A8T2G0C0,A9T0G0C1,A9T0G1C0,A9T1G0C0,A10T0G0C0
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,99999.5,6.421457e-07,-3e-06,-1.4e-05,-1e-05,5e-06,2.5e-05,1.4e-05,-9e-06,-2.8e-05,...,0.000135,0.000348,0.000253,0.000464,0.000581,0.000379,6.2e-05,8.982134e-05,0.0001170014,2.215407e-07
std,57735.171256,8.654927e-05,0.000132,0.000287,0.000436,0.000683,0.000869,0.000775,0.000441,0.000107,...,0.000711,0.001117,0.000914,0.001238,0.001528,0.001182,0.000466,0.0005475606,0.0006788277,2.050081e-05
min,0.0,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,-4.3e-05,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-9.536743e-06,-9.536743e-06,-9.536743e-07
25%,49999.75,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,-4.3e-05,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-9.536743e-06,-9.536743e-06,-9.536743e-07
50%,99999.5,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.000237,-0.000199,-0.000114,-4.3e-05,...,1.4e-05,6.4e-05,4.7e-05,0.000124,0.000114,4.3e-05,-1e-05,4.632568e-07,4.632568e-07,-9.536743e-07
75%,149999.25,-9.536743e-07,-3e-06,-1.3e-05,-4e-06,-1.1e-05,3e-06,-3e-05,4e-06,-2.8e-05,...,0.000111,0.000463,0.000297,0.000684,0.000774,0.000365,3e-05,6.046326e-05,6.946326e-05,-9.536743e-07
max,199999.0,0.009999046,0.00999,0.009957,0.009886,0.0198,0.01976,0.0198,0.009886,0.009957,...,0.019957,0.019914,0.019957,0.019914,0.019914,0.019957,0.00999,0.009990463,0.01999046,0.0009990463


In [6]:
train, val = train_test_split(train_val)
X_train, y_train = train[features].values, train["target"].values
X_val, y_val = val[features].values, val["target"].values

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

In [7]:
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=0.99)),
        ("model",  RandomForestClassifier(n_estimators=1000, n_jobs=-1)),
    ],
    verbose=1,
)

In [8]:
pipeline.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 3) Processing scaler, total=   1.0s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   4.9s
[Pipeline] ............. (step 3 of 3) Processing model, total=19.8min


Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=0.99)),
                ('model',
                 RandomForestClassifier(n_estimators=1000, n_jobs=-1))],
         verbose=1)

In [9]:
y_pred = pipeline.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5048
           1       0.99      1.00      0.99      4975
           2       0.99      0.99      0.99      5010
           3       0.99      0.99      0.99      5020
           4       0.98      0.99      0.99      4930
           5       1.00      0.99      1.00      4960
           6       1.00      0.99      0.99      5004
           7       0.99      0.99      0.99      5001
           8       0.99      0.99      0.99      5070
           9       0.99      0.99      0.99      4982

    accuracy                           0.99     50000
   macro avg       0.99      0.99      0.99     50000
weighted avg       0.99      0.99      0.99     50000



In [10]:
y_pred = pipeline.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15091
           1       1.00      1.00      1.00     15088
           2       1.00      1.00      1.00     14937
           3       1.00      1.00      1.00     14938
           4       1.00      1.00      1.00     15007
           5       1.00      1.00      1.00     14887
           6       1.00      1.00      1.00     15026
           7       1.00      1.00      1.00     14928
           8       1.00      1.00      1.00     15004
           9       1.00      1.00      1.00     15094

    accuracy                           1.00    150000
   macro avg       1.00      1.00      1.00    150000
weighted avg       1.00      1.00      1.00    150000



In [11]:
test = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/test.csv")
y_pred = pipeline.predict(test[features].values)

In [12]:
output = pd.DataFrame(
    {
        "row_id": test["row_id"],
        "target": label_encoder.inverse_transform(y_pred)
        
    }
)

In [13]:
output.to_csv("/kaggle/working/submission.csv", index=False)