In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/melting-point/sample_submission.csv
/kaggle/input/melting-point/train.csv
/kaggle/input/melting-point/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/melting-point/train.csv').drop_duplicates(subset = ['SMILES', 'Tm']).reset_index(drop = True)
test = pd.read_csv('/kaggle/input/melting-point/test.csv')
sub = pd.read_csv('/kaggle/input/melting-point/sample_submission.csv')
X = train['SMILES']
y = train['Tm']
test = test['SMILES']
X

0             FC1=C(F)C(F)(F)C1(F)F
1       c1ccc2c(c1)ccc3Nc4ccccc4c23
2               CCN1C(C)=Nc2ccccc12
3                        CC#CC(=O)O
4                         CCCCC(S)C
                   ...             
2657                         ClCCBr
2658                 N#CC(Cl)(Cl)Cl
2659             Cc1ccc2c(C)cccc2c1
2660          CCC(=O)c1ccc2ccccc2c1
2661              Brc1ccc(cc1)N(C)C
Name: SMILES, Length: 2662, dtype: object

In [3]:
import re

SMILES_RE = re.compile(r"""
    \[[^\[\]]+\]      |  # атом целиком в [ ... ]
    Br|Cl             |  # двухбуквенные элементы вне скобок
    @@?               |  # @, @@
    \\|/              |  # стереосвязи
    =|#|:|-|\.        |  # типы связей и точка
    %\d{2,}           |  # кольца >=10: %10, %123 ...
    \d                |  # кольца 0-9
    [A-Z][a-z]?       |  # элементы: C, N, O, Si, Na ...
    [bcnops]          |  # ароматические: c, n, o, s, p, b
    [()]                 # ветвления
""", re.X)

def tokenize_smiles(s: str):
    f = [m.group(0) for m in SMILES_RE.finditer(s)]
    s = []
    k = 6
    for i in range(len(f) - 1):
        cur = f[i]
        for j in range(i + 1, min(i + k - 1, len(f))):
            cur += f[j]
            s.append(cur)
    return f + s

# пример:
s = "Clc1ccc(cc1)C(=C(Cl)Cl)c2ccc(Cl)cc2"
print(tokenize_smiles(s))


['Cl', 'c', '1', 'c', 'c', 'c', '(', 'c', 'c', '1', ')', 'C', '(', '=', 'C', '(', 'Cl', ')', 'Cl', ')', 'c', '2', 'c', 'c', 'c', '(', 'Cl', ')', 'c', 'c', '2', 'Clc', 'Clc1', 'Clc1c', 'Clc1cc', 'c1', 'c1c', 'c1cc', 'c1ccc', '1c', '1cc', '1ccc', '1ccc(', 'cc', 'ccc', 'ccc(', 'ccc(c', 'cc', 'cc(', 'cc(c', 'cc(cc', 'c(', 'c(c', 'c(cc', 'c(cc1', '(c', '(cc', '(cc1', '(cc1)', 'cc', 'cc1', 'cc1)', 'cc1)C', 'c1', 'c1)', 'c1)C', 'c1)C(', '1)', '1)C', '1)C(', '1)C(=', ')C', ')C(', ')C(=', ')C(=C', 'C(', 'C(=', 'C(=C', 'C(=C(', '(=', '(=C', '(=C(', '(=C(Cl', '=C', '=C(', '=C(Cl', '=C(Cl)', 'C(', 'C(Cl', 'C(Cl)', 'C(Cl)Cl', '(Cl', '(Cl)', '(Cl)Cl', '(Cl)Cl)', 'Cl)', 'Cl)Cl', 'Cl)Cl)', 'Cl)Cl)c', ')Cl', ')Cl)', ')Cl)c', ')Cl)c2', 'Cl)', 'Cl)c', 'Cl)c2', 'Cl)c2c', ')c', ')c2', ')c2c', ')c2cc', 'c2', 'c2c', 'c2cc', 'c2ccc', '2c', '2cc', '2ccc', '2ccc(', 'cc', 'ccc', 'ccc(', 'ccc(Cl', 'cc', 'cc(', 'cc(Cl', 'cc(Cl)', 'c(', 'c(Cl', 'c(Cl)', 'c(Cl)c', '(Cl', '(Cl)', '(Cl)c', '(Cl)cc', 'Cl)', 'Cl)c', 'Cl

In [4]:
!pip install rdkit-pypi
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error

from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors, MACCSkeys, RDKFingerprint, rdFingerprintGenerator
from rdkit.Chem.AtomPairs import Pairs, Torsions

# DISABLE WARNING FROM rdkit
from rdkit import RDLogger

def extract_all_descriptors(df):

    # GET ALL DESCRIPTORS
    descriptor_list = Descriptors._descList    # --> THESE WILL RETURN LIST OF TUPLE
    descriptors = [desc[0] for desc in descriptor_list]

    print(f'There Are {len(descriptor_list)} Descriptor Features')

    # EXTRACT ALL DESCRIPTORS FROM SMILES FEATURES
    result = []
    for smi in df:

        mol = Chem.MolFromSmiles(smi)

        # IF MOLECOLE IS INVALID
        if mol is None:
            row = {name : None for name, func in descriptor_list}
        else:
            # CREATE DESCRIPTORS FEATURES
            row = {name: func(mol) for name, func in descriptor_list}

        result.append(row)

    # MERGE DATA WITH EXTRACTED FEATURES
    df_descriptor = pd.DataFrame(result)
    df_result = pd.concat((df, df_descriptor), axis = 1)
    return df_result
def extract_all_fingerprint(df, SMILES = 'SMILES', morgan_radius = 2, morgan_nbits = 1024):

    fps_data = []  # --> STORE NEW FEATURES DATA

    # DEFINE MORGAN GENERATOR
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius = morgan_radius, fpSize = morgan_nbits, countSimulation = True, includeChirality = False)

    fcfp = rdFingerprintGenerator.GetMorganFeatureAtomInvGen()
    fcfp_gen = rdFingerprintGenerator.GetMorganGenerator(radius = morgan_nbits, fpSize = morgan_nbits, atomInvariantsGenerator = fcfp, countSimulation= True, includeChirality = False)

    atom_gen = rdFingerprintGenerator.GetAtomPairGenerator(fpSize = 2048, countSimulation= True, includeChirality = False)

    # ITERATE EVERY SAMPLE OF SMILES FEATURES
    for smiles in df[SMILES]:
        mol = Chem.MolFromSmiles(smiles)

        if mol is None:
            print(smiles, 'is Invalid!')
            fps_data.append({})
            continue

        # STORE NEW FEATURE FOR EACH SAMPLES CREATED
        feature_rows = {}

        # MORGAN FINGERPRINT (ECFP)
        morgan_fp = morgan_gen.GetFingerprint(mol)
        for i in range(morgan_nbits):
            feature_rows[f"Morgan_{i}"] = morgan_fp[i]

        # FUNCTIONAL-CLASS FINGERPRINT (FCFP)
        fc_fp = fcfp_gen.GetFingerprint(mol)
        for i in range(morgan_nbits):
            feature_rows[f"FCFP_{i}"] = fc_fp[i]

        # MACCS KEYS (166 BITS)
        maccs_fp = MACCSkeys.GenMACCSKeys(mol)
        for i in range(len(maccs_fp)):
            feature_rows[f"MACCS_{i}"] = int(maccs_fp[i])

        # AtomPair Fingerprint (2D)
        atompair_fp = atom_gen.GetCountFingerprint(mol)
        for i in range(morgan_nbits):
            feature_rows[f"AtomPair_{i}"] = atompair_fp[i]

        # RDKIT FINGERPRINT
        rdkit_fp = RDKFingerprint(mol)
        for i in range(len(rdkit_fp)):
            feature_rows[f"RDKIT_{i}"] = int(rdkit_fp[i])

        # AVALON FINGERPRINT (IF AVAILABLE) 
        # avalon_fp = pyAvalonTools.GetAvalonFP(mol, morgan_nbits)
        # for i in range(len(avalon_fp)):
        #     feature_rows[f"Avalon_{i}"] = int(avalon_fp[i])


        fps_data.append(feature_rows)

    print(f'There are {morgan_nbits} Morgan Fingerprint Features')
    print(f'There are {len(maccs_fp)} MACCS Keys Features')
    print(f'There are {len(rdkit_fp)} RDKIT Fingerprint Features')

    # MERGE REAL DATA WITH EXTRACTED FEATURES
    fps_df = pd.DataFrame(fps_data)
    df_result = pd.concat((df, fps_df), axis = 1)

    return df_result


Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [5]:
X = extract_all_fingerprint(extract_all_descriptors(X)).drop(columns='SMILES')
test = extract_all_fingerprint(extract_all_descriptors(test)).drop(columns='SMILES')

There Are 208 Descriptor Features
There are 1024 Morgan Fingerprint Features
There are 167 MACCS Keys Features
There are 2048 RDKIT Fingerprint Features
There Are 208 Descriptor Features
There are 1024 Morgan Fingerprint Features
There are 167 MACCS Keys Features
There are 2048 RDKIT Fingerprint Features


In [6]:
X

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,RDKIT_2038,RDKIT_2039,RDKIT_2040,RDKIT_2041,RDKIT_2042,RDKIT_2043,RDKIT_2044,RDKIT_2045,RDKIT_2046,RDKIT_2047
0,11.537037,-4.947338,11.537037,2.668981,0.479530,162.032,162.032,161.990419,58,0,...,0,0,0,0,1,0,0,1,0,0
1,3.470111,1.207271,3.470111,1.207271,0.446879,217.271,206.183,217.089149,80,0,...,0,0,0,1,0,0,0,0,0,0
2,4.446898,0.992407,4.446898,0.992407,0.625839,160.220,148.124,160.100048,62,0,...,1,0,0,1,0,0,0,0,0,0
3,9.425694,-1.071759,9.425694,1.071759,0.422302,84.074,80.042,84.021129,32,0,...,0,0,0,0,0,0,0,0,0,0
4,4.232083,0.597176,4.232083,0.597176,0.540734,118.245,104.133,118.081621,44,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2657,5.139082,0.708333,5.139082,0.708333,0.489475,143.411,139.379,141.918490,26,0,...,0,0,0,0,0,0,0,0,0,0
2658,7.756944,-1.736111,7.756944,1.400463,0.476306,144.388,144.388,142.909632,34,0,...,0,0,0,0,0,0,0,0,1,0
2659,2.224537,1.326944,2.224537,1.326944,0.547649,156.228,144.132,156.093900,60,0,...,0,0,0,0,0,0,0,0,0,0
2660,11.457114,0.204401,11.457114,0.204401,0.653309,184.238,172.142,184.088815,70,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
import numpy as np
import pandas as pd

from sklearn.utils.multiclass import type_of_target
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression, f_classif
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor

def _safe_numeric_df(X: pd.DataFrame):
    Xn = X.select_dtypes(include=[np.number]).copy()
    # drop all-NaN columns
    Xn = Xn.loc[:, Xn.notna().any(axis=0)]
    # drop zero-variance columns
    nunique = Xn.nunique(dropna=False)
    Xn = Xn.loc[:, nunique > 1]
    return Xn

def _rank_from_scores(scores: pd.Series):
    # Convert scores to ranks in [0,1], higher score -> higher rank value
    # Handle NaNs by assigning worst rank (0)
    s = scores.fillna(scores.min() - abs(scores.min()) - 1e-9)
    order = s.rank(method="average")  # 1..n, low->low rank
    return (order - 1) / max(1, len(order) - 1)

def select_top_k_features(
    X: pd.DataFrame,
    y=None,
    k: int = 1000,
    random_state: int = 42,
    max_rows_for_scoring: int = 50000,
    lgbm_estimators: int = 500
):
    """
    Returns (X_topk, selected_columns) where X_topk = X[selected_columns].

    If y is provided: supervised selection (MI + tree gain + correlation/F).
    If y is None: unsupervised fallback (variance-based ranking).
    """
    rng = np.random.RandomState(random_state)

    # 1) keep only numeric & sane columns
    Xn = _safe_numeric_df(X)
    if Xn.shape[1] == 0:
        raise ValueError("No numeric columns to select from.")
    if k >= Xn.shape[1]:
        return X[Xn.columns], list(Xn.columns)

    # Optional subsample rows for scoring speed
    if max_rows_for_scoring and len(Xn) > max_rows_for_scoring:
        idx = rng.choice(len(Xn), size=max_rows_for_scoring, replace=False)
        Xs = Xn.iloc[idx].copy()
        ys = (y.iloc[idx] if isinstance(y, pd.Series) else np.asarray(y)[idx]) if y is not None else None
    else:
        Xs, ys = Xn, y

    # Median impute for univariate stats (tree models can handle NaN directly)
    Xs_imp = Xs.copy()
    med = Xs_imp.median()
    Xs_imp = Xs_imp.fillna(med)

    scores = pd.DataFrame(index=Xn.columns)
    is_supervised = y is not None

    if not is_supervised:
        # Unsupervised fallback: rank by (variance * (1 - missing_rate))
        var = Xs.var(ddof=1)
        miss = Xs.isna().mean()
        unsup_score = var * (1 - miss)
        scores["unsup"] = unsup_score.reindex(scores.index).fillna(0.0)
        blended_rank = _rank_from_scores(scores["unsup"])
    else:
        # --- Determine task type
        y_arr = pd.Series(ys).values
        # Encode y for classification if needed
        target_type = type_of_target(y_arr)
        is_class = target_type in ("binary", "multiclass")

        if is_class:
            # Ensure labels are 0..C-1 for MI/F
            le = LabelEncoder()
            y_enc = le.fit_transform(y_arr.astype(str))
            # 2a) Mutual information (classification)
            try:
                mi = mutual_info_classif(Xs_imp.values, y_enc, random_state=random_state, discrete_features=False)
                scores["mi"] = pd.Series(mi, index=Xs_imp.columns)
            except Exception:
                scores["mi"] = 0.0

            # 2b) ANOVA F-score
            try:
                with np.errstate(all="ignore"):
                    f_vals, _ = f_classif(Xs_imp.values, y_enc)
                scores["f"] = pd.Series(np.nan_to_num(f_vals, nan=0.0, neginf=0.0, posinf=np.nanmax(f_vals[np.isfinite(f_vals)]) if np.isfinite(f_vals).any() else 0.0), index=Xs_imp.columns)
            except Exception:
                scores["f"] = 0.0

            # 2c) Tree-based gain importance
            gain = None
            try:
                import lightgbm as lgb
                clf = lgb.LGBMClassifier(
                    n_estimators=lgbm_estimators,
                    learning_rate=0.05,
                    num_leaves=31,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    random_state=random_state,
                    n_jobs=-1
                )
                clf.fit(Xs, y_enc)
                gain = pd.Series(clf.booster_.feature_importance(importance_type="gain"),
                                 index=Xs.columns).replace({np.nan: 0.0})
            except Exception:
                # Fallback: ExtraTrees
                et = ExtraTreesClassifier(
                    n_estimators=400,
                    max_features="sqrt",
                    random_state=random_state,
                    n_jobs=-1
                )
                et.fit(Xs_imp, y_enc)
                gain = pd.Series(et.feature_importances_, index=Xs_imp.columns)

            scores["gain"] = gain.reindex(scores.index).fillna(0.0)

            # Blend ranks (heavier weight on gain/MI)
            r_gain = _rank_from_scores(scores["gain"])
            r_mi   = _rank_from_scores(scores["mi"])
            r_f    = _rank_from_scores(scores["f"])
            blended_rank = 0.5 * r_gain + 0.3 * r_mi + 0.2 * r_f

        else:
            # Regression
            # 2a) Mutual information (regression)
            try:
                mi = mutual_info_regression(Xs_imp.values, y_arr, random_state=random_state)
                scores["mi"] = pd.Series(mi, index=Xs_imp.columns)
            except Exception:
                scores["mi"] = 0.0

            # 2b) Absolute Pearson correlation
            with np.errstate(all="ignore"):
                corr = []
                y_c = pd.Series(y_arr)
                y_c = (y_c - y_c.mean()) / (y_c.std(ddof=0) + 1e-12)
                for c in Xs_imp.columns:
                    xc = Xs_imp[c]
                    xc = (xc - xc.mean()) / (xc.std(ddof=0) + 1e-12)
                    v = float(np.nan_to_num(np.corrcoef(xc, y_c)[0,1], nan=0.0))
                    corr.append(abs(v))
                scores["corr"] = pd.Series(corr, index=Xs_imp.columns)

            # 2c) Tree-based gain importance
            gain = None
            try:
                import lightgbm as lgb
                reg = lgb.LGBMRegressor(
                    n_estimators=lgbm_estimators,
                    learning_rate=0.05,
                    num_leaves=31,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    random_state=random_state,
                    n_jobs=-1
                )
                reg.fit(Xs, y_arr)
                gain = pd.Series(reg.booster_.feature_importance(importance_type="gain"),
                                 index=Xs.columns).replace({np.nan: 0.0})
            except Exception:
                et = ExtraTreesRegressor(
                    n_estimators=400,
                    max_features="sqrt",
                    random_state=random_state,
                    n_jobs=-1
                )
                et.fit(Xs_imp, y_arr)
                gain = pd.Series(et.feature_importances_, index=Xs_imp.columns)

            scores["gain"] = gain.reindex(scores.index).fillna(0.0)

            # Blend ranks
            r_gain = _rank_from_scores(scores["gain"])
            r_mi   = _rank_from_scores(scores["mi"])
            r_cor  = _rank_from_scores(scores["corr"])
            blended_rank = 0.5 * r_gain + 0.3 * r_mi + 0.2 * r_cor

    # Final pick
    blended_rank = blended_rank.reindex(Xn.columns).fillna(0.0)
    top_cols = blended_rank.sort_values(ascending=False).head(k).index.tolist()

    # Return original X with those columns (keep original values, not imputed)
    return X[top_cols], top_cols

# -----------------------
# Example usage:
X, selected_cols = select_top_k_features(X, y, k=500)
print(len(selected_cols), "features kept")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25279
[LightGBM] [Info] Number of data points in the train set: 2662, number of used features: 3349
[LightGBM] [Info] Start training from score 278.263449
500 features kept


In [8]:
X.describe()


Unnamed: 0,BertzCT,HeavyAtomMolWt,ExactMolWt,MolMR,HallKierAlpha,RingCount,RDKIT_1907,TPSA,FractionCSP3,LabuteASA,...,RDKIT_2011,AtomPair_873,FCFP_82,RDKIT_308,MACCS_118,FCFP_532,RDKIT_985,AtomPair_211,RDKIT_50,FCFP_276
count,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0,...,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0,2662.0
mean,180.741676,146.083347,157.647182,43.112108,-0.596461,0.793764,0.592036,19.636322,0.520941,65.133909,...,0.021412,0.101427,0.038693,0.246431,0.330954,0.128099,0.094666,0.089406,0.046582,0.137866
std,153.001743,65.791649,69.088762,19.634082,0.771922,0.854235,0.491549,19.639709,0.391281,27.35587,...,0.144782,0.52039,0.192898,0.431014,0.470645,0.334263,0.292808,0.424149,0.21078,0.344824
min,0.0,26.018,31.042199,2.216,-3.703117,0.0,0.0,0.0,0.0,10.565437,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59.764839,104.064,114.104465,31.1857,-1.06,0.0,0.0,0.0,0.142857,48.212521,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,150.869138,134.117,145.939152,40.6006,-0.53,1.0,1.0,17.82,0.5,61.320463,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,253.11838,174.138,184.984011,50.009,-0.04,1.0,1.0,29.54,1.0,74.706144,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1211.216172,959.171,949.178286,242.734,3.04,7.0,1.0,157.11,1.0,354.811559,...,1.0,8.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,1.0


In [9]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import *
from sklearn.metrics import mean_absolute_error
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state = 42, train_size = 0.8)
model = CatBoostRegressor(random_state = 42, verbose = 150, iterations=1200)
model.fit(Xtrain, np.log1p(ytrain))
print(mean_absolute_error(np.expm1(model.predict(Xtest)), ytest))


Learning rate set to 0.039779
0:	learn: 0.3017215	total: 81.9ms	remaining: 1m 38s
150:	learn: 0.1377253	total: 3.28s	remaining: 22.8s
300:	learn: 0.1108925	total: 6.47s	remaining: 19.3s
450:	learn: 0.0919982	total: 9.72s	remaining: 16.1s
600:	learn: 0.0788570	total: 12.9s	remaining: 12.9s
750:	learn: 0.0692407	total: 16.1s	remaining: 9.64s
900:	learn: 0.0610666	total: 19.4s	remaining: 6.43s
1050:	learn: 0.0542053	total: 22.6s	remaining: 3.2s
1199:	learn: 0.0487784	total: 25.8s	remaining: 0us
27.31342415276645


In [10]:
model.fit(X, np.log1p(y))
sub['Tm'] = np.expm1(model.predict(test))
# a, b, 

Learning rate set to 0.041209
0:	learn: 0.3029146	total: 28.2ms	remaining: 33.8s
150:	learn: 0.1395453	total: 3.41s	remaining: 23.7s
300:	learn: 0.1141608	total: 6.7s	remaining: 20s
450:	learn: 0.0965970	total: 9.99s	remaining: 16.6s
600:	learn: 0.0846835	total: 13.4s	remaining: 13.3s
750:	learn: 0.0752226	total: 16.7s	remaining: 9.96s
900:	learn: 0.0678544	total: 20s	remaining: 6.63s
1050:	learn: 0.0612286	total: 23.3s	remaining: 3.31s
1199:	learn: 0.0557397	total: 26.7s	remaining: 0us


In [11]:
sub.to_csv('submission.csv', index=False)

In [12]:
sub

Unnamed: 0,id,Tm
0,1022,374.602810
1,1146,336.064838
2,79,194.396748
3,2279,197.452919
4,1342,232.975130
...,...,...
661,2663,284.085075
662,624,306.719453
663,2655,165.037372
664,2089,265.984995
