In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
def reduce_mem_usage(df):
    """Shrink a dataframe's memory footprint by narrowing column dtypes.

    Integer columns (dtype names starting with ``int``) are downcast to the
    narrowest signed integer type that can hold their observed min and max.
    Object columns are converted to ``category``. Every other dtype (floats,
    booleans, datetimes, existing categoricals, ...) is left untouched.

    The frame is modified in place and also returned, and the memory usage
    before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with narrowed dtypes.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate integer widths, narrowest first. Integer data must stay integer:
    # casting it to float16/float32 silently loses precision for larger values.
    int_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
        elif str(col_type)[:3] == 'int':
            # min/max are only needed (and only safe) for integer columns.
            c_min = df[col].min()
            c_max = df[col].max()
            for int_type in int_types:
                limits = np.iinfo(int_type)
                # Inclusive bounds: a column whose max is exactly 127 still fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        # Guard the percentage against a zero-byte frame.
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def import_data(file):
    """Load a CSV indexed by its ``id`` column and return it after dtype reduction.

    The raw frame is read with ``parse_dates=True`` and ``keep_date_col=True``
    and then passed through ``reduce_mem_usage``.
    """
    raw_frame = pd.read_csv(
        file,
        index_col='id',
        parse_dates=True,
        keep_date_col=True,
    )
    return reduce_mem_usage(raw_frame)

In [None]:
# Load the train and test tables through import_data (train first, then test).
train, test = (
    import_data(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
#train = pd.read_csv('../input/train.csv', index_col='id')
#test = pd.read_csv('../input/test.csv', index_col='id')

In [None]:
# Report the (rows, columns) of each loaded table.
print(train.shape)
print(test.shape)

In [None]:
# Preview the first rows of the training table.
display(train.head())

In [None]:
# Preview the first rows of the test table.
display(test.head())

In [None]:
def import_data(file, nrows=100_000):
    """Read the first ``nrows`` rows of a CSV and reduce its memory footprint.

    This cell previously defined ``import_data`` three times in a row; only the
    last definition took effect (no index column, ``nrows=100_000``). The single
    definition below reproduces that effective behavior and exposes the row
    limit as a parameter. It replaces the earlier ``import_data`` that set
    ``index_col='id'``.

    Parameters
    ----------
    file : str or file-like
        Path (or buffer) handed to ``pandas.read_csv``.
    nrows : int or None, default 100_000
        Maximum number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage``.
    """
    # NOTE(review): the 100,000-row default silently truncates larger files.
    # If the file read here is later used as a lookup table in a left merge,
    # rows beyond the limit will come back as NaN -- confirm the cap is intended.
    df = pd.read_csv(file, nrows=nrows, parse_dates=True, keep_date_col=True)
    return reduce_mem_usage(df)

In [None]:
# Load the per-atom structures table. By this point import_data is the last of
# the definitions in the preceding cell: no index column, first 100,000 rows only.
structures = import_data('../input/structures.csv')#, nrows=20_000)
display(structures.head())

In [None]:
# (rows, columns) of the structures table as loaded above.
structures.shape

In [None]:
# Map the atom structure data into train and test files

def map_atom_info(df, atom_idx):
    """Attach the structure columns of one atom of each pair to ``df``.

    Left-joins the module-level ``structures`` table on
    (``molecule_name``, ``atom_index_<atom_idx>``), drops the joined
    ``atom_index`` key and suffixes ``atom``, ``x``, ``y`` and ``z`` with
    ``_<atom_idx>``. Returns the merged frame; ``df`` itself is not modified.
    """
    suffixed = {name: f'{name}_{atom_idx}' for name in ('atom', 'x', 'y', 'z')}
    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )
    return merged.drop(columns='atom_index').rename(columns=suffixed)

# Attach atom 0 first, then atom 1, to both tables.
train = map_atom_info(map_atom_info(train, 0), 1)

test = map_atom_info(map_atom_info(test, 0), 1)

In [None]:
#train.isnull().sum()

In [None]:
#test.isnull().sum()

In [None]:
%%time
# Engineer a single feature: distance vector between atoms
#  (there's ways to speed this up!)

def dist(row):
    """Euclidean distance between the (x_0, y_0, z_0) and (x_1, y_1, z_1) points.

    ``row`` only needs to support ``row['x_0']``-style lookups whose results
    support subtraction and powers.
    """
    squared = [(row[f'{axis}_1'] - row[f'{axis}_0']) ** 2 for axis in ('x', 'y', 'z')]
    return (squared[0] + squared[1] + squared[2]) ** 0.5

# dist() is plain column arithmetic, so it can be evaluated on the whole frame
# at once. The previous row-by-row .apply(..., axis=1) produced the same values
# but took about 16 minutes according to the author's note.
train['dist'] = dist(train)
test['dist'] = dist(test)

In [None]:
#train.head()

In [None]:
#test.head()

# Pairwise coordinate products between atom 0 and atom 1.
# (A squared-distance 'length' feature, dist ** 2, was tried and left disabled.)
def _add_coordinate_products(frame):
    """Add the nine columns '<A><B>' = <a>_0 * <b>_1 for a, b in {x, y, z}, in place.

    Columns are created in the order XX, YY, ZZ, XY, XZ, YX, YZ, ZX, ZY.
    """
    for name in ('XX', 'YY', 'ZZ', 'XY', 'XZ', 'YX', 'YZ', 'ZX', 'ZY'):
        first, second = name.lower()
        frame[name] = frame[f'{first}_0'] * frame[f'{second}_1']


_add_coordinate_products(train)
_add_coordinate_products(test)

%%time
# Label Encoding (train only; the matching transform of `test` is disabled below).
# NOTE(review): a later cell label-encodes 'type', 'atom_0' and 'atom_1' again and
# also transforms `test`; confirm that only one of the two encodings is meant to run.
for f in ['molecule_name','type', 'atom_0', 'atom_1']:
    lbl = LabelEncoder()
    # Fit and encode in one pass. The encoder used to be fitted on the train
    # values concatenated with themselves, which learns the same classes at
    # twice the cost.
    train[f] = lbl.fit_transform(list(train[f].values))
   # test[f] = lbl.transform(list(test[f].values))

In [None]:
#train.head()

In [None]:
#X=train.drop(['scalar_coupling_constant'], axis=1)
#X.head(1)

In [None]:
#y=train.scalar_coupling_constant
#y.head(1)

In [None]:
# Split into validation and training data.
# The split was previously commented out, which left train_X / val_X / train_y /
# val_y undefined for every fit in this section.
# NOTE(review): assumes every remaining column of `train` is numeric at this
# point (i.e. the label-encoding step above has run) -- confirm.
train_X, val_X, train_y, val_y = train_test_split(
    train.drop(['scalar_coupling_constant'], axis=1),
    train['scalar_coupling_constant'],
    random_state=1,
)

# GradientBoostingRegressor settings to compare, in run order. All runs share
# random_state=0 and min_samples_leaf=1.
# These used to be nine copy-pasted cells, each headed by a `%%time` cell magic
# (only valid as the first line of a cell). The last two were identical
# (n_estimators=300), so that configuration is fitted once here.
# Earlier settings noted by the author (kept for reference):
#   max_depth=5, n_estimators=500, learning_rate=0.1,  max_leaf_nodes=5
#   max_depth=3, n_estimators=100/110/115/120/125, learning_rate=0.08, max_leaf_nodes=5
gbr_experiments = [
    dict(max_depth=5, n_estimators=500, learning_rate=0.1, max_leaf_nodes=None),
    dict(max_depth=3, n_estimators=130, learning_rate=0.08, max_leaf_nodes=5),
    dict(max_depth=3, n_estimators=135, learning_rate=0.08, max_leaf_nodes=5),
    dict(max_depth=3, n_estimators=140, learning_rate=0.08, max_leaf_nodes=5),
    dict(max_depth=3, n_estimators=145, learning_rate=0.08, max_leaf_nodes=5),
    dict(max_depth=3, n_estimators=150, learning_rate=0.08, max_leaf_nodes=5),
    dict(max_depth=3, n_estimators=200, learning_rate=0.08, max_leaf_nodes=5),
    dict(max_depth=3, n_estimators=300, learning_rate=0.08, max_leaf_nodes=5),
]

for gbr_params in gbr_experiments:
    # Specify Model
    iowa_model = GradientBoostingRegressor(random_state=0, min_samples_leaf=1, **gbr_params)

    # Fit Model
    iowa_model.fit(train_X, train_y)

    # Make validation predictions and calculate mean absolute error
    val_predictions = iowa_model.predict(val_X)
    val_mae = mean_absolute_error(val_y, val_predictions)
    # Print the actual settings and keep decimals: the old '{:,.0f}' rounded the
    # MAE to a whole number and the label claimed max_leaf_nodes was unset.
    print("Validation MAE for {}: {:,.4f}".format(gbr_params, val_mae))

 # max_leaf_Nodes

# 1).

# Sweep max_leaf_nodes and report the validation MAE for each candidate.
# This used to be four copies of the same function and loop, separated by
# mid-cell `%%time` magics (cell magics are only valid on a cell's first line).
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Validation MAE of a GradientBoostingRegressor with the given max_leaf_nodes.

    The model uses random_state=1 and library defaults for everything else. It
    is fitted on (train_X, train_y) and scored on (val_X, val_y).
    """
    model = GradientBoostingRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)


candidate_max_leaf_nodes = [5, 25, 50, 75, 100,
                            125, 150, 175, 200, 225,
                            250, 275, 300, 325, 350,
                            375, 400, 425, 450, 475, 500]
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # '%.4f' instead of '%d': truncating the MAE to an integer made neighbouring
    # candidates indistinguishable.
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %.4f" %(max_leaf_nodes, my_mae))

# min_samples_leaf

#1).
# Sweep min_samples_leaf and report the validation MAE for each candidate.
# This used to be four copies of the same function and loop, each headed by a
# `%%time` magic that was not on the first line of its cell.
def get_mae(min_samples_leaf, train_X, val_X, train_y, val_y):
    """Validation MAE of a GradientBoostingRegressor with the given min_samples_leaf.

    The model uses random_state=1 and library defaults for everything else. It
    is fitted on (train_X, train_y) and scored on (val_X, val_y).
    """
    model = GradientBoostingRegressor(min_samples_leaf=min_samples_leaf, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)


candidate_min_samples_leaf = [1, 2, 3, 4, 5,
                              6, 7, 8, 9, 10,
                              15, 25, 50, 75,
                              100, 150, 200, 300, 400]
for min_samples_leaf in candidate_min_samples_leaf:
    my_mae = get_mae(min_samples_leaf, train_X, val_X, train_y, val_y)
    # '%.4f' instead of '%d': truncating the MAE to an integer made neighbouring
    # candidates indistinguishable.
    print("min_samples_leaf: %d  \t\t Mean Absolute Error:  %.4f" %(min_samples_leaf, my_mae))

# n_estimators

#2).
# Sweep n_estimators and report the validation MAE for each candidate.
# This used to be five copies of the same function and loop, most headed by a
# `%%time` magic that was not on the first line of its cell.
def get_mae(n_estimators, train_X, val_X, train_y, val_y):
    """Validation MAE of a GradientBoostingRegressor with the given n_estimators.

    The model uses random_state=1 and library defaults for everything else. It
    is fitted on (train_X, train_y) and scored on (val_X, val_y).
    """
    model = GradientBoostingRegressor(n_estimators=n_estimators, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)


# Candidates in the original run order. 200 appeared in both the first and the
# last batch; with a fixed random_state the second fit is redundant, so it is
# listed once.
find_n_estimators = [100, 200, 300, 400, 500,
                     110, 115, 120, 125,
                     130, 135, 140, 145, 150,
                     155, 160, 165, 170, 175,
                     180, 185, 190, 195]
for n_estimators in find_n_estimators:
    my_mae = get_mae(n_estimators, train_X, val_X, train_y, val_y)
    # '%.4f' instead of '%d': truncating the MAE to an integer made neighbouring
    # candidates indistinguishable.
    print("n_estimators: %d  \t\t Mean Absolute Error:  %.4f" %(n_estimators, my_mae))

# learning_rate

def get_mae(learning_rate, train_X, val_X, train_y, val_y):
    """Validation MAE of a GradientBoostingRegressor with the given learning_rate.

    The model uses random_state=1 and library defaults for everything else. It
    is fitted on (train_X, train_y) and scored on (val_X, val_y).
    """
    model = GradientBoostingRegressor(learning_rate=learning_rate, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)


# Author's notes from earlier runs, kept verbatim:
# 1. 9=0.01,2. 4=0.02,3. 3=0.03, 4. 2=0.04, 5. 2=0.05, 6. 2=0.06, 7. 2=0.07
# NOTE(review): these look like integer-truncated MAEs per learning rate -- confirm.
find_learning_rate = [0.08, 0.09, 0.1, 0.2,
                      0.3, 0.4, 0.5, 0.6]

# Sweep learning_rate and report the validation MAE for each candidate.
for learning_rate in find_learning_rate:
    my_mae = get_mae(learning_rate, train_X, val_X, train_y, val_y)
    # '%d' printed every learning rate as 0 and truncated the MAE to an integer;
    # '%g' / '%.4f' show the real values.
    print("learning_rate: %g  \t\t Mean Absolute Error:  %.4f" %(learning_rate, my_mae))

# max_depth

def get_mae(max_depth, train_X, val_X, train_y, val_y):
    """Validation MAE of a GradientBoostingRegressor with the given max_depth.

    The model uses random_state=1 and library defaults for everything else. It
    is fitted on (train_X, train_y) and scored on (val_X, val_y).
    """
    model = GradientBoostingRegressor(max_depth=max_depth, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)


find_max_depth =[3,4,5] #[2,2,2]
# 1. 9=0.01,2. 4=0.02,3. 3=0.03, 4. 2=0.04, 5. 2=0.05, 6. 2=0.06, 7. 2=0.07
# NOTE(review): the line above looks copied from the learning_rate sweep and may
# not describe max_depth results -- confirm or remove.

# Sweep max_depth and report the validation MAE for each candidate.
for max_depth in find_max_depth:
    my_mae = get_mae(max_depth, train_X, val_X, train_y, val_y)
    # '%.4f' instead of '%d': truncating the MAE to an integer made neighbouring
    # candidates indistinguishable.
    print("max_depth: %d  \t\t Mean Absolute Error:  %.4f" %(max_depth, my_mae))

In [None]:
# Keep the molecule names aside as cross-validation groups; they are not features.
molecules = train.pop('molecule_name')
test = test.drop('molecule_name', axis=1)

# Regression target.
y = train.pop('scalar_coupling_constant')

# Label Encoding
# NOTE(review): an earlier section also label-encodes these columns of `train`.
# If it has been run, `train` already holds integer codes while `test` still
# holds raw labels -- confirm that only one of the two encodings is executed.
for f in ['type', 'atom_0', 'atom_1']:
    lbl = LabelEncoder()
    # Fit on train AND test so a label that only occurs in test cannot make
    # transform() fail (the encoder used to be fitted on train twice).
    lbl.fit(list(train[f].values) + list(test[f].values))
    train[f] = lbl.transform(list(train[f].values))
    test[f] = lbl.transform(list(test[f].values))

# Check the run time, below cell

In [None]:
%%time
# Out-of-fold predictions for train, and fold-averaged predictions for test.
yoof = np.zeros(len(train))
yhat = np.zeros(len(test))

n_splits = 3
gkf = GroupKFold(n_splits=n_splits) # we're going to split folds by molecules

# Convert to arrays once instead of on every fold. Predicting on the array form
# of `test` also matches how the model is fitted (on arrays, not DataFrames).
train_values = train.values
y_values = y.values
test_values = test.values

for fold, (train_index, test_index) in enumerate(gkf.split(train, y, groups=molecules), start=1):
    print(f'fold {fold} of {n_splits}')
    X_train, X_test = train_values[train_index], train_values[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]
    reg = GradientBoostingRegressor(n_estimators=600, learning_rate=0.1 ,
                                max_depth=5,
                                min_samples_leaf=1,
                                random_state=0)
    reg.fit(X_train, y_train)
    # Each train row is predicted by the one model that did not see its molecule.
    yoof[test_index] = reg.predict(X_test)
    yhat += reg.predict(test_values)

# Average the per-fold test predictions.
yhat /= n_splits

# Try different numbers of n_estimators - this will take a minute or so
# The loop used X_in / y_in / X_oof / y_oof, which are never defined in this
# notebook. It now uses the split left over from the last cross-validation fold.
# NOTE(review): presumably that is what was intended -- confirm.
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    reg.set_params(n_estimators=int(n))
    reg.fit(X_train, y_train)
    scores.append(reg.score(X_test, y_test))
plt.title("Effect of n_estimators")
plt.xlabel("n_estimator")
plt.ylabel("score")
plt.plot(estimators, scores)

from sklearn.metrics import mean_absolute_error
# Score the out-of-fold predictions against the training target. The previous
# call compared yoof (one value per train row) with yhat (one value per test
# row), which are different rows and different lengths.
score = mean_absolute_error(y, yoof)
print(f'Score: {score:0.3f}')

In [None]:
# Build the submission: the sample file's ids with our averaged test predictions.
sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='id')

benchmark = sample_submission.assign(scalar_coupling_constant=yhat)
benchmark.to_csv('atomic_distance_benchmark.csv')

In [None]:
# Collect the target, out-of-fold prediction and original coupling type per row.
plot_data = pd.DataFrame(y)
plot_data.index.name = 'id'
plot_data['yhat'] = yoof
# The type labels are re-read from the CSV and assigned as a one-column frame.
type_labels = pd.read_csv('../input/train.csv', index_col='id', usecols=['id', 'type'])
plot_data['type'] = type_labels

def plot_oof_preds(ctype, llim, ulim):
    """Scatter predicted vs. actual coupling constants for one coupling type.

    Rows of the module-level ``plot_data`` whose ``type`` equals ``ctype`` are
    plotted on a 6x6 figure. Both axes are limited to ``(llim, ulim)`` and a
    line from (llim, llim) to (ulim, ulim) is drawn over the points.
    """
    axis_limits = (llim, ulim)
    subset = plot_data.loc[plot_data['type'] == ctype,
                           ['scalar_coupling_constant', 'yhat']]

    plt.figure(figsize=(6,6))
    sns.scatterplot(x='scalar_coupling_constant', y='yhat', data=subset);
    plt.xlim(axis_limits)
    plt.ylim(axis_limits)
    plt.plot([llim, ulim], [llim, ulim])
    plt.xlabel('scalar_coupling_constant')
    plt.ylabel('predicted')
    plt.title(f'{ctype}', fontsize=18)
    plt.show()

# One out-of-fold scatter plot per coupling type, each with its own axis range.
for coupling_type, lower, upper in [
    ('1JHC', 0, 250),
    ('1JHN', 0, 100),
    ('2JHC', -50, 50),
    ('2JHH', -50, 50),
    ('2JHN', -25, 25),
    ('3JHC', -25, 100),
    ('3JHH', -20, 20),
    ('3JHN', -15, 15),
]:
    plot_oof_preds(coupling_type, lower, upper)