In [235]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

## Model train

In [236]:
df = pd.read_csv('clean_train_data.csv')

In [237]:
df['group2'].unique()

array([ 5,  9, 34, 28, 51, 50, 57], dtype=int64)

In [238]:
train = df.query('group1 in [5, 9, 34, 28, 51]')
validate = df.query('group1 == 50')
test = df.query('group1 == 57')

In [239]:
features = ['position1', 'pH1', 'change1', 'change2']
cat_features = ['change1', 'change2']

In [240]:
for i in cat_features:
        train[i] = pd.Categorical(train[i])
        validate[i] = pd.Categorical(validate[i])  
        test[i] = pd.Categorical(test[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[i] = pd.Categorical(train[i])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validate[i] = pd.Categorical(validate[i])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[i] = pd.Categorical(test[i])


In [241]:
lgb_train = lgb.Dataset(train[features], label=train.target)
lgb_eval = lgb.Dataset(validate[features], label=validate.target)

params = {
        'boosting_type': 'gbdt',
        'objective': 'mse',
        'metric': 'l2',
        'learning_rate': 0.1,
        'verbose': -1,
        }
model = lgb.train(params,
                  lgb_train,
                  num_boost_round=1000,
                  valid_sets=lgb_eval,
                  early_stopping_rounds=20,
                  verbose_eval=False
                 )





In [242]:
model.best_score

defaultdict(collections.OrderedDict,
            {'valid_0': OrderedDict([('l1', 4.155891763340033)])})

In [243]:
test['predict'] = model.predict(test[features])
test['error'] = test['predict'] - test['target']
print(test['error'].mean())

-0.03152180436075196


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predict'] = model.predict(test[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['error'] = test['predict'] - test['target']


## Creating submission

In [244]:
test_data = pd.read_csv('test.csv')

In [245]:
base = 'VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK'

In [246]:
def compare(base, y):
    for idx, [c1, c2] in enumerate(zip(base, y)):
        if c1 != c2:
            return int(idx), c1, c2
    return None, None, None

In [247]:
position = []
change1 = []
change2 = []


for row in test_data.iterrows():
    row_clean = row[1]
    if len(base) == len(row_clean['protein_sequence']):
        pos, ch1, ch2 = compare(base, row_clean['protein_sequence'])
        position.append(pos)
        change1.append(ch1)
        change2.append(ch2)
    else:
        position.append(None)
        change1.append(None)
        change2.append(None)

test_data['position1'] = position
test_data['change1'] = change1
test_data['change2'] = change2

In [249]:
for i in cat_features:
    test_data[i] = pd.Categorical(test_data[i])
    
test_data.rename(columns={'pH': 'pH1'}, inplace=True)
test_data['tm'] = None
test_data.loc[~test_data['change1'].isna(), 'tm'] = model.predict(test_data[features].dropna())
test_data.loc[test_data['change1'].isna(), 'tm'] = 0

In [250]:
test_data[['seq_id', 'tm']].to_csv('submission.csv', index=False) 