In [1]:
import lightgbm
import numpy
import pandas
import random

In [4]:
def statfeature(table, key, statdict, pre=""):
   """Aggregate ``table`` per group and return the result with flat column names.

   Parameters
   ----------
   table : pandas.DataFrame
      Frame to group.
   key : str or list of str
      Column name(s) to group by; a single name is wrapped into a list.
   statdict : dict
      Maps a column name to one aggregation or a list of aggregations.
      Each aggregation is either a string name or a callable.
   pre : str, optional
      Prefix prepended to every generated column name.

   Returns
   -------
   pandas.DataFrame
      Indexed by the group key(s). Every column is named
      ``"<pre><joined keys> and <column><aggregation name>"``.
   """
   group_keys = key if isinstance(key, list) else [key]
   aggregated = table.groupby(group_keys).aggregate(statdict)

   # Build one flat name per (column, aggregation) pair, in the same order
   # in which the aggregation emits its columns.
   key_part = "".join(group_keys)
   flat_names = []
   for column, funcs in statdict.items():
      func_list = funcs if isinstance(funcs, list) else [funcs]
      for func in func_list:
         func_name = func if isinstance(func, str) else func.__name__
         flat_names.append("%s%s and %s%s" % (pre, key_part, column, func_name))

   aggregated.columns = flat_names
   return aggregated


# Expose the helper as a DataFrame method so it can be called as frame.statfeature(...).
pandas.DataFrame.statfeature = statfeature

# Labelled training rows. Each row receives a synthetic negative id
# (-1, -2, ..., -len) that later serves as the join key for its features.
train_table = pandas.read_csv("final_dataset_train.tsv", sep="\t").assign(
   id=lambda frame: [-(position + 1) for position in range(len(frame))]
)

# Test rows. delta_g is filled with the constant -1 so the test frame has
# the same delta_g column as the training frame.
test_table = pandas.read_csv("final_dataset_testA.tsv", sep="\t").assign(delta_g=-1)

def build_base_feature_table(table, sequence_columns=None, alphabet=None):
   """Build per-row sequence composition features keyed by ``id``.

   For every sequence column the result contains the sequence length plus
   the number of occurrences of every 1-, 2- and 3-letter combination of
   ``alphabet``. Counts come from ``Series.str.count``, i.e. they are
   non-overlapping matches of the combination used as a pattern.

   All feature columns are collected first and joined with a single
   ``pandas.concat``. Inserting tens of thousands of columns one at a time
   fragments the frame and triggers pandas' PerformanceWarning.

   Parameters
   ----------
   table : pandas.DataFrame
      Must contain ``id`` and every column listed in ``sequence_columns``.
   sequence_columns : list of str, optional
      Defaults to ``["antibody_seq_a", "antibody_seq_b", "antigen_seq"]``.
   alphabet : list of str, optional
      Defaults to the 26 upper-case letters ``A``-``Z``.

   Returns
   -------
   pandas.DataFrame
      ``id`` followed by the feature columns; the sequence columns
      themselves are not included.
   """
   if sequence_columns is None:
      sequence_columns = ["antibody_seq_a", "antibody_seq_b", "antigen_seq"]
   if alphabet is None:
      alphabet = [chr(65 + x) for x in range(26)]

   names = ["id"]
   columns = [table["id"]]

   def add(name, values):
      names.append(name)
      columns.append(values)

   # Column order matters downstream (features are selected by position),
   # so the nesting below keeps the interleaved order
   # length, A, AA, AAA, AAB, ..., AB, ABA, ...
   for seq in sequence_columns:
      text = table[seq].str
      add("%s length" % seq, text.len())
      for first in alphabet:
         add("%s_%s" % (seq, first), text.count(first))
         for second in alphabet:
            pair = first + second
            add("%s_%s" % (seq, pair), text.count(pair))
            for third in alphabet:
               triple = pair + third
               add("%s_%s" % (seq, triple), text.count(triple))

   return pandas.concat(columns, axis=1, keys=names)


train_base_feature_table = build_base_feature_table(train_table)

test_base_feature_table = build_base_feature_table(test_table)



def extract_table(table, base_feature_table, feature_table):
   """Assemble the model input frame for the rows in ``table``.

   Steps:
   1. Left-join the per-row base features on ``id``.
   2. For each of the three sequence columns, left-join the mean / median /
      min / max of ``delta_g`` computed per sequence value over
      ``feature_table``.
   3. Drop ``pdb`` and the raw sequence columns.
   4. Add ``label`` as the rank of ``delta_g`` within ``table``.

   Parameters
   ----------
   table : pandas.DataFrame
      Rows to build features for.
   base_feature_table : pandas.DataFrame
      Features keyed by ``id``.
   feature_table : pandas.DataFrame
      Rows whose ``delta_g`` values feed the per-sequence statistics.

   Returns
   -------
   pandas.DataFrame
      Columns ordered as ``id``, ``delta_g``, ``label``, then all features.
   """
   sequence_columns = ["antibody_seq_a", "antibody_seq_b", "antigen_seq"]
   leading_columns = ["id", "delta_g", "label"]

   merged = table.merge(base_feature_table, on="id", how="left")
   for seq in sequence_columns:
      stats = feature_table.statfeature(seq, {"delta_g": ["mean", "median", "min", "max"]})
      merged = merged.merge(stats.reset_index(), on=seq, how="left")
   merged = merged.drop(["pdb"] + sequence_columns, axis=1)

   merged["label"] = merged.delta_g.rank()

   # Put the bookkeeping columns first so features start at position 3.
   trailing_columns = [name for name in merged.columns if name not in leading_columns]
   return merged.loc[:, leading_columns + trailing_columns]

# Out-of-fold feature construction: the rows of each fold get their
# per-sequence delta_g statistics from the other folds only, so a row's own
# target never feeds its own features.
foldN = 6
FOLD_SEED = 42  # fixed so the fold assignment is reproducible across runs

# Shuffled row positions. The folds below are strides over this permutation;
# indexing by the raw position instead would ignore the shuffle entirely.
index = random.Random(FOLD_SEED).sample(range(len(train_table)), len(train_table))

fold_tables = []
for y in range(foldN):
   label_rows = index[y::foldN]
   feature_rows = [index[x] for x in range(len(index)) if x % foldN != y]

   y_label_table = train_table.iloc[label_rows].reset_index(drop=True)
   y_feature_table = train_table.iloc[feature_rows].reset_index(drop=True)

   y_data_table = extract_table(y_label_table, train_base_feature_table, y_feature_table)
   fold_tables.append(y_data_table)

# Concatenate once instead of growing the frame inside the loop.
train_data_table = pandas.concat(fold_tables, ignore_index=True)

# The first three columns are id, delta_g and label; everything after them
# is a feature. The model is fitted on the rank-based label column.
train_features = train_data_table.iloc[:, 3:]
train_labels = train_data_table.label

# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is non-zero; as written no bagging appears to
# happen - confirm whether that is intended.
lightgbm_params = {
   "objective": "regression",
   "learning_rate": 0.05,
   "max_depth": 6,
   "num_leaves": 32,
   "bagging_fraction": 0.7,
   "feature_fraction": 0.7,
   "num_threads": 64,
   "verbose": -1,
}

lightmodel = lightgbm.train(
   params=lightgbm_params,
   train_set=lightgbm.Dataset(train_features, label=train_labels),
   num_boost_round=2048,
)


# Test features use per-sequence statistics computed over the full training table.
test_data_table = extract_table(test_table, test_base_feature_table, train_table)

# Columns 0-2 are id, delta_g and label; the features start at position 3.
test_features = test_data_table.iloc[:, 3:]

# The model was fitted on the rank-based label, so the values written to the
# delta_g column are predictions on that rank scale.
predcit_table = pandas.DataFrame({
   "id": test_data_table["id"],
   "delta_g": lightmodel.predict(test_features),
})

predcit_table.to_csv("result.csv", index=False)