In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import lightgbm as lgbm
import optuna
from catboost import CatBoostClassifier
from tensorflow import keras
from keras import layers
from keras import callbacks

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data = pd.read_csv("train.csv")

train_data.head()

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,...,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,...,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,...,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2,1
3,3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,...,0.572594,-1.653213,1.686035,-2.533098,-0.608601,BDBBAACBCB,210.826205,0,0,1
4,4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,...,-3.912929,-1.430366,2.127649,-3.306784,4.371371,BDBCBBCHFE,-217.211798,0,1,1


In [3]:
X = train_data.drop(columns=["target", "id"])
y = train_data["target"]

In [4]:
for i in range(10):
  X[f"l_0{i}"] = X["f_27"].str.get(i).apply(ord) - ord('A')

X.head()

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,l_00,l_01,l_02,l_03,l_04,l_05,l_06,l_07,l_08,l_09
0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,1,...,0,1,0,1,3,0,3,1,0,1
1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,...,0,2,0,2,2,0,3,2,4,1
2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,2,...,0,0,0,4,0,1,2,10,0,3
3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,1,...,1,3,1,1,0,0,2,1,2,1
4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,0,...,1,3,1,2,1,1,2,7,5,4


In [5]:
X["unique_chars"] = X["f_27"].apply(lambda s: len(set(s)))
X.drop(columns="f_27", inplace=True)

X.head()

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,l_01,l_02,l_03,l_04,l_05,l_06,l_07,l_08,l_09,unique_chars
0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,1,...,1,0,1,3,0,3,1,0,1,3
1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,...,2,0,2,2,0,3,2,4,1,5
2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,2,...,0,0,4,0,1,2,10,0,3,6
3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,1,...,3,1,1,0,0,2,1,2,1,4
4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,0,...,3,1,2,1,1,2,7,5,4,6


In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

In [7]:
#model = CatBoostClassifier()
#model.fit(X_train, y_train)

In [8]:
#predictions_temp = model.predict(X_valid, prediction_type="Probability")
#predictions = []
#for i in range(len(predictions_temp)):
#  predictions.append(predictions_temp[i][1])

#predictions

In [9]:
#roc_auc_score(y_valid, predictions)

In [10]:
model_lgbm = lgbm.LGBMClassifier(n_estimators = 5000, learning_rate = 0.1, random_state=0, min_child_samples=90, num_leaves=150, max_bins=511, n_jobs=-1)
#model_lgbm.fit(X_train, y_train)
model_lgbm.fit(X, y)

In [11]:
predictions = model_lgbm.predict_proba(X_valid)

predictions

array([[1.60514015e-04, 9.99839486e-01],
       [9.93221452e-01, 6.77854844e-03],
       [5.91382249e-06, 9.99994086e-01],
       ...,
       [1.29949580e-05, 9.99987005e-01],
       [9.99991042e-01, 8.95809987e-06],
       [4.06488050e-04, 9.99593512e-01]])

In [12]:
predictions_1 = [pred[1] for pred in predictions]

predictions_1

[0.9998394859849735,
 0.006778548438294273,
 0.9999940861775055,
 0.9904294564242829,
 0.0014362284704047723,
 0.9999996364641852,
 0.00013006563016660985,
 6.258195488034751e-05,
 0.0030922556176354974,
 1.0773014527865042e-05,
 6.370655429771682e-05,
 0.985381758239389,
 0.0007341249683696293,
 0.9999999999904323,
 8.564458007351254e-06,
 0.0001236047362273618,
 0.9998740079827512,
 0.011615837634132353,
 0.9996512779877248,
 4.45899839144058e-05,
 0.9989865910059395,
 0.9939342926842142,
 0.9999710617952464,
 0.9990511473992059,
 0.0003108948386396418,
 0.9999997133741398,
 0.9999989619978414,
 9.68985045388825e-05,
 0.005202906550888403,
 0.9984992852890798,
 0.9981031956285853,
 0.01182794432030886,
 0.9982536470951919,
 0.9999999996989328,
 0.9989480366220157,
 0.9991165332684376,
 0.9901469078909692,
 0.9999994200064872,
 3.979590939224816e-05,
 3.535111648596892e-11,
 0.0031925037708552427,
 0.9753760876373403,
 0.999971441215935,
 0.0045938948038459475,
 0.9992440099737031,
 0

In [13]:
roc_auc_score(y_valid, predictions_1)

1.0

In [20]:
def make_submission(model, test_data):
  predictions = model.predict_proba(test_data)
  predictions_1 = [pred[1] for pred in predictions]
  #final_predictions = []
  #for i in range(len(predictions)):
  #  if predictions[i][0] < 0.5:
  #    final_predictions.append(0)
  #  else:
  #    final_predictions.append(1)
  predictions_df = pd.DataFrame(data={"id": range(900000, 900000 + len(test_data)), "target": predictions_1})
  predictions_df.to_csv("submission.csv", index=False)

test_data = pd.read_csv("test.csv")

In [21]:
test_data.drop(columns="id", inplace=True)

for i in range(10):
  test_data[f"l_0{i}"] = test_data["f_27"].str.get(i).apply(ord) - ord('A')
test_data["unique_chars"] = test_data["f_27"].apply(lambda s: len(set(s)))
test_data.drop(columns="f_27", inplace=True)

In [22]:
make_submission(model_lgbm, test_data)