In [1]:
# importing library
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import xgboost
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score,balanced_accuracy_score

import hyperopt
from hyperopt import hp

# Loading Dataset

In [2]:
# Read the raw training table and discard the columns marked as incomplete.
df_1 = pd.read_csv('./train.csv').drop(columns=['s53', 's54', 's55', 's56', 's57', 's59'])

# Column names are captured here because the ASCII-conversion loop iterates over them.
column_list = df_1.columns

# In 's52', the letters 'l' and 'o' are rewritten to the digit characters '1' and '0'.
df_1['s52'] = df_1['s52'].replace({'l': '1', 'o': '0'})


# Converting alphanumeric data to ASCII values

In [3]:
## Converting to ASCII: any single character --> its ASCII code
## Any longer string --> sum of the ASCII codes of its characters
## Missing values (NaN) --> 0; every other numeric value is kept unchanged
## NOTE: the sum is not unique per string (e.g. 'ab' and 'ba' give the same value).

for col in column_list:
    if col == 'id':
        continue  # the identifier column is left as it is
    # Summing over the characters handles one-character strings as well, and gives 0
    # for an empty string instead of failing inside ord().
    encoded = [
        sum(ord(ch) for ch in val) if isinstance(val, str)
        else (0 if math.isnan(val) else val)
        for val in df_1[col]
    ]
    df_1[col] = pd.to_numeric(np.asarray(encoded))

# One hot encoding of Alphanumeric Data

In [4]:
# Second copy of the training table: incomplete columns and 'id' removed.
df_2 = pd.read_csv('./train.csv').drop(columns=['id', 's53', 's54', 's55', 's56', 's57', 's59'])

column_list = df_2.columns
df_2['s52'] = df_2['s52'].replace({'l': '1', 'o': '0'})    # same 'l'/'o' rewrite as for df_1

categorical_features = ['gender', 's11', 's12', 's16', 's17', 's18', 's58','s69','s71','s70']

# One indicator frame per categorical column. The dummies are taken from df_1, i.e.
# from the values as they stand after the ASCII conversion, not from df_2.
dummy_frames = [pd.get_dummies(df_1[feature]) for feature in categorical_features]

# Append all indicator columns to the right of the existing columns, in feature order.
df_2 = pd.concat([df_2] + dummy_frames, axis='columns')


In [5]:
# The source categorical columns are no longer needed once their dummies are attached.
df_2 = df_2.drop(columns=categorical_features)

# Column names are assigned positionally: this list needs exactly one entry per
# column of df_2, otherwise the assignment raises.
df_2.columns = [
    's13', 's48', 's52',
    'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
    'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'n15',
    'label',
    'F', 'M',
    'N', 'Y',
    'N_1', 'Y_1',
    'A', 'B', 'C', 'D',
    'A_1', 'B_1', 'C_1', 'D_1',
    'A_2', 'B_2', 'C_2', 'D_2',
    'A_3', 'B_3', '0', 'C_3',
    'x', '~1',
    'a', 'b', 'c', 'd',
    'op: A', 'op: B', 'op: C', 'op: D',
]

column_list = df_2.columns

# Cast every column to a numeric dtype.
df_2 = df_2.apply(pd.to_numeric)

# Renumber the rows from 0 without keeping the old index as a column, then copy the
# identifiers over from df_1 (aligned on the row index).
df_2 = df_2.reset_index(drop=True)
df_2['id'] = df_1['id']

# Merging the two encodings of the alphanumeric data (one-hot and ASCII)

In [6]:
# Join the ASCII-encoded categorical columns from df_1 onto the one-hot frame,
# matching rows on 'id'.
df_3 = df_2.merge(
    df_1[['id', 'gender', 's11', 's12', 's16', 's17', 's18', 's58', 's69', 's70', 's71']],
    on='id',
)

In [7]:
# Display the column names of the merged frame.
df_3.columns

Index(['s13', 's48', 's52', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'n15', 'label', 'F', 'M', 'N',
       'Y', 'N_1', 'Y_1', 'A', 'B', 'C', 'D', 'A_1', 'B_1', 'C_1', 'D_1',
       'A_2', 'B_2', 'C_2', 'D_2', 'A_3', 'B_3', '0', 'C_3', 'x', '~1', 'a',
       'b', 'c', 'd', 'op: A', 'op: B', 'op: C', 'op: D', 'id', 'gender',
       's11', 's12', 's16', 's17', 's18', 's58', 's69', 's70', 's71'],
      dtype='object')

# Dataset saving (optional)

In [8]:
# Write the merged feature table to disk; index=False leaves the row index out of the file.
df_3.to_csv('./processed_data.csv', index=False)

# Train Test splitting 

In [9]:
# Target is the 'label' column; features are everything except 'id' and 'label'.
y = df_3['label']
X = df_3.drop(columns=['id', 'label'])

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Training (Xgboost)

In [10]:
from xgboost import XGBClassifier
import xgboost

# Classifier with fixed hyperparameters: 10000 boosting rounds at a 0.001 learning
# rate, and no early stopping configured in this call.
model = XGBClassifier(
    n_estimators=10000,
    learning_rate=0.001,
    max_depth=7,
    min_child_weight=7,
    gamma=2.443119529076133,
    colsample_bytree=0.8104161212923376,
    reg_alpha=87.0,
    reg_lambda=0.08809142885491292,
    scale_pos_weight=6,
)
# Fit on the training split only; X_test / y_test are kept for validation.
model.fit(X_train, y_train)

KeyboardInterrupt: 

# Optimal Hyperparameter Tuning (Optional)

In [None]:
# Search space sampled by hyperopt; objective() reads exactly these keys.
# NOTE(review): these bounds are starting values, not tuned ranges - adjust them
# to the problem before relying on the result.
space = {
    'n_estimators': 180,
    'max_depth': hp.quniform('max_depth', 3, 18, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
}


def objective(space):
    """Fit one XGBoost candidate and report its loss to hyperopt.

    Parameters
    ----------
    space : dict
        One sampled point with the keys 'n_estimators', 'max_depth', 'gamma',
        'reg_alpha', 'min_child_weight' and 'colsample_bytree'.

    Returns
    -------
    dict
        {'loss': negative balanced accuracy on (X_test, y_test),
         'status': hyperopt.STATUS_OK}. The score is negated because hyperopt
        minimises the loss.

    Notes
    -----
    Reads X_train, y_train, X_test and y_test from the notebook namespace.
    X_test is used for early stopping and for scoring here, and it is also the
    split the validation cell reports on, so the reported scores are tuned
    against that same split.
    """
    clf = XGBClassifier(
        scale_pos_weight=6,
        # hyperopt's q-distributions return floats; the tree parameters need ints
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        min_child_weight=int(space['min_child_weight']),
        # a fraction: int() would truncate every sampled value below 1 to 0,
        # so the searched value would never actually be used
        colsample_bytree=float(space['colsample_bytree']),
    )

    evaluation = [(X_train, y_train), (X_test, y_test)]

    # NOTE(review): recent XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than on fit() - confirm
    # against the installed version.
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10, verbose=False)

    pred = clf.predict(X_test)
    balanced_accuracy = balanced_accuracy_score(y_test, pred > 0.5)
    print ("SCORE:", balanced_accuracy)
    return {'loss': -balanced_accuracy, 'status': hyperopt.STATUS_OK }


# Keeps the record of every evaluated point.
trials = hyperopt.Trials()

best_hyperparams = hyperopt.fmin(fn = objective,
                        space = space,
                        algo = hyperopt.tpe.suggest,
                        max_evals = 300,
                        trials = trials)


print("The best hyperparameters are : ","\n")
print(best_hyperparams)

# Validation scores

In [None]:
# Predict on the held-out split and round each prediction to the nearest integer.
y_pred_v_xgb = list(map(round, model.predict(X_test)))

# Plain accuracy, reported as a percentage.
accuracy = accuracy_score(y_test, y_pred_v_xgb)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# F1 and balanced accuracy on the same predictions.
f1_value = f1_score(y_test, y_pred_v_xgb)
print(f' test f1 {f1_value}')
balanced_value = balanced_accuracy_score(y_test, y_pred_v_xgb)
print(f' test balance acc {balanced_value}')   #balanced accuracy matches with leaderboard

# Testset csv generation

In [None]:
## Alphanumeric to ASCII value (test set)

# Read the raw test table and discard the same incomplete columns as for training.
df_1 = pd.read_csv('./test.csv').drop(columns=['s53', 's54', 's55', 's56', 's57', 's59'])

# Column names are captured here because the conversion loop iterates over them.
column_list = df_1.columns

# In 's52', the letters 'l' and 'o' are rewritten to the digit characters '1' and '0'.
df_1['s52'] = df_1['s52'].replace({'l': '1', 'o': '0'})

## Converting to ASCII: any single character --> its ASCII code
## Any longer string --> sum of the ASCII codes of its characters
## Missing values (NaN) --> 0; every other numeric value is kept unchanged
## NOTE: the sum is not unique per string (e.g. 'ab' and 'ba' give the same value).

for col in column_list:
    if col == 'id':
        continue  # the identifier column is left as it is
    # Summing over the characters handles one-character strings as well, and gives 0
    # for an empty string instead of failing inside ord().
    encoded = [
        sum(ord(ch) for ch in val) if isinstance(val, str)
        else (0 if math.isnan(val) else val)
        for val in df_1[col]
    ]
    df_1[col] = pd.to_numeric(np.asarray(encoded))

In [None]:
## Alphanumeric to one-hot encoded value (test set)

# Second copy of the test table: incomplete columns and 'id' removed.
df_2 = pd.read_csv('./test.csv').drop(columns=['id', 's53', 's54', 's55', 's56', 's57', 's59'])

column_list = df_2.columns
df_2['s52'] = df_2['s52'].replace({'l': '1', 'o': '0'})    # same 'l'/'o' rewrite as for df_1

categorical_features = ['gender', 's11', 's12', 's16', 's17', 's18', 's58','s69','s71','s70']

# One indicator frame per categorical column. The dummies are taken from df_1, i.e.
# from the values as they stand after the ASCII conversion, not from df_2.
dummy_frames = [pd.get_dummies(df_1[feature]) for feature in categorical_features]

# Append all indicator columns to the right of the existing columns, in feature order.
df_2 = pd.concat([df_2] + dummy_frames, axis='columns')
    

# The source categorical columns are no longer needed once their dummies are attached.
df_2 = df_2.drop(columns=categorical_features)

# Column names are assigned positionally: this list needs exactly one entry per
# column of df_2, otherwise the assignment raises.
df_2.columns = [
    's13', 's48', 's52',
    'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
    'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'n15',
    'label',
    'F', 'M',
    'N', 'Y',
    'N_1', 'Y_1',
    'A', 'B', 'C', 'D',
    'A_1', 'B_1', 'C_1', 'D_1',
    'A_2', 'B_2', 'C_2', 'D_2',
    'A_3', 'B_3', '0', 'C_3',
    'x', '~1',
    'a', 'b', 'c', 'd',
    'op: A', 'op: B', 'op: C', 'op: D',
]

column_list = df_2.columns

# Cast every column to a numeric dtype.
df_2 = df_2.apply(pd.to_numeric)

# Renumber the rows from 0 without keeping the old index as a column, then copy the
# identifiers over from df_1 (aligned on the row index).
df_2 = df_2.reset_index(drop=True)
df_2['id'] = df_1['id']


In [None]:
## Merging both encodings of the test data
# Join the ASCII-encoded categorical columns from df_1 onto the one-hot frame,
# matching rows on 'id'.
df_3 = df_2.merge(
    df_1[['id', 'gender', 's11', 's12', 's16', 's17', 's18', 's58', 's69', 's70', 's71']],
    on='id',
)

In [None]:
## Creating testing X
# The training features were built with df_3.drop(['id','label'], axis=1), so the
# test matrix has to leave out the same two columns. The positional rename of df_2
# names one column 'label'; keeping it here would hand the model a feature it was
# never fitted on. errors='ignore' keeps the cell working for a frame without 'label'.
# NOTE(review): presumably test.csv carries a 'label' column in the same position as
# train.csv - verify, because the column names are assigned by position.
X = df_3.drop(['id', 'label'], axis=1, errors='ignore')

# Model Prediction on testset

In [None]:
# Predict with the classifier stored in `model` on the test feature matrix X.
y_pred = model.predict(X)

# Making Submission File

In [None]:
# Pair every test id with its predicted label.
sub = pd.DataFrame({'id': df_3['id'], 'label': y_pred})

# Write the two columns without the row index.
sub.to_csv('submission.csv', index=False)   #saving csv file