In [2]:
import pandas  as pd 
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt 
from datetime import datetime as dt 
import warnings
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, f1_score, precision_score, roc_auc_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import SMOTE

%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns",100)

In [3]:
drugs = pd.read_csv('drugs_train.csv')
drugs.info()
drugs.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              1500 non-null   object 
 1   age                             1500 non-null   object 
 2   gender                          1500 non-null   object 
 3   education                       1500 non-null   object 
 4   country                         1500 non-null   object 
 5   ethnicity                       1500 non-null   object 
 6   personality_neuroticism         1500 non-null   float64
 7   personality_extraversion        1500 non-null   float64
 8   personality_openness            1500 non-null   float64
 9   personality_agreeableness       1500 non-null   float64
 10  personality_conscientiousness   1500 non-null   float64
 11  personality_impulsiveness       1500 non-null   float64
 12  personality_sensation           15

Index(['id', 'age', 'gender', 'education', 'country', 'ethnicity',
       'personality_neuroticism', 'personality_extraversion',
       'personality_openness', 'personality_agreeableness',
       'personality_conscientiousness', 'personality_impulsiveness',
       'personality_sensation', 'consumption_alcohol',
       'consumption_amphetamines', 'consumption_caffeine',
       'consumption_cannabis', 'consumption_chocolate',
       'consumption_mushrooms', 'consumption_nicotine',
       'consumption_cocaine_last_month'],
      dtype='object')

In [34]:
drugs.education.value_counts()

Some college or university, no certificate or degree    405
University degree                                       376
Masters degree                                          229
Professional certificate/ diploma                       221
Left school at 18 years                                  85
Left school at 16 years                                  72
Doctorate degree                                         66
Left school at 17 years                                  26
Left school before 16 years                              20
Name: education, dtype: int64

In [4]:
#print(drugs.describe())
def getting_dummies(drugs):
    drugs['tmp'] = range(0,drugs.shape[0])
    t1 = pd.get_dummies(drugs['age'])
    t1['tmp'] = range(0,drugs.shape[0])
    drugs = pd.merge(drugs, t1, on = ['tmp'])

    t1 = pd.get_dummies(drugs['country'])
    t1['tmp'] = range(0,drugs.shape[0])
    drugs = pd.merge(drugs, t1, on = ['tmp'])
    drugs['Other_country'] = drugs['Other'] + drugs['Ireland'] + drugs['Canada']

    t1 = pd.get_dummies(drugs['ethnicity'])
    t1['tmp'] = range(0,drugs.shape[0])
    drugs = pd.merge(drugs, t1, on = ['tmp'])

    drugs = drugs.drop(columns = ['tmp', 'age', 'country', 'ethnicity','Ireland', 'Canada', 'Other_x' ])

    return (drugs)

def getting_ordinals(drugs):
    
    lst = ['consumption_alcohol', 'consumption_amphetamines', 'consumption_caffeine',
            'consumption_cannabis', 'consumption_chocolate',
            'consumption_mushrooms', 'consumption_nicotine']
    consDict = {'never used':0,  'used over a decade ago':1,
       'used in last decade':2,'used in last year':3, 'used in last month':4, 'used in last week' : 5, 'used in last day' : 6}

    eduDict = {'Left school before 16 years' : 0, 'Left school at 16 years' : 0, 'Left school at 17 years':0, 'Left school at 18 years' : 0, 
               'Some college or university, no certificate or degree':1, 'Professional certificate/ diploma':2 ,'University degree':3, 'Masters degree':4, 
               'Doctorate degree' : 5}


    for cols in lst:
        drugs[cols]=drugs[cols].map(consDict)
    
    drugs['education'] = drugs['education'].map(eduDict)
    return(drugs)

def gender_dummy(drugs):
    drugs['female'] = np.where(drugs['gender'] == 'female', 1, 0) 
    drugs = drugs.drop(columns=['gender', 'id'])
    return (drugs)

def expl_var_dummy(drugs):
    drugs['consumption_cocaine_last_month'] = np.where(drugs['consumption_cocaine_last_month'] == 'Yes', 1, 0) 
    return (drugs)

def scalling(drugs):
    scaled_features = drugs.copy()

    col_names = ['personality_neuroticism', 'personality_extraversion',
                'personality_openness', 'personality_agreeableness',
                'personality_conscientiousness', 'personality_impulsiveness',
                'personality_sensation']
    features = scaled_features[col_names]

    scaler = RobustScaler().fit(features.values)
    features = scaler.transform(features.values)

    scaled_features[col_names] = features

    return(scaled_features)
    
def our_metrics(y_test, preds):
     print(f'Balanced Accuracy:', balanced_accuracy_score(y_test, preds), 
          '\nconfusion:', confusion_matrix(y_test, preds),
          '\nprecision:', precision_score(y_test, preds) ,
          '\naccuracy:', accuracy_score(y_test, preds), 
          '\nrecall:', recall_score(y_test, preds),
          '\nauroc:', roc_auc_score(y_test, preds) )
    

In [13]:
drugs = pd.read_csv('drugs_train.csv')

drugs = getting_dummies(drugs)
drugs = getting_ordinals(drugs)
drugs = gender_dummy(drugs)
drugs = expl_var_dummy(drugs)
drugs = scalling(drugs)
x_train, x_test, y_train, y_test = train_test_split(drugs.loc[:,drugs.columns!='consumption_cocaine_last_month'], drugs.consumption_cocaine_last_month, test_size=0.25)

print(drugs.columns )
drugs.head()


Index(['education', 'personality_neuroticism', 'personality_extraversion',
       'personality_openness', 'personality_agreeableness',
       'personality_conscientiousness', 'personality_impulsiveness',
       'personality_sensation', 'consumption_alcohol',
       'consumption_amphetamines', 'consumption_caffeine',
       'consumption_cannabis', 'consumption_chocolate',
       'consumption_mushrooms', 'consumption_nicotine',
       'consumption_cocaine_last_month', '18-24', '25-34', '35-44', '45-54',
       '55-64', '65+', 'Australia', 'New Zealand', 'UK', 'USA',
       'Other_country', 'Asian', 'Black', 'Mixed-Black/Asian',
       'Mixed-White/Asian', 'Mixed-White/Black', 'Other_y', 'White', 'female'],
      dtype='object')


Unnamed: 0,education,personality_neuroticism,personality_extraversion,personality_openness,personality_agreeableness,personality_conscientiousness,personality_impulsiveness,personality_sensation,consumption_alcohol,consumption_amphetamines,consumption_caffeine,consumption_cannabis,consumption_chocolate,consumption_mushrooms,consumption_nicotine,consumption_cocaine_last_month,18-24,25-34,35-44,45-54,55-64,65+,Australia,New Zealand,UK,USA,Other_country,Asian,Black,Mixed-Black/Asian,Mixed-White/Asian,Mixed-White/Black,Other_y,White,female
0,4,0.287179,0.35468,-0.111588,-0.115607,0.213483,0.0,-0.978328,5,1,6,5,6,0,5,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
1,3,-0.215385,0.832512,-0.300429,-0.115607,0.342697,-0.396476,-0.718266,5,0,5,0,6,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
2,3,0.287179,-0.334975,0.111588,-0.242775,0.0,0.889868,0.247678,4,0,6,5,5,3,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
3,4,1.015385,-0.931034,-0.390558,0.375723,-1.016854,0.889868,0.529412,6,0,6,2,6,0,2,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
4,1,0.210256,0.600985,0.751073,0.942197,-0.421348,0.334802,0.247678,5,0,4,4,6,3,4,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


In [59]:
y_test.value_counts()

0    137
1     13
Name: consumption_cocaine_last_month, dtype: int64

In [6]:
counter = Counter(y_train)
counter

Counter({0: 1028, 1: 97})

In [19]:
x_train, x_test, y_train, y_test = train_test_split(drugs.loc[:,drugs.columns!='consumption_cocaine_last_month'], drugs.consumption_cocaine_last_month, test_size=0.15, stratify= drugs.consumption_cocaine_last_month)

oversample = SMOTE()
X, y = oversample.fit_resample(x_train, y_train)
counter_a = Counter(y)
print(counter_a)

x_trainSM, x_testSM, y_trainSM, y_testSM = train_test_split(X,y, test_size=0.25)


Counter({0: 1167, 1: 1167})


In [24]:

reg = LogisticRegression()
reg.fit(x_trainSM, y_trainSM)

reg_pred = reg.predict(x_test)

our_metrics(y_test, reg_pred)


Balanced Accuracy: 0.5826520183955033 
confusion: [[175  31]
 [ 13   6]] 
precision: 0.16216216216216217 
accuracy: 0.8044444444444444 
recall: 0.3157894736842105 
auroc: 0.5826520183955034


In [25]:
import lightgbm as lgb
lg = lgb.LGBMClassifier()
lg.fit(x_trainSM, y_trainSM)
lgb_preds = lg.predict(x_test)

our_metrics(y_test, lgb_preds)



Balanced Accuracy: 0.4971895758814512 
confusion: [[194  12]
 [ 18   1]] 
precision: 0.07692307692307693 
accuracy: 0.8666666666666667 
recall: 0.05263157894736842 
auroc: 0.4971895758814512


In [41]:
import numpy as np
import optuna

import lightgbm as lgb
import sklearn.metrics
from sklearn.model_selection import train_test_split


# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def objective(trial):
    
    #train_x, valid_x, train_y, valid_y = train_test_split(drugs.loc[:,drugs.columns!='consumption_cocaine_last_month'], drugs.consumption_cocaine_last_month, test_size=0.1, random_state = 112)
    #dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "n_estimators": trial.suggest_int("n_estimators", 1000, 8000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1.5, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 1.5, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 4000, 12000),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 140),
    }

    lst = []
    for i in range(1):
        #train_x, valid_x, train_y, valid_y = train_test_split(drugs.loc[:,drugs.columns!='consumption_cocaine_last_month'], drugs.consumption_cocaine_last_month, test_size=0.1, random_state = i+1)
        dtrain = lgb.Dataset(x_trainSM, label=y_trainSM)

        gbm = lgb.train(param, dtrain)
        preds = gbm.predict(x_test)
        #pred_labels = np.rint(preds)
        pred_labels1 = np.where(preds>0.4,1,0)
        # pred_labels2 = np.where(preds>0.4,1,0)
        # pred_labels3 = np.where(preds>0.2,1,0)
        # pred_labels4 = np.where(preds>0.1,1,0)
        x1 = balanced_accuracy_score(y_test, pred_labels1)
        # x2 = balanced_accuracy_score(valid_y, pred_labels2)
        # x3 = balanced_accuracy_score(valid_y, pred_labels3)
        # x4 = balanced_accuracy_score(valid_y, pred_labels4)
        pred_labels = x1 #max(x1, x2, x3, x4)
        lst.append(pred_labels)
        #lst.append(balanced_accuracy_score(valid_y, pred_labels))
    accuracy = np.mean(lst) 
        #sklearn.metrics.balanced_accuracy_score(valid_y, pred_labels)
        
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=500)
    
    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2022-05-29 20:51:03,267][0m A new study created in memory with name: no-name-d6c40012-1be3-4e8b-b560-129dc16e187e[0m
[32m[I 2022-05-29 20:51:16,629][0m Trial 0 finished with value: 0.5089422585590189 and parameters: {'n_estimators': 6816, 'learning_rate': 0.2500084514897297, 'lambda_l1': 9.68737055437473e-06, 'lambda_l2': 0.04458578008948583, 'num_leaves': 8553, 'feature_fraction': 0.8950828878247687, 'bagging_fraction': 0.5054817607640084, 'bagging_freq': 3, 'min_child_samples': 58}. Best is trial 0 with value: 0.5089422585590189.[0m
[32m[I 2022-05-29 20:51:22,161][0m Trial 1 finished with value: 0.5186509964230965 and parameters: {'n_estimators': 1373, 'learning_rate': 0.23131067032159228, 'lambda_l1': 4.612947952176264e-08, 'lambda_l2': 0.30846252767286814, 'num_leaves': 10975, 'feature_fraction': 0.9704787708804321, 'bagging_fraction': 0.9804402787008636, 'bagging_freq': 5, 'min_child_samples': 29}. Best is trial 1 with value: 0.5186509964230965.[0m
[32m[I 2022-05-

Number of finished trials: 500
Best trial:
  Value: 0.6117782319877363
  Params: 
    n_estimators: 5253
    learning_rate: 0.16503924070285111
    lambda_l1: 1.4418677767825967
    lambda_l2: 1.6755037131245352e-08
    num_leaves: 8539
    feature_fraction: 0.6873692705073272
    bagging_fraction: 0.7050929990357381
    bagging_freq: 5
    min_child_samples: 57


In [175]:
study.best_trial.params


{'n_estimators': 6584,
 'learning_rate': 0.2585180342596222,
 'lambda_l1': 0.24869615891449395,
 'lambda_l2': 1.5185272830960025e-07,
 'num_leaves': 5328,
 'feature_fraction': 0.5516054799322556,
 'bagging_fraction': 0.9202901878168293,
 'bagging_freq': 7,
 'min_child_samples': 127}

In [39]:
param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "n_estimators":study.best_trial.params['n_estimators'],
        "learning_rate": study.best_trial.params['learning_rate'],
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": study.best_trial.params['lambda_l1'],
        "lambda_l2": study.best_trial.params['lambda_l2'] ,
        "num_leaves": study.best_trial.params['num_leaves'],
        "feature_fraction": study.best_trial.params['feature_fraction'],
        "bagging_fraction": study.best_trial.params['bagging_fraction'],
        "bagging_freq": study.best_trial.params['bagging_freq'],
        "min_child_samples": study.best_trial.params['min_child_samples'],
    }

lst = []
for i in range(100):
    train_x, valid_x, train_y, valid_y = train_test_split(drugs.loc[:,drugs.columns!='consumption_cocaine_last_month'], drugs.consumption_cocaine_last_month, test_size=0.15, random_state = i+1*11, stratify = drugs.consumption_cocaine_last_month )
    dtrain = lgb.Dataset(train_x, label=train_y)

    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(valid_x)
    pred_labels = np.where(preds>0.33,1,0)
    lst.append(balanced_accuracy_score(valid_y, pred_labels))

    our_metrics(valid_y, pred_labels)
print(np.mean(lst))

Balanced Accuracy: 0.5643842616249362 
confusion: [[200   6]
 [ 16   3]] 
precision: 0.3333333333333333 
accuracy: 0.9022222222222223 
recall: 0.15789473684210525 
auroc: 0.5643842616249362
Balanced Accuracy: 0.5498211548288197 
confusion: [[194  12]
 [ 16   3]] 
precision: 0.2 
accuracy: 0.8755555555555555 
recall: 0.15789473684210525 
auroc: 0.5498211548288197
Balanced Accuracy: 0.5882728666326009 
confusion: [[199   7]
 [ 15   4]] 
precision: 0.36363636363636365 
accuracy: 0.9022222222222223 
recall: 0.21052631578947367 
auroc: 0.5882728666326009
Balanced Accuracy: 0.6097342871742463 
confusion: [[197   9]
 [ 14   5]] 
precision: 0.35714285714285715 
accuracy: 0.8977777777777778 
recall: 0.2631578947368421 
auroc: 0.6097342871742463
Balanced Accuracy: 0.6097342871742463 
confusion: [[197   9]
 [ 14   5]] 
precision: 0.35714285714285715 
accuracy: 0.8977777777777778 
recall: 0.2631578947368421 
auroc: 0.6097342871742463
Balanced Accuracy: 0.6048799182422074 
confusion: [[195  11]
 [ 

In [40]:
from sklearn.neighbors import KNeighborsClassifier
#train_x, valid_x, train_y, valid_y = train_test_split(drugs.loc[:,drugs.columns!='consumption_cocaine_last_month'], drugs.consumption_cocaine_last_month, test_size=0.1, random_state = 997)
knn = KNeighborsClassifier()
knn.fit(x_trainSM, y_trainSM)
knn_pred = knn.predict(x_testSM)

our_metrics(y_testSM, knn_pred)

Balanced Accuracy: 0.8294701986754967 
confusion: [[199 103]
 [  0 282]] 
precision: 0.7324675324675325 
accuracy: 0.8236301369863014 
recall: 1.0 
auroc: 0.8294701986754967
