In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from scipy import stats
from scipy.stats import skew


In [2]:
data=pd.read_csv('drug_consumption.data',
                 sep=',')
data.columns=['ID','Age','Gender','Education','Country','Ethnicity','Nscore','Escore',
             'Oscore','Ascore','Cscore','Impulsive','SS','Alcohol','Amphet','Amyl',
             'Benzos','Caff','Cannabis','Choc','Coke','Crack','Ecstasy','Heroin',
             'Ketamine','Legalh','LSD','Meth','Mushrooms','Nicotine','Semer','VSA']
# drugs=drugs.replace('CL0',0)
# drugs=drugs.replace('CL1',0)
# drugs=drugs.replace('CL2',0)
# drugs=drugs.replace('CL3',1)
# drugs=drugs.replace('CL4',1)
# drugs=drugs.replace('CL5',1)
# drugs=drugs.replace('CL6',1)

data=data.replace('CL0',0)
data=data.replace('CL1',0)
data=data.replace('CL2',0)
data=data.replace('CL3',1)
data=data.replace('CL4',1)
data=data.replace('CL5',1)
data=data.replace('CL6',1)


In [3]:
#data split
drugs=data.iloc[:,13:33]
personality=data.iloc[:,2:13]
d1=data.iloc[:,13:17]
d2=data.iloc[:,19]
d3=data.iloc[:,18]
d4=data.iloc[:,20:33]
legal=pd.concat([d1, d2], axis=1)
illegal=pd.concat([d3,d4],axis=1)

In [10]:
for name in illegal.columns:
    print(illegal[name].value_counts())

1    999
0    885
Name: Cannabis, dtype: int64
0    1467
1     417
Name: Coke, dtype: int64
0    1805
1      79
Name: Crack, dtype: int64
0    1367
1     517
Name: Ecstasy, dtype: int64
0    1766
1     118
Name: Heroin, dtype: int64
0    1676
1     208
Name: Ketamine, dtype: int64
0    1320
1     564
Name: Legalh, dtype: int64
0    1504
1     380
Name: LSD, dtype: int64
0    1564
1     320
Name: Meth, dtype: int64
0    1450
1     434
Name: Mushrooms, dtype: int64
1    1060
0     824
Name: Nicotine, dtype: int64
0    1881
1       3
Name: Semer, dtype: int64
0    1789
1      95
Name: VSA, dtype: int64


In [13]:
def get_target(drug):
    a=illegal[drug].value_counts()
    if illegal[illegal[drug]==0].shape[0]>illegal[illegal[drug]==1].shape[0]:
        target=0
        return target
    else:
        target=1
        return target
drug_name='LSD'
target=get_target(drug_name)
    
major=(illegal.loc[illegal[drug_name]==target])
minor=(illegal.loc[illegal[drug_name]==1-target])
majorframe=major[drug_name]
minorframe=minor[drug_name]

In [14]:
from sklearn.utils import resample
# Separate majority and minority classes
df_majority = majorframe
df_minority = minorframe
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=illegal[illegal[drug_name]==target].shape[0],    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.value_counts()


1    1504
0    1504
Name: LSD, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn import neighbors, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
#slicing the data
X=pd.concat([personality,legal],axis=1)
def kval(start,end):
    ks=list()
    for i in range(start,end):
        if i%2!=0:
            ks.append(i)
    return ks
ks=kval(40,50)

for drugs in illegal:
    y=illegal[drugs]
#stratify: cuz of the unbalanced data set: there are more class 1 than class 0. so we include more class 1 in both the test data and training set as well.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)
    
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    gs_knn = GridSearchCV(estimator=neighbors.KNeighborsClassifier(p=2, 
                               metric='minkowski'),
                      param_grid=[{'n_neighbors': ks,'weights':['uniform','distance']}],
                      scoring='accuracy',
                      cv=10)

    gs_knn = gs_knn.fit(X_train_std, y_train)          

    knn_params=gs_knn.best_params_

    
    # Choosing depth of the tree AND splitting criterion AND min_samples_leaf AND min_samples_split
    gs_dt2 = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                      param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None], 'criterion':['gini','entropy'], 
                                  'min_samples_leaf':[1,2,3,4,5],
                                  'min_samples_split':[2,3,4,5]}],
                      scoring='accuracy',
                      cv=10,
                      n_jobs=4)

    gs_dt2 = gs_dt2.fit(X_train,y_train)

    dt_params=gs_dt2.best_params_

    gs_lr2 = GridSearchCV(estimator=LogisticRegression(random_state=0),
                      param_grid=[{'C': [ 0.00001, 0.0001, 0.001, 0.01, 0.1 ,1 ,10 ,100, 1000, 10000, 100000, 1000000, 10000000],
                                 'penalty':['l1','l2']}],
                      scoring='accuracy',
                      cv=10)

    gs_lr2 = gs_lr2.fit(X_train,y_train)
#     print(gs_lr2.best_score_)
    lr_params=gs_lr2.best_params_)

    np.random.seed(42)

    
####################################################################################################################
##############################       AUC SCORE CALCULATION       ###################################################



    # Logistic Regression Classifier
    clf1 = LogisticRegression(penalty='l2', 
                              C=lr_params['C'],
                              random_state=1)

    # Decision Tree Classifier
    clf2 = DecisionTreeClassifier(max_depth=dt_params['max_depth'],
                                  criterion=dt_params['criterion'],
                                  random_state=0)


    # kNN Classifier
    clf3 = KNeighborsClassifier(n_neighbors=knn_params['n_neighbors'],
                                weights=knn_params['weights']
                                p=2,
                                metric='minkowski')

    auc_scores=np.zeros(3)
    i=0
    print('10-fold cross validation:\n')
    for clf, label in zip([clf1, clf2,clf3], clf_labels): #For all classifiers 
        scores = cross_val_score(estimator=clf,  #Estimate AUC based on cross validation
                                 X=X_train,
                                 y=y_train,
                                 cv=10,
                                 scoring='roc_auc')
        print("/n For Drug /n",drugs)
        print("ROC AUC: %0.2f (+/- %0.2f) [%s]" #Print peformance statistics based on cross-validation
              % (scores.mean(), scores.std(), label))
        auc_scores[i]=scores
        i+=1
    print("AUC SCORE RESULTS\n")
    print("The best model is: {0}  with score of {1} ".format(clf_labels[np.argmax(auc_scores)],auc_scores.max())
        

    
    knnscores=cross_val_score(gs_knn, X_train, y_train, 
                             scoring='accuracy', cv=10)
    print(drugs)
    #print('CV accuracy: %.3f +/- %.3f' % (np.mean(knnscores),
#                                           np.std(knnscores)))


    dtscores=cross_val_score(gs_dt2, X_train, y_train, 
                             scoring='accuracy', cv=10)
    #print('CV accuracy: %.3f +/- %.3f' % (np.mean(dtscores),
#                                           np.std(dtscores)))


    lrscores=cross_val_score(gs_lr2, X_train, y_train, 
                             scoring='accuracy', cv=10)
    #print('CV accuracy: %.3f +/- %.3f' % (np.mean(lrscores),
#                                           np.std(lrscores)))
    scores=[np.mean(knnscores)+np.std(knnscores),np.mean(dtscores)+np.std(dtscores),np.mean(lrscores)+np.std(lrscores)]
    if scores.index(min(scores))==0:
        print("KNN is the best model\nCV accuracy: %.3f +/- %.3f"% (np.mean(knnscores),
                                          np.std(knnscores)))
    elif scores.index(min(scores))==1:
        print("Decision Tree is the best model\nCV accuracy: %.3f +/- %.3f"% (np.mean(dtscores),
                                          np.std(dtscores)))
    elif scores.index(min(scores))==2:
        print("Logistic Regression is the best model\nCV accuracy: %.3f +/- %.3f"% (np.mean(lrscores),
                                          np.std(lrscores)))
    


Cannabis
Decision Tree is the best model
CV accuracy: 0.798 +/- 0.041
Coke
KNN is the best model
CV accuracy: 0.788 +/- 0.018
Crack
KNN is the best model
CV accuracy: 0.958 +/- 0.004
Ecstasy
KNN is the best model
CV accuracy: 0.775 +/- 0.030
Heroin
Decision Tree is the best model
CV accuracy: 0.936 +/- 0.003
Ketamine
KNN is the best model
CV accuracy: 0.889 +/- 0.003
Legalh
KNN is the best model
CV accuracy: 0.764 +/- 0.029
LSD
Decision Tree is the best model
CV accuracy: 0.808 +/- 0.018
Meth
KNN is the best model
CV accuracy: 0.839 +/- 0.009
Mushrooms
Decision Tree is the best model
CV accuracy: 0.765 +/- 0.024
Nicotine
Decision Tree is the best model
CV accuracy: 0.692 +/- 0.028




Semer




ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import cross_val_score

# Logistic Regression Classifier
clf1 = LogisticRegression(penalty='l2', 
                          C=0.01,
                          random_state=1)

# Decision Tree Classifier
clf2 = DecisionTreeClassifier(max_depth=1,
                              criterion='entropy',
                              random_state=0)

# kNN Classifier
clf3 = KNeighborsClassifier(n_neighbors=51,
                            p=2,
                            metric='minkowski')

# Label the classifiers
clf_labels = ['Logistic regression', 'Decision tree', 'KNN']
all_clf = [clf1, clf2, clf3]

print('10-fold cross validation:\n')
for clf, label in zip([clf1, clf2, clf3], clf_labels): #For all classifiers 
    scores = cross_val_score(estimator=clf,  #Estimate AUC based on cross validation
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" #Print peformance statistics based on cross-validation
          % (scores.mean(), scores.std(), label))

colors = [ 'orange', 'blue', 'green']      #Colors for visualization
linestyles = [':', '--', '-.', '-']        #Line styles for visualization
for clf, label, clr, ls in zip(all_clf,
               clf_labels, colors, linestyles):

    # assuming the label of the positive class is 1 and data is normalized
    y_pred = clf.fit(X_train,
                     y_train).predict_proba(X_test)[:, 1] # Make predictions based on the classifiers
    fpr, tpr, thresholds = roc_curve(y_true=y_test, # Build ROC curve
                                     y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)                # Compute Area Under the Curve (AUC) 
    plt.plot(fpr, tpr,                         # Plot ROC Curve and create label with AUC values
             color=clr,
             linestyle=ls,
             label='%s (auc = %0.2f)' % (label, roc_auc))

plt.legend(loc='lower right')    # Where to place the legend
plt.plot([0, 1], [0, 1], # Visualize random classifier
         linestyle='--',
         color='gray',
         linewidth=2)

plt.xlim([-0.1, 1.1])   #limits for x axis
plt.ylim([-0.1, 1.1])   #limits for y axis
plt.grid(alpha=0.5)
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')


#plt.savefig('ROC_all_classifiers', dpi=300)
plt.show()