In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import random
from numpy.random import permutation
import math
from sklearn import metrics

In [45]:
random.seed(1234)

In [46]:
col_names = ['Age','Gender','Education', 'Country', 'Ethnicity', 'Nscore', 'Escore', 'Oscore', 'Ascore', 'Csore', 'Impulsive','SS','1','Amphet','3','4','5','6','7','Coke','9','10','Heroin','12','13','LSD','Meth','16','17','18','19']

In [47]:
directory = '~/Desktop/NYU/Fall 2019/Programming for Data Science/Project_CDS/data/'
data = pd.read_csv(directory + "drug_consumption.csv", names = col_names)

In [48]:
data['Age'] = round(data['Age'],3)
data['Gender'] = round(data['Gender'],3)
data['Education']= round(data['Education'],3)
data['Country'] = round(data['Country'],3)
data['Ethnicity'] = round(data['Ethnicity'],3)

data['Age'] = data['Age'].map({-0.952:"1", -0.079:"2", 0.498:"3", 1.094:"4", 1.822:"5", 2.592:"6"})

data['Gender'] = data['Gender'].map({0.482:'Female', -0.482:'Male'})

data['Education'] = data['Education'].map({-2.436:'1', -1.738: '2',-1.437:'3', -1.228:'4', -0.611: '5', -0.059:'6',0.455: '7', 1.164: '8', 1.984:'9'}) 

data['Country'] = data['Country'].map({-0.098: '1', 0.249:'2', -0.468:'3', -0.285:'4', 0.211:'5',0.961:'6', -0.570:'7'})

data['Ethnicity'] = data['Ethnicity'].map({-0.502:'1', 
-1.107:'2', 
1.907: '3', 
0.126: '4', 
-0.222: '5', 
0.114: '6',  
-0.317: '7'})

In [49]:
amended_data = data.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,13,19,22,25,26]]

In [50]:
encoded_data = amended_data.copy()
encoded_data['Drug_indicator'] = amended_data[['Amphet', 'Coke', 'Heroin', 'LSD', 'Meth']].max(axis = 1)

In [51]:
def string_to_numeric(x):
    if x == 'Female' :
        return 1
    if x == 'Male' :
        return 0

In [52]:
encoded_data['Gender'] = encoded_data['Gender'].apply(string_to_numeric)

In [53]:
# Randomly shuffle the index.
random_indices = permutation(encoded_data.index)

# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(encoded_data)/3)

# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
test = encoded_data.loc[random_indices[1:test_cutoff]]

# Generate the train set with the rest of the data.
train = encoded_data.loc[random_indices[test_cutoff:]]

# Multi-class Classification - Random Forest

In [54]:
# The columns that we will be making predictions with.
x_columns = ['Age','Gender','Education','Country','Ethnicity','Nscore', 'Escore', 'Oscore', 'Ascore', 'Csore', 'Impulsive','SS']
# The column that we want to predict.
y_column = ['Drug_indicator']

In [55]:
train.head(5)

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Csore,Impulsive,SS,Amphet,Coke,Heroin,LSD,Meth,Drug_indicator
674,1,0,6,6,7,0.22393,0.00332,-1.68062,-0.15487,0.25953,0.88113,0.7654,CL0,CL0,CL0,CL0,CL0,CL0
1214,1,0,6,2,7,-0.14882,1.45421,1.43533,0.13136,-1.38502,0.52975,0.7654,CL4,CL0,CL0,CL3,CL0,CL4
1567,2,1,8,6,7,-1.05308,1.2861,-0.45174,-0.15487,-0.00665,0.88113,-0.84637,CL0,CL0,CL0,CL0,CL0,CL0
1796,5,0,8,6,7,-0.58016,1.74091,-1.97495,-1.07533,1.63088,0.88113,0.40148,CL0,CL0,CL0,CL0,CL0,CL0
1863,1,1,5,7,7,0.62967,0.63779,2.90161,-1.07533,0.93949,0.88113,1.2247,CL3,CL3,CL0,CL4,CL3,CL4


In [56]:
trainy = np.array(train[y_column]).reshape(1257)

Criterion is the function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "Entropy" for the information gain.

In [57]:
rfc=RandomForestClassifier(criterion='entropy')
rfc.fit(train[x_columns],trainy)
rfc_preds=rfc.predict(test[x_columns])



In [58]:
# Two ways of getting accuracy
print(rfc.score(test[x_columns],test[y_column]))
print("Accuracy:",metrics.accuracy_score(rfc_preds, test[y_column]))

0.4019138755980861
Accuracy: 0.4019138755980861


In [59]:
rfc2=RandomForestClassifier(criterion='gini')
rfc2.fit(train[x_columns],trainy)
rfc_preds2=rfc2.predict(test[x_columns])



In [60]:
# Two ways of getting accuracy
print(rfc2.score(test[x_columns],test[y_column]))
print("Accuracy:",metrics.accuracy_score(rfc_preds2, test[y_column]))

0.3843700159489633
Accuracy: 0.3843700159489633


# Binary Classification - logistic regression

## convert y to be binary

In [61]:
binary_data = encoded_data

In [62]:
binary_data.head(5)

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Csore,Impulsive,SS,Amphet,Coke,Heroin,LSD,Meth,Drug_indicator
0,3,1,6,6,4,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084,CL2,CL0,CL0,CL0,CL0,CL2
1,2,0,9,6,7,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575,CL2,CL3,CL0,CL2,CL3,CL3
2,3,0,6,6,7,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,-1.37983,0.40148,CL0,CL0,CL0,CL0,CL0,CL0
3,1,1,8,6,7,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084,CL0,CL2,CL0,CL0,CL0,CL2
4,3,1,9,6,7,0.73545,-1.6334,-0.45174,-0.30172,1.30612,-0.21712,-0.21575,CL1,CL0,CL0,CL0,CL0,CL1


In [63]:
binary_data = binary_data.replace(['CL0','CL1','CL2','CL3','CL4','CL5','CL6'],[0,1,1,2,2,3,3])
binary_data['Drug_indicator'] = binary_data['Drug_indicator'].map({0:0, 1:1, 2:1, 3:1})

In [64]:
# Randomly shuffle the index.
random_indices = permutation(binary_data.index)

# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(binary_data)/3)

# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
test = binary_data.loc[random_indices[1:test_cutoff]]

# Generate the train set with the rest of the data.
train = binary_data.loc[random_indices[test_cutoff:]]

In [65]:
lr = LogisticRegression()
lr.fit(train[x_columns], train[y_column])
lr_preds = lr.predict(test[x_columns])

  y = column_or_1d(y, warn=True)


In [66]:
print(lr.score(test[x_columns], test[y_column]))

0.7703349282296651
