In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

In [9]:
# Column names to attach to the data file, listed in file order.
column_headers = [
    # record id followed by the respondent attribute columns
    'id', 'age', 'gender', 'education', 'country', 'ethnicity',
    # trait columns
    'neuroticism', 'extraversion', 'openness', 'agreeableness',
    'conscientiousness', 'impulsiveness', 'sensation_seeking',
    # one column per substance
    'alcohol', 'amphet', 'amyl', 'benzos', 'caffeine', 'cannabis',
    'chocolate', 'cocaine', 'crack', 'mdma', 'heroin', 'ketamine',
    'legalh', 'lsd', 'meth', 'mushrooms', 'nicotine',
    'semeron',  # fictitious drug
    'vsa',      # volatile substance abuse
]

# Passing names= without header= makes pandas treat the first row of the file
# as data rather than as a header line.
raw_data = pd.read_csv('drug_consumption.data', names=column_headers)
print(raw_data.head())

   id      age   gender  education  country  ethnicity  neuroticism  \
0   1  0.49788  0.48246   -0.05921  0.96082    0.12600      0.31287   
1   2 -0.07854 -0.48246    1.98437  0.96082   -0.31685     -0.67825   
2   3  0.49788 -0.48246   -0.05921  0.96082   -0.31685     -0.46725   
3   4 -0.95197  0.48246    1.16365  0.96082   -0.31685     -0.14882   
4   5  0.49788  0.48246    1.98437  0.96082   -0.31685      0.73545   

   extraversion  openness  agreeableness  ...  mdma  heroin  ketamine legalh  \
0      -0.57545  -0.58331       -0.91699  ...   CL0     CL0       CL0    CL0   
1       1.93886   1.43533        0.76096  ...   CL4     CL0       CL2    CL0   
2       0.80523  -0.84732       -1.62090  ...   CL0     CL0       CL0    CL0   
3      -0.80615  -0.01928        0.59042  ...   CL0     CL0       CL2    CL0   
4      -1.63340  -0.45174       -0.30172  ...   CL1     CL0       CL0    CL1   

   lsd meth mushrooms nicotine semeron  vsa  
0  CL0  CL0       CL0      CL2     CL0  CL0  


In [24]:
# Lookup tables that translate the numeric codes stored in the raw columns
# into human-readable labels. Keys are the coded values exactly as they appear
# in the data file; values are the label for that code.

# Coded age value -> age band.
age_mapping = {
    -0.95197: '18-24',
    -0.07854: '25-34',
    0.49788: '35-44',
    1.09449: '45-54',
    1.82213: '55-64',
    2.59171: '65+',
}

# Coded gender value -> gender label.
gender_mapping = {
    0.48246: 'female',
    -0.48246: 'male',
}

# Coded education value -> highest education level.
education_mapping = {
    -2.43591: 'Left school before 16 years',
    -1.73790: 'Left school at 16 years',
    # The label previously ended in a stray " 30", which does not belong to
    # the category name and would have shown up in every labelled row.
    -1.43719: 'Left school at 17 years',
    -1.22751: 'Left school at 18 years',
    -0.61113: 'Some college or university, no degree',
    -0.05921: 'Professional certificate/ diploma',
    0.45468: 'University degree',
    1.16365: 'Masters degree',
    1.98437: 'Doctorate degree',
}

# Coded country value -> country of residence.
country_mapping = {
    -0.09765: 'Australia',
    0.24923: 'Canada',
    -0.46841: 'New Zealand',
    -0.28519: 'Other',
    0.21128: 'Republic of Ireland',
    0.96082: 'UK',
    -0.57009: 'USA',
}

# Coded ethnicity value -> ethnicity label.
ethnicity_mapping = {
    -0.50212: 'Asian',
    -1.10702: 'Black',
    1.90725: 'Mixed-Black/Asian',
    0.12600: 'Mixed-White/Asian',
    -0.22166: 'Mixed-White/Black',
    0.11440: 'Other',
    -0.31685: 'White',
}
# Lookup tables from the coded value found in each trait column to an integer
# score. Every table is laid out as three ascending columns; insertion order
# runs across each row, left to right. Not every integer in a table's range
# has an entry (extraversion, for example, has no 17 or 57).
# NOTE(review): the integers are presumably the original questionnaire
# scores - confirm against the dataset documentation.

neuroticism_mapping = {
    -3.46436: 12,  -0.67825: 29,  1.02119: 46,
    -3.15735: 13,  -0.58016: 30,  1.13281: 47,
    -2.75696: 14,  -0.46725: 31,  1.23461: 48,
    -2.52197: 15,  -0.34799: 32,  1.37297: 49,
    -2.42317: 16,  -0.24649: 33,  1.49158: 50,
    -2.34360: 17,  -0.14882: 34,  1.60383: 51,
    -2.21844: 18,  -0.05188: 35,  1.72012: 52,
    -2.05048: 19,   0.04257: 36,  1.83990: 53,
    -1.86962: 20,   0.13606: 37,  1.98437: 54,
    -1.69163: 21,   0.22393: 38,  2.12700: 55,
    -1.55078: 22,   0.31287: 39,  2.28554: 56,
    -1.43907: 23,   0.41667: 40,  2.46262: 57,
    -1.32828: 24,   0.52135: 41,  2.61139: 58,
    -1.19430: 25,   0.62967: 42,  2.82196: 59,
    -1.05308: 26,   0.73545: 43,  3.27393: 60,
    -0.92104: 27,   0.82562: 44,
    -0.79151: 28,   0.91093: 45,
}

extraversion_mapping = {
    -3.27393: 16,  -1.23177: 31,  0.80523: 45,
    -3.00537: 18,  -1.09207: 32,  0.96248: 46,
    -2.72827: 19,  -0.94779: 33,  1.11406: 47,
    -2.53830: 20,  -0.80615: 34,  1.28610: 48,
    -2.44904: 21,  -0.69509: 35,  1.45421: 49,
    -2.32338: 22,  -0.57545: 36,  1.58487: 50,
    -2.21069: 23,  -0.43999: 37,  1.74091: 51,
    -2.11437: 24,  -0.30033: 38,  1.93886: 52,
    -2.03972: 25,  -0.15487: 39,  2.12700: 53,
    -1.92173: 26,   0.00332: 40,  2.32338: 54,
    -1.76250: 27,   0.16767: 41,  2.57309: 55,
    -1.63340: 28,   0.32197: 42,  2.85950: 56,
    -1.50796: 29,   0.47617: 43,  3.00537: 58,
    -1.37639: 30,   0.63779: 44,  3.27393: 59,
}

openness_mapping = {
    -3.27393: 24,  -1.11902: 38,  0.58331: 50,
    -2.85950: 26,  -0.97631: 39,  0.72330: 51,
    -2.63199: 28,  -0.84732: 40,  0.88309: 52,
    -2.39883: 29,  -0.71727: 41,  1.06238: 53,
    -2.21069: 30,  -0.58331: 42,  1.24033: 54,
    -2.09015: 31,  -0.45174: 43,  1.43533: 55,
    -1.97495: 32,  -0.31776: 44,  1.65653: 56,
    -1.82919: 33,  -0.17779: 45,  1.88511: 57,
    -1.68062: 34,  -0.01928: 46,  2.15324: 58,
    -1.55521: 35,   0.14143: 47,  2.44904: 59,
    -1.42424: 36,   0.29338: 48,  2.90161: 60,
    -1.27553: 37,   0.44585: 49,
}

agreeableness_mapping = {
    -3.46436: 12,  -1.34289: 34,  0.76096: 48,
    -3.15735: 16,  -1.21213: 35,  0.94156: 49,
    -3.00537: 18,  -1.07533: 36,  1.11406: 50,
    -2.90161: 23,  -0.91699: 37,  1.28610: 51,
    -2.78793: 24,  -0.76096: 38,  1.45039: 52,
    -2.70172: 25,  -0.60633: 39,  1.61108: 53,
    -2.53830: 26,  -0.45321: 40,  1.81866: 54,
    -2.35413: 27,  -0.30172: 41,  2.03972: 55,
    -2.21844: 28,  -0.15487: 42,  2.23427: 56,
    -2.07848: 29,  -0.01729: 43,  2.46262: 57,
    -1.92595: 30,   0.13136: 44,  2.75696: 58,
    -1.77200: 31,   0.28783: 45,  3.15735: 59,
    -1.62090: 32,   0.43852: 46,  3.46436: 60,
    -1.47955: 33,   0.59042: 47,
}

conscientiousness_mapping = {
    -3.46436: 17,  -1.25773: 32,  0.58489: 46,
    -3.15735: 19,  -1.13788: 33,  0.75830: 47,
    -2.90161: 20,  -1.01450: 34,  0.93949: 48,
    -2.72827: 21,  -0.89891: 35,  1.13407: 49,
    -2.57309: 22,  -0.78155: 36,  1.30612: 50,
    -2.42317: 23,  -0.65253: 37,  1.46191: 51,
    -2.30408: 24,  -0.52745: 38,  1.63088: 52,
    -2.18109: 25,  -0.40581: 39,  1.81175: 53,
    -2.04506: 26,  -0.27607: 40,  2.04506: 54,
    -1.92173: 27,  -0.14277: 41,  2.33337: 55,
    -1.78169: 28,  -0.00665: 42,  2.63199: 56,
    -1.64101: 29,   0.12331: 43,  3.00537: 57,
    -1.51840: 30,   0.25953: 44,  3.46436: 59,
    -1.38502: 31,   0.41594: 45,
}
# Build a labelled copy of the raw frame: the coded values in the four
# categorical columns below are replaced by their text labels, while raw_data
# itself is left untouched.
# NOTE(review): 'age' keeps its numeric code here even though age_mapping is
# defined - confirm whether that is intentional.
labeled_data = raw_data.copy(deep=True)
for column, mapping in (
    ('gender', gender_mapping),
    ('education', education_mapping),
    ('country', country_mapping),
    ('ethnicity', ethnicity_mapping),
):
    labels = raw_data[column].map(mapping)
    # Series.map yields NaN for a code that is missing from the table; keep
    # failing with a KeyError as a plain dict lookup would, but report the
    # column and every offending code at once.
    unmapped = raw_data.loc[labels.isna(), column].unique()
    if len(unmapped) > 0:
        raise KeyError(f"no label for {column!r} code(s): {unmapped.tolist()}")
    labeled_data[column] = labels

print(labeled_data.head())

   id      age  gender                          education country  \
0   1  0.49788  female  Professional certificate/ diploma      UK   
1   2 -0.07854    male                   Doctorate degree      UK   
2   3  0.49788    male  Professional certificate/ diploma      UK   
3   4 -0.95197  female                     Masters degree      UK   
4   5  0.49788  female                   Doctorate degree      UK   

           ethnicity  neuroticism  extraversion  openness  agreeableness  ...  \
0  Mixed-White/Asian      0.31287      -0.57545  -0.58331       -0.91699  ...   
1              White     -0.67825       1.93886   1.43533        0.76096  ...   
2              White     -0.46725       0.80523  -0.84732       -1.62090  ...   
3              White     -0.14882      -0.80615  -0.01928        0.59042  ...   
4              White      0.73545      -1.63340  -0.45174       -0.30172  ...   

   mdma  heroin  ketamine legalh  lsd meth mushrooms nicotine semeron  vsa  
0   CL0     CL0      

        id      age   gender  education  country  ethnicity  neuroticism  \
0        1  0.49788  0.48246   -0.05921  0.96082    0.12600      0.31287   
1        2 -0.07854 -0.48246    1.98437  0.96082   -0.31685     -0.67825   
2        3  0.49788 -0.48246   -0.05921  0.96082   -0.31685     -0.46725   
3        4 -0.95197  0.48246    1.16365  0.96082   -0.31685     -0.14882   
4        5  0.49788  0.48246    1.98437  0.96082   -0.31685      0.73545   
...    ...      ...      ...        ...      ...        ...          ...   
1880  1884 -0.95197  0.48246   -0.61113 -0.57009   -0.31685     -1.19430   
1881  1885 -0.95197 -0.48246   -0.61113 -0.57009   -0.31685     -0.24649   
1882  1886 -0.07854  0.48246    0.45468 -0.57009   -0.31685      1.13281   
1883  1887 -0.95197  0.48246   -0.61113 -0.57009   -0.31685      0.91093   
1884  1888 -0.95197 -0.48246   -0.61113  0.21128   -0.31685     -0.46725   

      extraversion  openness  agreeableness  ...  mdma  heroin  ketamine  \
0         -