<a id='START'></a>

# Feature Selection
This notebook creates ```feature``` and ```response``` dataframes from SQL database (or one cleaned file if done locally), then bins the reponses (drug usage frequency) into custom cohorts.

In [1]:
import pandas as pd
import pickle
from drugC_helper_funcs import create_non_light_heavy_cats

**If using SQL:**

In [2]:
connection_args = {
    'host': 'localhost',  # We are connecting to our _local_ version of psql
    'dbname': 'drug_consumption',    # DB that we are connecting to
    'port': 5432          # port we opened on AWS
}

# Import data
Access SQL database ```drug_consumption``` (see ```01_drugC_sqlDatabaseSetup.ipynb```) 
or read ```./data/drug_consumption_cleaned.pkl```.

In [3]:
sql = True

In [4]:
if sql:
    import psycopg2 as pg
    import pandas.io.sql as pd_sql
    connection = pg.connect(**connection_args)

In [5]:
if not sql:
    df = pd.read_pickle('./data/drug_consumption_cleaned.pkl')
    df

# Features DataFrame
Drop ```'country'```, ```'ethnicity'```, and ```'gender'```.

In [6]:
if sql:
    query = "SELECT id, age, education, nscore, escore, oscore, ascore, cscore, impulsiveness, ss FROM features;"
    df_features = pd.DataFrame(pd_sql.read_sql(query, connection))
    df_features = df_features.set_index(df_features['id']).drop(columns='id')
    df_features.head(3)

In [7]:
if not sql:
    all_features = df.columns[:12]
    df_features_all = df[all_features]
    # drop unnecessary features
    df_features = df_features_all.drop(columns = ['country','ethnicity','gender'])
    df_features.head(3)

# Responses Dataframe
Drop ```'semer'```, the fictitious drug. Add column tracking use of illicit drugs in the past month.

In [8]:
if sql:
    query = "SELECT * FROM drug_data;"

    df_responses = pd.DataFrame(pd_sql.read_sql(query, connection))
    df_responses = df_responses.set_index(df_responses['id']).drop(columns=['id','semer'])
    drug_list = df_responses.columns
    df_responses.head(3)

In [9]:
if not sql:
    drug_list = df.columns[12:]
    drug_list = drug_list.drop('semer')
    df_responses = df[drug_list]
    df_responses.head(3)

## Create Monthly Illicit User column

In [10]:
illicit_drugs = ['amphet','amyl','benzos','cannabis','coke','crack','ecstasy','heroin',
              'ketamine','legalh','lsd','meth','mushrooms','vsa']

nonillicit_drugs = ['alcohol','caff','choc','nicotine']

In [11]:
def determine_monthly_illicit_user(row):
    for drug in illicit_drugs:
        if (row[drug] >= 3):
            return 1
            
        else: return 0

In [12]:
df_responses['monthly_illicit_user'] = df_responses.apply(determine_monthly_illicit_user, axis=1)
df_responses['monthly_illicit_user'].value_counts()

0    1449
1     436
Name: monthly_illicit_user, dtype: int64

# Define user ranges for each drug
Monthly cohorts for all drugs.

Define ranges of drug use for each drug. Drug usage ranges from 0 to 6. EDA showed different personality types are correlated to different frequencies of drug use, so custom cohorts can be created for each drug that divide respondents into distinct personality traits.

In [13]:
# monthly cohorts
decade_cohort = [0,2]
month_cohort = [0,4]
drug_cohorts = {'alcohol': month_cohort,
                'amphet': month_cohort,
                'amyl': month_cohort,
                'benzos': month_cohort,
                'caff': month_cohort,
                'cannabis': month_cohort,
                'choc': month_cohort,
                'coke': month_cohort,
                'crack': month_cohort,
                'ecstasy': month_cohort,
                'heroin': month_cohort,
                'ketamine': month_cohort,
                'legalh': month_cohort,
                'lsd': month_cohort,
                'meth': month_cohort,
                'mushrooms': month_cohort,
                'nicotine': month_cohort,
                'vsa': month_cohort,
               }

In [14]:
for drug in drug_list:
    df_responses[drug] = df_responses[drug].apply(create_non_light_heavy_cats, cohorts=drug_cohorts[drug])
df_responses.sample(7)

Unnamed: 0_level_0,alcohol,amphet,amyl,benzos,caff,cannabis,choc,coke,crack,ecstasy,heroin,ketamine,legalh,lsd,meth,mushrooms,nicotine,vsa,monthly_illicit_user
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
860,1,1,0,1,1,1,1,1,1,1,1,1,0,0,1,0,1,0,1
1401,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1693,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1682,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
544,1,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0,1,0,0
1389,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,1
165,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


# Save responses and features as pickle
```./data/drugC_features.pkl``` and ```./data/drugC_responses.pkl```.

In [15]:
pd.to_pickle(df_features, './data/drugC_features.pkl')

In [16]:
pd.to_pickle(df_responses, './data/drugC_responses.pkl')

In [17]:
with open('./data/drugC_cohorts.pkl', 'wb') as handle:
    pickle.dump(drug_cohorts, handle, protocol=pickle.HIGHEST_PROTOCOL)

# [&#x21b8; to Top](#START) 