In [1]:
import pandas as pd

### Load Raw Data 

In [2]:
# Location of the merged student alcohol-consumption CSV, split across adjacent
# literals purely for readability (Python joins them into one string).
data_url = (
    'https://raw.githubusercontent.com/tahmeed14/'
    'classification-models-for-alcoholism/master/alcoholism_classification/'
    'Student%20Alcohol%20Consumption%20Merged.csv'
)

# Read the remote CSV into the untouched "raw" frame and preview the first rows.
raw_df = pd.read_csv(data_url)
raw_df.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


### Select useful columns as features for the model

In [3]:
# Raw column codes kept as model features (plus the two alcohol ratings).
selection = [
    'sex', 'age', 'famsize', 'studytime', 'romantic', 'famrel',
    'goout', 'Dalc', 'Walc', 'health', 'absences',
]

# Take every row of the raw frame, restricted to the chosen columns.
selection_df = raw_df.loc[:, selection]
selection_df.head(5)

Unnamed: 0,sex,age,famsize,studytime,romantic,famrel,goout,Dalc,Walc,health,absences
0,F,18,GT3,2,no,4,4,1,1,3,6
1,F,17,GT3,2,no,5,3,1,1,3,4
2,F,15,LE3,2,no,4,2,2,3,3,10
3,F,15,GT3,3,yes,3,2,1,1,5,2
4,F,16,GT3,2,no,4,2,1,2,5,4


### Rename the columns with more descriptive titles

In [4]:
# Descriptive names for the terse dataset column codes. Columns that are not
# listed here (age, health, absences) keep their original names.
replacements = {
    'Dalc': 'weekday drinker', 'Walc': 'weekend drinker', 'famrel': 'family relationship',
    'goout': 'social activities', 'romantic': 'significant other',
    'studytime': 'study time', 'famsize': 'household size', 'sex': 'gender'
}

# rename() returns a new frame and ignores labels missing from the mapping, so
# this cell gives the same result however many times it is run.
selection_df = selection_df.rename(columns=replacements)

selection_df.head()

Unnamed: 0,gender,age,household size,study time,significant other,family relationship,social activities,weekday drinker,weekend drinker,health,absences
0,F,18,GT3,2,no,4,4,1,1,3,6
1,F,17,GT3,2,no,5,3,1,1,3,4
2,F,15,LE3,2,no,4,2,2,3,3,10
3,F,15,GT3,3,yes,3,2,1,1,5,2
4,F,16,GT3,2,no,4,2,1,2,5,4


In [5]:
# Combined drinking score: the weekday and weekend alcohol ratings added together.
drinker = (selection_df['weekday drinker'] + selection_df['weekend drinker']).rename('drinker')

# assign() overwrites an existing 'drinker' column instead of raising on the
# name clash the way join() does, so the cell can safely be re-run.
selection_df = selection_df.assign(drinker=drinker)
selection_df.head()

Unnamed: 0,gender,age,household size,study time,significant other,family relationship,social activities,weekday drinker,weekend drinker,health,absences,drinker
0,F,18,GT3,2,no,4,4,1,1,3,6,2
1,F,17,GT3,2,no,5,3,1,1,3,4,2
2,F,15,LE3,2,no,4,2,2,3,3,10,5
3,F,15,GT3,3,yes,3,2,1,1,5,2,2
4,F,16,GT3,2,no,4,2,1,2,5,4,3


In [6]:
# Spot-check one record: the row whose index label is 1, shown as a Series
# with one entry per column.
selection_df.loc[1]

gender                   F
age                     17
household size         GT3
study time               2
significant other       no
family relationship      5
social activities        3
weekday drinker          1
weekend drinker          1
health                   3
absences                 4
drinker                  2
Name: 1, dtype: object

In [7]:
# 1-based row identifier. The Series is built on selection_df's own index so
# each id lands on its row even if the frame does not carry a default 0..n-1 index.
index_col = pd.Series(
    range(1, len(selection_df) + 1),
    index=selection_df.index,
    name='id',
)

# assign() replaces an existing 'id' column rather than failing on the name
# clash like join(), which keeps the cell re-runnable.
selection_df = selection_df.assign(id=index_col)

selection_df.head()

Unnamed: 0,gender,age,household size,study time,significant other,family relationship,social activities,weekday drinker,weekend drinker,health,absences,drinker,id
0,F,18,GT3,2,no,4,4,1,1,3,6,2,1
1,F,17,GT3,2,no,5,3,1,1,3,4,2,2
2,F,15,LE3,2,no,4,2,2,3,3,10,5,3
3,F,15,GT3,3,yes,3,2,1,1,5,2,2,4
4,F,16,GT3,2,no,4,2,1,2,5,4,3,5


### school:
    GP = 0
    MS = 1
   
### gender:
    M = 0
    F = 1
    
### household size:
    LE3 = 0
    GT3 = 1

In [8]:
# Binary encodings, declared per column so only the intended columns are touched:
#   gender:            M = 0,   F = 1
#   household size:    LE3 = 0, GT3 = 1
#   significant other: no = 0,  yes = 1
encodings = {
    'gender': {'M': 0, 'F': 1},
    'household size': {'LE3': 0, 'GT3': 1},
    'significant other': {'no': 0, 'yes': 1},
}

# map() turns any label missing from its mapping into NaN, and the int64 cast
# then raises, so an unexpected category fails loudly instead of slipping
# through as an unencoded string. The cast also makes the integer dtype explicit
# rather than depending on replace() downcasting object columns implicitly.
encoded_df = selection_df.assign(**{
    column: selection_df[column].map(mapping).astype('int64')
    for column, mapping in encodings.items()
})
encoded_df[:5]

Unnamed: 0,gender,age,household size,study time,significant other,family relationship,social activities,weekday drinker,weekend drinker,health,absences,drinker,id
0,1,18,1,2,0,4,4,1,1,3,6,2,1
1,1,17,1,2,0,5,3,1,1,3,4,2,2
2,1,15,0,2,0,4,2,2,3,3,10,5,3
3,1,15,1,3,1,3,2,1,1,5,2,2,4
4,1,16,1,2,0,4,2,1,2,5,4,3,5


### Features and Output

In [9]:
# Desired column order for the exported frames, with the identifier first.
old_columns = [
    'id',
    'gender',
    'age',
    'household size',
    'study time',
    'significant other',
    'family relationship',
    'social activities',
    'weekday drinker',
    'weekend drinker',
    'health',
    'absences',
    'drinker',
]

# The two separate alcohol ratings are left out of the final column list; the
# combined 'drinker' column stays.
reserved = ['weekend drinker', 'weekday drinker']

# Keep the order of old_columns while filtering out the reserved names.
new_columns = list(filter(lambda name: name not in reserved, old_columns))
new_columns

['id',
 'gender',
 'age',
 'household size',
 'study time',
 'significant other',
 'family relationship',
 'social activities',
 'health',
 'absences',
 'drinker']

### Re-order the columns in the dataframes and save them as csv files

In [10]:
# Put both frames into the column order held in new_columns; any column that
# is not named there is dropped from the result.
selection_df = selection_df.loc[:, new_columns]
encoded_df = encoded_df.loc[:, new_columns]

In [11]:
# Destination file for each processing stage. The row index is not written
# because the processed frames carry their own 'id' column.
exports = {
    '../res/raw.csv': raw_df,
    '../res/selection.csv': selection_df,
    '../res/encoded.csv': encoded_df,
}

for csv_path, frame in exports.items():
    frame.to_csv(csv_path, index=False)

## Database

In [12]:
import sqlite3

# Path of the SQLite database file. Despite the .sql suffix, sqlite3.connect
# treats it as a database file, not as a script of SQL statements.
db_path = '../res/tables.sql'

# Open the database and get a cursor for the statements run in later cells.
con = sqlite3.connect(db_path)
cur = con.cursor()

In [13]:
# Columns stored in each table; 'id' appears in both so rows can be matched up.
profile_columns = ['id', 'gender', 'age', 'household size', 'family relationship', 'health']
activities_columns = ['id', 'study time', 'social activities', 'drinker', 'absences']

profile_values = encoded_df[profile_columns]
activities_values = encoded_df[activities_columns]

# ndarray.tolist() converts every element to a built-in Python scalar. Passing
# the raw NumPy rows would hand sqlite3 numpy.int64 objects, which it does not
# bind as INTEGER values.
profile_rows = profile_values.to_numpy().tolist()
activities_rows = activities_values.to_numpy().tolist()

# NOTE(review): nothing in this notebook creates the `profile` and `activities`
# tables; presumably they already exist in the database file. Confirm, and note
# that re-running this cell appends the same rows again.
try:
    # As a context manager the connection commits if both inserts succeed and
    # rolls back if either one raises, so the two tables stay in step.
    with con:
        cur.executemany('INSERT INTO profile VALUES (?, ?, ?, ?, ?, ?)', profile_rows)
        cur.executemany('INSERT INTO activities VALUES (?, ?, ?, ?, ?)', activities_rows)
finally:
    # Release the database file even when an insert fails.
    con.close()