**Note:** Run this file only once. To run it again, first delete [**'res/tables.sql'**](../res).

In [1]:
import pandas as pd

### Load Raw Data 

In [2]:
# Merged student alcohol-consumption CSV, read directly from GitHub.
# To read a local copy instead, point data_url at '../res/raw.csv'.
data_url = (
    'https://raw.githubusercontent.com/tahmeed14/'
    'classification-models-for-alcoholism/master/'
    'alcoholism_classification/'
    'Student%20Alcohol%20Consumption%20Merged.csv'
)
raw_df = pd.read_csv(data_url)
raw_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [3]:
# List every column label present in the raw dataset.
raw_df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

### Select useful columns as features for the model

In [4]:
# Columns of raw_df that are kept as candidate features for the model.
selection = [
    'school', 'sex', 'age', 'famsize',
    'studytime', 'failures', 'romantic',
    'famrel', 'freetime', 'goout',
    'Dalc', 'Walc', 'health', 'absences',
]

selection_df = raw_df.loc[:, selection]
selection_df.head()

Unnamed: 0,school,sex,age,famsize,studytime,failures,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,18,GT3,2,0,no,4,3,4,1,1,3,6
1,GP,F,17,GT3,2,0,no,5,3,3,1,1,3,4
2,GP,F,15,LE3,2,3,no,4,3,2,2,3,3,10
3,GP,F,15,GT3,3,0,yes,3,2,2,1,1,5,2
4,GP,F,16,GT3,2,0,no,4,3,2,1,2,5,4


In [5]:
# Show the distinct values that occur in the 'school' column.
set(selection_df['school'])

{'GP', 'MS'}

### Rename the columns with more descriptive titles

In [6]:
# Map the dataset's terse column codes to descriptive titles. Columns that are
# not listed here keep their current names.
replacements = {
    'Dalc': 'weekday drinker',
    'Walc': 'weekend drinker',
    'famrel': 'family relationship',
    'goout': 'social activities',
    'romantic': 'significant other',
    'freetime': 'free time',
    'studytime': 'study time',
    'famsize': 'household size',
    'sex': 'gender',
}

# rename() returns a new frame and leaves unlisted columns untouched, so no
# hand-written pass over the column index is needed.
selection_df = selection_df.rename(columns=replacements)

selection_df.head()

Unnamed: 0,school,gender,age,household size,study time,failures,significant other,family relationship,free time,social activities,weekday drinker,weekend drinker,health,absences
0,GP,F,18,GT3,2,0,no,4,3,4,1,1,3,6
1,GP,F,17,GT3,2,0,no,5,3,3,1,1,3,4
2,GP,F,15,LE3,2,3,no,4,3,2,2,3,3,10
3,GP,F,15,GT3,3,0,yes,3,2,2,1,1,5,2
4,GP,F,16,GT3,2,0,no,4,3,2,1,2,5,4


In [7]:
# Combined alcohol score: the weekday and weekend drinking columns added
# together row by row.
drinker = selection_df['weekday drinker'] + selection_df['weekend drinker']

# Binary label: 1 when the combined score is 4 or more, otherwise 0.
# The vectorised comparison keeps selection_df's own index, so the join below
# lines up row-for-row however the frame happens to be indexed.
encoded_drinker = (drinker >= 4).astype('int64').rename('drinker')

selection_df = selection_df.join(encoded_drinker)
selection_df.head()

Unnamed: 0,school,gender,age,household size,study time,failures,significant other,family relationship,free time,social activities,weekday drinker,weekend drinker,health,absences,drinker
0,GP,F,18,GT3,2,0,no,4,3,4,1,1,3,6,0
1,GP,F,17,GT3,2,0,no,5,3,3,1,1,3,4,0
2,GP,F,15,LE3,2,3,no,4,3,2,2,3,3,10,1
3,GP,F,15,GT3,3,0,yes,3,2,2,1,1,5,2,0
4,GP,F,16,GT3,2,0,no,4,3,2,1,2,5,4,0


In [8]:
# Display the row labelled 1 as a Series, one line per column.
selection_df.loc[1]

school                  GP
gender                   F
age                     17
household size         GT3
study time               2
failures                 0
significant other       no
family relationship      5
free time                3
social activities        3
weekday drinker          1
weekend drinker          1
health                   3
absences                 4
drinker                  0
Name: 1, dtype: object

In [9]:
# Attach a 1-based 'id' as an ordinary column (it is not made the index).
row_count = len(selection_df)
index_col = pd.Series(range(1, row_count + 1), name='id')
selection_df = selection_df.join(index_col)

selection_df.head()

Unnamed: 0,school,gender,age,household size,study time,failures,significant other,family relationship,free time,social activities,weekday drinker,weekend drinker,health,absences,drinker,id
0,GP,F,18,GT3,2,0,no,4,3,4,1,1,3,6,0,1
1,GP,F,17,GT3,2,0,no,5,3,3,1,1,3,4,0,2
2,GP,F,15,LE3,2,3,no,4,3,2,2,3,3,10,1,3
3,GP,F,15,GT3,3,0,yes,3,2,2,1,1,5,2,0,4
4,GP,F,16,GT3,2,0,no,4,3,2,1,2,5,4,0,5


### school:
    GP = 0
    MS = 1
   
### gender:
    M = 0
    F = 1
    
### household size:
    LE3 = 0
    GT3 = 1

In [10]:
# Integer codes for each categorical column, keyed by column name so that a
# code is only ever applied to the column it was defined for (a frame-wide
# replace would rewrite a matching value in any column).
encodings = {
    'significant other': {'no': 0, 'yes': 1},
    'school': {'GP': 0, 'MS': 1},
    'household size': {'LE3': 0, 'GT3': 1},
    'gender': {'M': 0, 'F': 1},
}

# infer_objects() converts any column left as object dtype after the
# replacement into a proper numeric dtype.
encoded_df = selection_df.replace(encodings).infer_objects()
encoded_df[:5]

Unnamed: 0,school,gender,age,household size,study time,failures,significant other,family relationship,free time,social activities,weekday drinker,weekend drinker,health,absences,drinker,id
0,0,1,18,1,2,0,0,4,3,4,1,1,3,6,0,1
1,0,1,17,1,2,0,0,5,3,3,1,1,3,4,0,2
2,0,1,15,0,2,3,0,4,3,2,2,3,3,10,1,3
3,0,1,15,1,3,0,1,3,2,2,1,1,5,2,0,4
4,0,1,16,1,2,0,0,4,3,2,1,2,5,4,0,5


### Features and Output

In [11]:
# NOTE(review): the next cell assigns `columns` again before anything reads
# it, so this ordering is not the one that gets applied.
columns = [
    'id', 'gender', 'age', 'household size', 'study time', 'failures',
    'significant other', 'family relationship', 'free time',
    'social activities', 'absences', 'health',
    'weekday drinker', 'weekend drinker', 'drinker',
]

### Re-order the columns in the dataframes and save them as csv files

In [12]:
# Final column order shared by both frames; 'school' is not in the list, so it
# is dropped from both at this point.
columns = [
    'id', 'gender', 'age', 'significant other', 'household size',
    'family relationship', 'free time', 'social activities',
    'study time', 'failures', 'absences', 'health',
    'weekday drinker', 'weekend drinker', 'drinker',
]
selection_df = selection_df.loc[:, columns]
encoded_df = encoded_df.loc[:, columns]

In [13]:
# Preview the first rows of the human-readable frame.
selection_df.head()

Unnamed: 0,id,gender,age,significant other,household size,family relationship,free time,social activities,study time,failures,absences,health,weekday drinker,weekend drinker,drinker
0,1,F,18,no,GT3,4,3,4,2,0,6,3,1,1,0
1,2,F,17,no,GT3,5,3,3,2,0,4,3,1,1,0
2,3,F,15,no,LE3,4,3,2,2,3,10,3,2,3,1
3,4,F,15,yes,GT3,3,2,2,3,0,2,5,1,1,0
4,5,F,16,no,GT3,4,3,2,2,0,4,5,1,2,0


In [14]:
# Preview the first rows of the integer-encoded frame.
encoded_df.head()

Unnamed: 0,id,gender,age,significant other,household size,family relationship,free time,social activities,study time,failures,absences,health,weekday drinker,weekend drinker,drinker
0,1,1,18,0,1,4,3,4,2,0,6,3,1,1,0
1,2,1,17,0,1,5,3,3,2,0,4,3,1,1,0
2,3,1,15,0,0,4,3,2,2,3,10,3,2,3,1
3,4,1,15,1,1,3,2,2,3,0,2,5,1,1,0
4,5,1,16,0,1,4,3,2,2,0,4,5,1,2,0


In [15]:
# Write each stage of the pipeline to its own CSV file, without the row index.
exports = (
    ('../res/raw.csv', raw_df),
    ('../res/selection.csv', selection_df),
    ('../res/encoded.csv', encoded_df),
)
for csv_path, frame in exports:
    frame.to_csv(csv_path, index=False)

## Database

In [24]:
import sqlite3
import numpy as np

# Let sqlite3 store NumPy integer scalars by converting them to Python ints.
for numpy_int in (np.int32, np.int64):
    sqlite3.register_adapter(numpy_int, int)

# Open the SQLite database file and grab a cursor for the cells that follow.
con = sqlite3.connect('../res/tables.sql')
cur = con.cursor()

### Create database tables

In [25]:
# Columns of encoded_df that go into each table; 'id' appears in both.
profile_columns = ['id', 'gender', 'age', 'significant other','household size', 'family relationship', 'health']
activities_columns = ['id', 'study time', 'absences', 'free time', 'social activities', 'failures', 'weekday drinker', 'weekend drinker', 'drinker']

# Drop tables left over from an earlier run so this cell can be executed again
# without failing on "table ... already exists".
cur.execute('DROP TABLE IF EXISTS profile')
cur.execute('DROP TABLE IF EXISTS activities')

cur.execute(
    'CREATE TABLE profile ( id INT NOT NULL PRIMARY KEY ASC, '
    'gender NUMERIC, age NUMERIC, "significant other" NUMERIC, '
    '"household size" NUMERIC, "family relationship" NUMERIC, health NUMERIC)'
)
cur.execute(
    'CREATE TABLE activities (id INT NOT NULL PRIMARY KEY ASC, '
    '"study time" INT, absences INT, "free time" INT, '
    '"social activities" INT, failures INT, '
    '"weekday drinker" INT, "weekend drinker" INT, drinker INT)'
)
con.commit()

### Populate tables

In [26]:
# Slice the encoded frame into the column sets of the two tables.
profile_values = encoded_df[profile_columns]
activities_values = encoded_df[activities_columns]

try:
    # `with con` commits once both inserts succeed and rolls back if either
    # one raises, so the tables are never left half-populated.
    with con:
        cur.executemany('INSERT INTO profile VALUES (?, ?, ?, ?, ?, ?, ?)', profile_values.to_numpy())
        cur.executemany('INSERT INTO activities VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', activities_values.to_numpy())
finally:
    # Release the database file whether or not the inserts succeeded.
    con.close()