# Classifying semester grades with regard to alcohol consumption and family situation among young people

# Problem justification

#### necessary imports

In [110]:
from msrest.exceptions import HttpOperationError
from azureml.core import Workspace, Datastore, Dataset
from azureml.data.dataset_type_definitions import PromoteHeadersBehavior
import pandas as pd
import os

# Clean dataset locally

First we need to make some changes to the input data to make the dataset computable. It is not easy to apply these changes and load the data directly online, so it is more convenient to apply the operations locally and then upload the data to the cloud.

So let's load the data.

In [111]:
# Read the raw CSV into a DataFrame; the cleaning cells below operate on it.
raw_csv_path = 'original_data/mat.csv'
school_dataset = pd.read_csv(raw_csv_path)

First we list all variables and decide which are relevant and which are not

In [112]:
# show all available columns of the dataset
school_dataset.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

Let's mark the data that we don't want to analyze during classification:
* Mjob, Fjob - mother's and father's job - dropped because they are not properly identifiable and cannot be easily distinguished and classified
* reason - why they picked this school - irrelevant too
* nursery - attended nursery school - I don't want to take that into consideration



In [113]:
# Disabled: the marked columns are not dropped from the local DataFrame here;
# the same columns are removed further down with `drop_columns` on the dataset
# loaded from the datastore.
# NOTE(review): as written, this call would also discard its result (no
# assignment and no inplace=True), so re-enabling it alone would change nothing.
# school_dataset.drop(['Mjob', 'Fjob', 'reason', 'nursery'], axis=1)

Let's convert other values to numeric representations

In [114]:
# school - binary, because only two schools participated in the study
# GP - 1, MS - 0
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place form
# acts on a temporary object and does not update the DataFrame when pandas
# Copy-on-Write is active.
school_dataset['school'] = school_dataset['school'].replace({"GP": 1, "MS": 0})
school_dataset['school'].unique()

array([1, 0])

In [115]:
# sex: F (female) - 1, M (male) - 0
# Assign the replaced column back rather than using a chained
# replace(..., inplace=True), which does not update the DataFrame when pandas
# Copy-on-Write is active.
school_dataset['sex'] = school_dataset['sex'].replace({"F": 1, "M": 0})
school_dataset['sex'].unique()

array([1, 0])

In [116]:
# address: U (urban) - 1, R (rural) - 0
# Assign the replaced column back rather than using a chained
# replace(..., inplace=True), which does not update the DataFrame when pandas
# Copy-on-Write is active.
school_dataset['address'] = school_dataset['address'].replace({"U": 1, "R": 0})
school_dataset['address'].unique()

array([1, 0])

In [117]:
# famsize - family size indicator:
# LE3 - 0 (less than or equal to 3), GT3 - 1 (greater than 3)
# Assign the replaced column back rather than using a chained
# replace(..., inplace=True), which does not update the DataFrame when pandas
# Copy-on-Write is active.
school_dataset['famsize'] = school_dataset['famsize'].replace({"GT3": 1, "LE3": 0})
school_dataset['famsize'].unique()

array([1, 0])

In [118]:
# Pstatus: T - 1 (parents living together), A - 0 (parents living apart)
# Assign the replaced column back rather than using a chained
# replace(..., inplace=True), which does not update the DataFrame when pandas
# Copy-on-Write is active.
school_dataset['Pstatus'] = school_dataset['Pstatus'].replace({"T": 1, "A": 0})
school_dataset['Pstatus'].unique()

array([0, 1])

In [119]:
# guardian: father - 1, mother - 2, other - 0
# Assign the replaced column back rather than using a chained
# replace(..., inplace=True), which does not update the DataFrame when pandas
# Copy-on-Write is active.
school_dataset['guardian'] = school_dataset['guardian'].replace(
    {"father": 1, "mother": 2, "other": 0}
)
school_dataset['guardian'].unique()

array([2, 1, 0])

### Convert yes/no information to 1/0 accordingly

* schoolsup - extra educational support: yes - 1, no - 0
* famsup - family educational support: yes - 1, no - 0
* paid - extra paid classes within the course subject: yes - 1, no - 0
* activities - extra-curricular activities: yes - 1, no - 0
* higher - wants to take higher education: yes - 1, no - 0
* internet - has internet access at home: yes - 1, no - 0
* romantic - in a romantic relationship: yes - 1, no - 0
* nursery - attended nursery school: yes - 1, no - 0 (converted here as well, although the column is dropped later)

In [120]:
# Columns whose values are the strings "yes"/"no"; each is recoded to 1/0.
yes_no_to_numeric = [
    'schoolsup', 'famsup', 'paid', 'activities',
    'higher', 'internet', 'romantic', 'nursery'
]

for col in yes_no_to_numeric:
    # Assign the replaced column back rather than using a chained
    # replace(..., inplace=True), which does not update the DataFrame when
    # pandas Copy-on-Write is active.
    school_dataset[col] = school_dataset[col].replace({"yes": 1, "no": 0})
    # Show the distinct values left in the column as a quick sanity check.
    print(school_dataset[col].unique())

[1 0]
[0 1]
[0 1]
[0 1]
[1 0]
[0 1]
[0 1]
[1 0]


## other dataset columns description

* Medu - Mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education, or 4 - higher education)
* Fedu - Father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education, or 4 - higher education)
* traveltime - Home to school travel time (numeric: 1 - &lt;15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - &gt;1 hour)
* studytime - Weekly study time (numeric: 1 - &lt;2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - &gt;10 hours)
* failures - Number of past class failures (numeric: n if 1&lt;=n&lt;3, else 4)
* famrel - Quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
* freetime - Free time after school (numeric: from 1 - very low to 5 - very high)
* goout - Going out with friends (numeric: from 1 - very low to 5 - very high)
* Dalc - Workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
* Walc - Weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
* health - Current health status (numeric: from 1 - very bad to 5 - very good)
* absences - Number of school absences (numeric: from 0 to 93)
* G1 - First period grade (numeric: from 0 to 20)
* G2 - Second period grade (numeric: from 0 to 20)

## OUTPUT VARIABLE

* G3 - Final grade (numeric: from 0 to 20, output target)
 ---
 
#### Alternatively, could we predict alcohol consumption (Dalc, Walc) based on the first two period grades?


In [121]:
# Save the cleaned data locally on disk.
# Create the target folder first so the write does not fail on a fresh checkout.
os.makedirs('dataset', exist_ok=True)
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra unnamed first column and travels with the data as a
# spurious feature.
school_dataset.to_csv('dataset/mat.csv', index=False)

# Configuration

In this section we specify which optional actions the notebook performs (such as uploading the dataset to the cloud) and which local and remote paths it uses.

In [122]:
# When True, the local dataset folder is uploaded to the cloud datastore.
send_dataset_to_cloud = True

# Local folder that holds the data files used in this notebook.
local_dataset_source = 'dataset'

# Folder in the cloud datastore: upload target and the place data is read from.
upstream_dataset_path = 'datasets/tabular/'

In [123]:
# Log in to the Microsoft account and connect to the configured Azure ML workspace.
workspace = Workspace.from_config()

The `from_config` function loads the `config.json` file from the directory the notebook is run from.

File looks like the following example

```
{
    "subscription_id": "<your azure subscription id>",
    "resource_group": "<resource group name where your AML resource is placed>",
    "workspace_name": "<AML workspace name>"
}
```

# Dataset from/to cloud

In [124]:
# Get the default AML workspace datastore; it is used both to upload new
# datasets and to read existing ones.
datastore = workspace.get_default_datastore()

# --- Disabled alternative -------------------------------------------------
# The commented-out code below would instead get (or register) a blob datastore
# whose name and credentials come from environment variables.
# blob_datastore_name=os.getenv("BLOB_DATASTORE_NAME") 
# account_name=os.getenv("BLOB_ACCOUNTNAME") # Storage account name
# container_name=os.getenv("BLOB_CONTAINER") # Name of Azure blob container
# account_key=os.getenv("BLOB_ACCOUNT_KEY") # Storage account key

# try:
#     datastore = Datastore.get(workspace, blob_datastore_name)
#     print("Found Blob Datastore with name: %s" % blob_datastore_name)
# except HttpOperationError:
#     datastore = Datastore.register_azure_blob_container(
#        workspace=workspace,
#        datastore_name=blob_datastore_name,
#        account_name=account_name, # Storage account name
#        container_name=container_name, # Name of Azure blob container
#        account_key=account_key) # Storage account key
#     print("Registered blob datastore with name: %s" % blob_datastore_name)

# NOTE(review): the snippet below refers to `blob_datastore`, while the code
# above assigns `datastore`, and `DataReference` is not imported in the imports
# cell - both need fixing before this is re-enabled.
# blob_data_ref = DataReference(
#    datastore=blob_datastore,
#    data_reference_name="blob_test_data",
#    path_on_datastore="testdata")

In [125]:
# Optionally push the local dataset folder to the datastore before reading.
if send_dataset_to_cloud:
    datastore.upload(src_dir=local_dataset_source,
                     target_path=upstream_dataset_path,
                     overwrite=True,
                     show_progress=True)

# (datastore, remote path) pairs of the files that make up the dataset.
# The 'por.csv' entry stays disabled:
#     (datastore, upstream_dataset_path + 'por.csv')
datastore_paths = [(datastore, f'{upstream_dataset_path}mat.csv')]

# Build a tabular dataset from the comma-separated file(s) in the cloud blob.
school_dataset = Dataset.Tabular.from_delimited_files(
    path=datastore_paths,
    separator=',',
    header=PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS,
)

Uploading an estimated of 2 files
Uploading dataset/mat.csv
Uploaded dataset/mat.csv, 1 files out of an estimated total of 2
Uploading dataset/por.csv
Uploaded dataset/por.csv, 2 files out of an estimated total of 2
Uploaded 2 files


#### Drop irrelevant columns

In [128]:
# Columns marked earlier as not relevant for the classification.
irrelevant_columns = ['Mjob', 'Fjob', 'reason', 'nursery']
school_dataset = school_dataset.drop_columns(irrelevant_columns)

# Split dataset into train and test subsets

In [129]:
# Split with ratio 0.9 and a fixed seed so the split is repeatable; the two
# returned parts are used as the train and test subsets respectively.
split_parts = school_dataset.random_split(0.9, seed=1)
train_dataset, test_dataset = split_parts

### Check for remote AML compute instances

In [130]:
# Compute-related classes from the Azure ML SDK.
from azureml.core.compute import AmlCompute, ComputeTarget

# Display the compute targets currently registered in the workspace.
ComputeTarget.list(workspace)

[]