# Modeling

In [1]:
# Set code path: make the sibling `code` directory importable from this notebook.
import sys
import os

# dirname() of a bare file name is "", so this resolves "../code" against the
# current working directory (presumably the folder that holds this notebook).
CODE_DIR = os.path.abspath(os.path.join(os.path.dirname("Model.ipynb"), '..', 'code'))

# Guard the append so re-running this cell does not pile up duplicate entries.
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

In [2]:
from impute_and_drop import impute_missing, drop_data
from one_hot_encoding import one_hot_encoding
#from preprocess import get_preprocessed_data
import json
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Earlier loading path, currently disabled:
#data = get_preprocessed_data(data_file_name="cis2080.csv", labels_file_name="descriptive_var_names.json")
# NOTE(review): the wildcard import hides which names `utils` provides;
# `load_my_data` is presumably one of them — confirm and import it explicitly.
from utils import *
# Load the dataset that the subsetting step below indexes by column name.
data = load_my_data()

## Subset data
Select only those attributes whose metadata is considered complete.

In [4]:
# Read the variable metadata and keep the name of every variable whose
# description is not flagged as "incomplete".
with open("../metadata/descriptive_var_names.json") as metadata_file:
    var_names = json.load(metadata_file)

selected_vars = [
    entry["name"]
    for entry in var_names.values()
    if entry["description"] != "incomplete"
]

# Restrict the data to those variables, leaving out the "day", "month" and
# "year" columns.
subdata = data[selected_vars].drop(columns=["day", "month", "year"])

## Missing values imputation
### Exploration

In [5]:
# Show the subset; the rendered output elides the middle rows.
subdata

Unnamed: 0,region,population,cigarettes,cigars,drink_loc1,drink_loc2,political_espectrum,age,income,occupation,socioeconomic_condition,sex,education_level,sector,status
0,Andalusia,30000.0,,,,,4.0,17.0,125000,Skilled workers,Students,female,High school,Industry,Skilled workers
1,Andalusia,30000.0,15.0,0.0,Pubs and caffeterias,,3.0,33.0,125000,Farmers,Non-skilled workers,male,High school,Industry,Non-skilled workers
2,Andalusia,30000.0,,,,,3.0,68.0,25000,Skilled workers,Retirees and pensioners,female,Elementary school,Industry,No information
3,Andalusia,30000.0,,,,,3.0,39.0,125000,Skilled workers,Non payed housekeeping,female,Elementary school,Construction,Skilled workers
4,Andalusia,30000.0,,,Home,Pubs and caffeterias,3.0,41.0,175000,Profesionals,Technicians and middle management,male,Bachelor's degree,Servicies,Upper/Upper-middle class
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17611,La Rioja,6000.0,,,,,7.0,67.0,75000,Skilled workers,Non payed housekeeping,female,Elementary school,Industry,Skilled workers
17612,La Rioja,6000.0,,,,,,16.0,,Business owners,Students,male,High school,Agriculture,Old middle class
17613,La Rioja,6000.0,,,,,8.0,52.0,125000,Business owners,Non payed housekeeping,female,Elementary school,Agriculture,Old middle class
17614,La Rioja,6000.0,30.0,0.0,Home,Pubs and caffeterias,,26.0,,Business owners,Farmers,male,Vocational trainning,Agriculture,Old middle class


In [6]:
# Sanity check: collect the distinct missing-value markers found in each column
# to see whether more than one kind of "missing" is in use.
# (This check may not be necessary.)

missing_types = []
for column in subdata.columns:
    series = subdata[column]
    missing_types += list(series[series.isna()].unique())

print(missing_types)

[nan, nan, nan, nan, nan, <NA>, nan, nan, nan, nan, nan, nan]


In [7]:
# How common are NaNs per variable?
# The share of missing entries is computed for all columns in one vectorized
# pass (mean of the boolean mask) instead of a Python-level sum over every value.
missing_share = subdata.isna().mean()

# Report only the variables that actually have missing entries, rounded to 3 decimals.
for var_name, prop_nas in missing_share.items():
    if prop_nas != 0.0:
        print(f"{var_name}: {round(prop_nas, 3)}")

population: 0.001
cigarettes: 0.649
cigars: 0.652
drink_loc1: 0.464
drink_loc2: 0.799
political_espectrum: 0.336
age: 0.002
income: 0.319
occupation: 0.006
sex: 0.001
sector: 0.007



| name                | prop  | type | comment                              |
| ------------------- | ----- | ---- | ------------------------------------ |
| population          | 0.001 | num  | median in their region               |
| cigarettes          | 0.649 | num  | 0                                    |
| cigars              | 0.652 | num  | 0                                    |
| political_espectrum | 0.336 | num  | 34%                                  |
| age                 | 0.002 | num  | median or drop rows                  |
| income              | 0.319 | num  | 32%                                  |
| occupation          | 0.006 | cat  | N.C.                                 |
| drink_loc1          | 0.464 | cat  | N.C.                                 |
| drink_loc2          | 0.799 | cat  | N.C.                                 |
| sex                 | 0.001 | cat  | New label: "No answer"               |
| sector              | 0.007 | cat  | N.S.                                 |



In [8]:
# Run the project's imputation helper on the subset, then pass its result
# through the project's drop helper.
# NOTE(review): this rebinds `data`, which until now held the loaded dataset;
# re-running the subsetting cell afterwards would index the processed frame.
data = drop_data(impute_missing(subdata))

## One hot encoding

In [9]:
# Replace `data` with the output of the project's one-hot encoding helper.
# NOTE(review): the cell feeds `data` back into itself, so running it twice
# would encode the already-encoded frame.
data = one_hot_encoding(data)

## RandomForestClassifier

In [10]:
# Small forest: 5 trees, each capped at depth 4; random_state=1 fixes the seed
# so repeated fits give the same result.
RFC = RandomForestClassifier(n_estimators=5, max_depth=4, random_state=1)