# Feature Reduction

In questo file andiamo a ripulire i dataframe creati dal task precedente applicando una **riduzione delle feature**.<br>
Il metodo utilizzato maggiormente è il **KMeans**.

In [None]:
import nbimporter
from t1_ETLBasics import calories_to_df,sleep_to_df,exercise_to_df
from t3_TSToEvents import calories_to_events, sleep_to_events,exercise_to_events
import dateutil as du
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance
PATH = '../../pmdata/'

## Calories

In [None]:
# Build the calories event dataframe from the raw data under PATH.
# NOTE(review): [1] is forwarded to calories_to_df — presumably the list of
# participant ids to load; confirm against t1_ETLBasics.
cal = calories_to_events(calories_to_df(PATH,[1]))

In [None]:
# Show the calories event dataframe as the cell output.
cal

In [None]:
def calories_kmeans(df, n_clusters=2):
    """Cluster the calories events with KMeans and replace the features with a label.

    The first three columns of ``df`` (taken by position; per the original
    notes they hold the mean, the standard deviation and the sum of the
    calories) are min-max scaled to [0, 1] and clustered. The input dataframe
    is left untouched: the result is built on a copy.

    Parameters:
    - df: calories dataframe; must contain the columns 'c_mean', 'c_std'
      and 'c_sum', which are removed from the result
    - n_clusters: number of clusters passed to KMeans

    Return:
    - a new dataframe with the columns 'cluster_index' (label of each row) and
      'cluster_center' (scaled coordinates of the row's centroid) added and
      'c_mean', 'c_std', 'c_sum' dropped

    Raises:
    - ValueError: if ``df`` is empty
    """
    if df.empty:
        raise ValueError("calories_kmeans: input dataframe is empty")

    # Positional selection is kept on purpose so the clustered features are
    # exactly the ones the previous implementation used.
    features = df.iloc[:, [0, 1, 2]].to_numpy(dtype=float)

    # Min-max scaling per column. A constant column has a zero range; dividing
    # by 1 instead maps it to 0 rather than failing or producing NaN.
    lows = features.min(axis=0)
    spans = features.max(axis=0) - lows
    spans[spans == 0] = 1.0
    points = (features - lows) / spans

    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(points)
    labels = kmeans.predict(points)
    centers = kmeans.cluster_centers_

    # Work on a copy so the caller's dataframe does not gain extra columns.
    result = df.copy()
    result['cluster_index'] = labels
    result['cluster_center'] = [centers[i] for i in labels]
    return result.drop(['c_mean','c_std','c_sum'], axis=1)

In [None]:
# Cluster the calories events into 5 groups.
cal_kmeans = calories_kmeans(cal,5)

In [None]:
# Optional CSV export of the clustered calories dataframe (left disabled);
# the bare name below shows the dataframe as the cell output.
#cal_kmeans.to_csv('./dataframe/cal_kmeans.csv')
cal_kmeans

## Sleep

In [None]:
# Build the sleep event dataframe from the raw data under PATH.
# NOTE(review): [1,2,3] is forwarded to sleep_to_df — presumably the list of
# participant ids to load; confirm against t1_ETLBasics.
sleep = sleep_to_events(sleep_to_df(PATH,[1,2,3]))

In [None]:
# Show the sleep event dataframe as the cell output.
sleep

In [None]:
def print_kmeans(points,cluster_indexes):
    """Show a 3-D scatter plot of the clustered points, coloured by cluster.

    The previous version passed the third coordinate as the third positional
    argument of ``plt.scatter``, which is the marker size ``s`` and not a z
    axis. The points are now drawn on a real 3-D axes.

    Parameters:
    - points: sequence of 3-component points (x, y, z)
    - cluster_indexes: cluster label of each point, used as the colour value
    """
    # Importing Axes3D registers the '3d' projection on older matplotlib
    # releases; on recent ones it is a harmless no-op.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    coords = np.asarray(points, dtype=float)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(coords[:, 0], coords[:, 1], coords[:, 2], c=list(cluster_indexes))
    plt.show()

In [None]:
def _efficiency_label(value):
    """Map a numeric sleep efficiency to its category label."""
    # Half-open bins close the gaps the old integer-only ranges left open
    # (e.g. 95.5 used to fall through to 'Bassa'). Integer inputs and values
    # above 100 are classified exactly as before.
    if 96 <= value <= 100:
        return 'Alta'
    if 91 <= value < 96:
        return 'Media-Alta'
    if 86 <= value < 91:
        return 'Media-Bassa'
    return 'Bassa'


def sleep_kmeans(df,n_cluster=2):
    """Cluster the sleep events with KMeans and reduce the features.

    The columns at positions 1, 2 and 3 of ``df`` (per the original variable
    names: minutes asleep, minutes awake, minutes after wakeup) are min-max
    scaled to [0, 1] and clustered. The numeric 'efficiency' column is
    replaced by a category label: 'Alta' (96-100), 'Media-Alta' (91 to <96),
    'Media-Bassa' (86 to <91), 'Bassa' otherwise. The input dataframe is left
    untouched: the result is built on a copy.

    Parameters:
    - df: sleep dataframe; must contain the columns 'minutesAsleep',
      'minutesAwake', 'minutesAfterWakeup' and 'efficiency'
    - n_cluster: number of clusters passed to KMeans

    Return:
    - a new dataframe with 'cluster_index' and 'cluster_center' added, the
      three minutes columns dropped and 'efficiency' turned into a label
      (placed as the last column)

    Raises:
    - ValueError: if ``df`` is empty
    """
    if df.empty:
        raise ValueError("sleep_kmeans: input dataframe is empty")

    # Positional selection is kept on purpose so the clustered features are
    # exactly the ones the previous implementation used.
    features = df.iloc[:, [1, 2, 3]].to_numpy(dtype=float)

    # Min-max scaling per column. A constant column has a zero range; dividing
    # by 1 instead maps it to 0 rather than failing or producing NaN.
    lows = features.min(axis=0)
    spans = features.max(axis=0) - lows
    spans[spans == 0] = 1.0
    points = (features - lows) / spans

    kmeans = KMeans(n_clusters=n_cluster, random_state=0).fit(points)
    labels = kmeans.predict(points)
    centers = kmeans.cluster_centers_

    efficiency_labels = [_efficiency_label(e) for e in df['efficiency']]

    # Work on a copy so the caller's dataframe does not gain extra columns.
    result = df.copy()
    result['cluster_index'] = labels
    result['cluster_center'] = [centers[i] for i in labels]
    #print_kmeans(points,labels)
    result = result.drop(['minutesAsleep','minutesAwake','minutesAfterWakeup','efficiency'], axis=1)
    # Re-added after the drop so the label column ends up last, as before.
    result['efficiency'] = efficiency_labels
    return result


In [None]:
# Cluster the sleep events into 5 groups.
sl_kmean = sleep_kmeans(sleep,5)

In [None]:
# Optional CSV export of the clustered sleep dataframe (left disabled);
# the bare name below shows the dataframe as the cell output.
#sl_kmean.to_csv('./dataframe/sleep_kmeans.csv')
sl_kmean

## Exercise

In [None]:
# Build the exercise event dataframe from the raw data under PATH.
# NOTE(review): [2] is forwarded to exercise_to_df — presumably the list of
# participant ids to load; confirm against t1_ETLBasics.
ex = exercise_to_events(exercise_to_df(PATH,[2]))

In [None]:
# Optional CSV export of the exercise event dataframe (left disabled);
# the bare name below shows the dataframe as the cell output.
#ex.to_csv('./dataframe/exercise.csv')
ex

In [None]:
def exercise_kmeans(df,n_cluster=2):
    """Cluster the exercise events with KMeans and reduce the features.

    The columns at positions 4, 5 and 7 of ``df`` (per the original variable
    names: steps, duration and calories) are min-max scaled to [0, 1] and
    clustered. The input dataframe is left untouched: the result is built on
    a copy.

    Parameters:
    - df: exercise dataframe; must contain the columns 'steps', 'durationMin',
      'calories', 'extendHour' and 'startHour', which are removed from the
      result
    - n_cluster: number of clusters passed to KMeans

    Return:
    - a new dataframe with the columns 'cluster_index' (label of each row) and
      'cluster_center' (scaled coordinates of the row's centroid) added and
      the five columns listed above dropped

    Raises:
    - ValueError: if ``df`` is empty
    """
    if df.empty:
        raise ValueError("exercise_kmeans: input dataframe is empty")

    # Positional selection is kept on purpose so the clustered features are
    # exactly the ones the previous implementation used.
    features = df.iloc[:, [4, 5, 7]].to_numpy(dtype=float)

    # Min-max scaling per column. A constant column has a zero range; dividing
    # by 1 instead maps it to 0 rather than failing or producing NaN.
    lows = features.min(axis=0)
    spans = features.max(axis=0) - lows
    spans[spans == 0] = 1.0
    points = (features - lows) / spans

    kmeans = KMeans(n_clusters=n_cluster, random_state=0).fit(points)
    labels = kmeans.predict(points)
    centers = kmeans.cluster_centers_

    # Work on a copy so the caller's dataframe does not gain extra columns.
    result = df.copy()
    result['cluster_index'] = labels
    result['cluster_center'] = [centers[i] for i in labels]
    #print_kmeans(points,labels)
    return result.drop(['steps','durationMin','calories','extendHour','startHour'], axis=1)

In [None]:
# Cluster the exercise events into 5 groups.
ex_kmeans = exercise_kmeans(ex,5)

In [None]:
# Optional CSV export of the clustered exercise dataframe (left disabled);
# the bare name below shows the dataframe as the cell output.
#ex_kmeans.to_csv('./dataframe/exercise_kmeans.csv')
ex_kmeans