In [1]:
import pandas as pd
import numpy as np
import zipfile
from scipy import stats

In [2]:
parquet_file = pd.read_parquet('content/series_train.parquet')

In [3]:
parquet_file.describe()

Unnamed: 0,step,X,Y,Z,enmo,anglez,non-wear_flag,light,battery_voltage,time_of_day,weekday,quarter,relative_date_PCIAT
count,314569100.0,314569100.0,314569100.0,314569100.0,314569100.0,314569100.0,314569100.0,314569100.0,314569100.0,314569100.0,314569100.0,314569100.0,314569100.0
mean,185896.3,-0.0591091,0.02461483,-0.1529442,0.03361527,-12.51974,0.2717566,36.12054,3855.488,44610010000000.0,4.007238,2.366369,64.56994
std,117272.8,0.5199602,0.4806943,0.6506529,0.1086833,50.01252,0.4413042,180.8949,160.2819,24069450000000.0,1.976651,1.190769,90.45778
min,0.0,-8.040816,-5.429414,-8.040491,0.0,-90.0,0.0,0.0,3080.917,0.0,1.0,1.0,-143.0
25%,84673.0,-0.4676856,-0.2483811,-0.769328,0.0003109713,-51.58335,0.0,1.666667,3753.0,25040000000000.0,2.0,1.0,17.0
50%,178060.0,-0.01253703,0.007400024,-0.1876513,0.006778589,-11.39532,0.0,5.521324,3824.0,45450000000000.0,4.0,2.0,34.0
75%,280008.0,0.2541175,0.3002974,0.3387372,0.0258096,19.90912,1.0,15.33333,3976.0,64975000000000.0,6.0,4.0,74.0
max,756211.0,8.022779,7.90695,8.125557,11.3262,89.98114,1.0,20445.5,6000.0,86395000000000.0,7.0,4.0,748.0


In [4]:
parquet_file.isnull().values.any()

np.False_

dataframe does not contain any null values

In [5]:
parquet_file.columns


Index(['step', 'X', 'Y', 'Z', 'enmo', 'anglez', 'non-wear_flag', 'light',
       'battery_voltage', 'time_of_day', 'weekday', 'quarter',
       'relative_date_PCIAT', 'id'],
      dtype='object')

In [6]:
numerical_features = [
    "step",
    "X",
    "Y",
    "Z",
    "enmo",
    "anglez",
    "light",
    "battery_voltage",
    "relative_date_PCIAT",
    "time_of_day"
]

# Categorical features
categorical_features = [
    "id",
    "non-wear_flag",
    "weekday",
    "quarter"
]

In [7]:
parquet_file['id'].unique()

['00115b9f', '001f3379', '00f332d1', '01085eb3', '012cadd8', ..., 'fe9c71d8', 'fecc07d6', 'ff18b749', 'ffcd4dbd', 'ffed1dd5']
Length: 996
Categories (996, object): ['00115b9f', '001f3379', '00f332d1', '01085eb3', ..., 'fecc07d6', 'ff18b749', 'ffcd4dbd', 'ffed1dd5']

996 unique patiens in the parquet file

# Removing outliers 

In [8]:
# Make a copy of the original DataFrame
df_parquet_outliers = parquet_file.copy()
parquet_columns = df_parquet_outliers.columns

# Initialize a mask of True values for all rows (no outliers initially)
mask = pd.Series(True, index=df_parquet_outliers.index)

# Iterate over each column in columns_df
for column in parquet_columns:
    # Check if the column is numeric
    if pd.api.types.is_numeric_dtype(df_parquet_outliers[column]):
        # Calculate Q1, Q3, and IQR for the column
        q1 = df_parquet_outliers[column].quantile(0.25)
        q3 = df_parquet_outliers[column].quantile(0.75)
        IQR = q3 - q1
        lower_bound = q1 - 1.5 * IQR
        upper_bound = q3 + 1.5 * IQR

        # Update the mask: keep rows within bounds or where values are NaN
        mask &= ((df_parquet_outliers[column] >= lower_bound) & (df_parquet_outliers[column] <= upper_bound)) | df_parquet_outliers[column].isna()

# Apply the mask to filter rows, retaining the original values
df_parquet_outliers = df_parquet_outliers[mask]


In [9]:
len(df_parquet_outliers) / len(parquet_file)

0.6882702187683383

In [10]:
df_parquet_outliers

Unnamed: 0,step,X,Y,Z,enmo,anglez,non-wear_flag,light,battery_voltage,time_of_day,weekday,quarter,relative_date_PCIAT,id
4,4,0.022005,0.023763,-1.014323,0.016848,-88.130775,0.0,33.166668,4181.000000,57235000000000,4,3,41.0,00115b9f
5,5,0.023281,0.025156,-1.015599,0.016680,-88.130775,0.0,31.333334,4181.000000,57240000000000,4,3,41.0,00115b9f
6,6,0.024818,0.027109,-1.015781,0.017497,-88.130775,0.0,29.500000,4181.000000,57245000000000,4,3,41.0,00115b9f
7,7,0.022578,0.024766,-1.015026,0.015589,-88.141670,0.0,27.666666,4181.000000,57250000000000,4,3,41.0,00115b9f
8,8,0.022318,0.024792,-1.015990,0.016576,-88.130775,0.0,25.833334,4181.000000,57255000000000,4,3,41.0,00115b9f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314163266,409392,0.176080,0.446595,0.166321,0.031660,10.082080,0.0,1.826087,3421.000000,35545000000000,5,1,72.0,ffcd4dbd
314163267,409393,-0.444139,-0.140924,-0.238413,0.047348,-19.918631,0.0,1.913043,3420.833252,35550000000000,5,1,72.0,ffcd4dbd
314163268,409394,-0.102454,-0.312730,0.274058,0.048986,18.730869,0.0,2.000000,3420.666748,35555000000000,5,1,72.0,ffcd4dbd
314163269,409395,0.798383,-0.279327,0.356194,0.034082,21.289242,0.0,7.000000,3420.500000,35560000000000,5,1,72.0,ffcd4dbd


Leaving us with 68% of our original values

# Normalization

normalizing numerical features

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
df_parquet_minmax = df_parquet_outliers.sample(300, random_state=42)

Sampled 300 values, because using all values in this case would take very long

In [13]:
scaler = MinMaxScaler()
df_prquet_normalize_normalized = pd.DataFrame(scaler.fit_transform(df_parquet_minmax[numerical_features]), columns=numerical_features)
df_parquet_minmax.loc[:, numerical_features] = scaler.fit_transform(df_parquet_minmax[numerical_features])

 0.15269653 0.43769738 0.00622841 0.21274821 0.5432717  0.3499626
 0.18358271 0.54470918 0.01708954 0.73602808 0.25540176 0.13939599
 0.77985153 0.09715454 0.52671535 0.06451644 0.66637501 0.44893351
 0.63054751 0.06140517 0.22472847 0.40669988 0.12927117 0.20721511
 0.01133379 0.16792088 0.35393128 0.11064259 0.0932093  0.23661108
 0.20930101 0.4919894  0.48167708 0.21204119 0.11610733 0.32684582
 0.15194069 0.029603   0.67290419 0.48072592 0.45319711 0.140228
 0.77320125 0.70161462 0.39394229 0.098551   0.31327965 0.44461914
 0.65884193 0.18845567 0.23460525 0.57715782 0.41815476 0.50941683
 0.24408555 0.31427377 0.59253841 0.21872662 0.167999   0.35391761
 0.25225533 0.28802033 0.08199661 0.16024132 0.53459021 0.27713966
 0.14422598 0.391831   0.57864998 0.19408057 0.02238047 0.33208205
 0.11690224 0.4775795  0.07145968 0.66229109 0.44873821 0.34501347
 0.32768955 0.75659803 0.58707171 0.06770584 0.33082817 0.78169915
 0.26860075 0.56352134 0.65789859 0.0962405  0.52845751 0.5531797

In [14]:
df_parquet_minmax

Unnamed: 0,step,X,Y,Z,enmo,anglez,non-wear_flag,light,battery_voltage,time_of_day,weekday,quarter,relative_date_PCIAT,id
256530761,0.537537,1.000000,0.491299,0.412550,0.150670,0.445982,0.172222,0.000000,0.478681,0.610435,3,4,0.819005,cfcf9dc9
312509384,0.778475,0.521225,0.480940,0.026551,0.000000,0.019291,0.161111,0.000000,0.178022,0.558144,7,1,0.864253,fdf4691f
9208253,0.224273,0.345113,0.853108,0.183662,0.216841,0.279830,0.000000,0.438972,0.660549,0.660514,1,4,0.389140,06c8b2fb
276847605,0.136738,0.298960,0.536419,0.055829,0.040491,0.131144,0.000000,0.050975,0.774066,0.821639,1,3,0.316742,e3bce7e1
86135910,0.732429,0.576870,0.454035,0.980612,0.000000,0.954582,1.000000,0.000000,0.308571,0.194608,3,3,0.556561,419d05c6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243229381,0.130949,0.496685,0.494671,0.015065,0.015606,0.001446,1.000000,0.013792,0.801319,0.440575,2,3,0.321267,c55187b4
238300700,0.513440,0.471201,0.999554,0.511173,0.141480,0.510739,1.000000,0.031582,0.486593,0.899610,7,2,0.371041,c2b40d6f
20199673,0.109910,0.437395,0.991932,0.538468,0.034676,0.528796,0.000000,0.323332,0.750330,0.945321,4,1,0.597285,0eddd8e5
139131781,0.704236,0.904829,0.741307,0.375502,0.000000,0.418185,0.000000,0.202518,0.350769,0.598963,4,4,0.823529,6e9beaee


# One Hot Encoding on categorical Values

In [15]:
df_parquet_minmax.head()

Unnamed: 0,step,X,Y,Z,enmo,anglez,non-wear_flag,light,battery_voltage,time_of_day,weekday,quarter,relative_date_PCIAT,id
256530761,0.537537,1.0,0.491299,0.41255,0.15067,0.445982,0.172222,0.0,0.478681,0.610435,3,4,0.819005,cfcf9dc9
312509384,0.778475,0.521225,0.48094,0.026551,0.0,0.019291,0.161111,0.0,0.178022,0.558144,7,1,0.864253,fdf4691f
9208253,0.224273,0.345113,0.853108,0.183662,0.216841,0.27983,0.0,0.438972,0.660549,0.660514,1,4,0.38914,06c8b2fb
276847605,0.136738,0.29896,0.536419,0.055829,0.040491,0.131144,0.0,0.050975,0.774066,0.821639,1,3,0.316742,e3bce7e1
86135910,0.732429,0.57687,0.454035,0.980612,0.0,0.954582,1.0,0.0,0.308571,0.194608,3,3,0.556561,419d05c6


In [16]:
categorical_features.remove("id")


Remove id from Categorical_features list, appending time-period

In [17]:
categorical_features

['non-wear_flag', 'weekday', 'quarter']

In [18]:
import sklearn

In [19]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype



# Initialize OneHotEncoder
encoder = sklearn.preprocessing.OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df_parquet_minmax[categorical_features])

one_hot_parquetdf = pd.DataFrame(one_hot_encoded,
                          columns=encoder.get_feature_names_out(categorical_features))

print(one_hot_parquetdf.shape)
print(df_parquet_minmax.shape)

df_parquet_minmax.reset_index(drop=True, inplace=True)
one_hot_parquetdf.reset_index(drop=True, inplace=True)
df_sklearn_encoded = pd.concat([df_parquet_minmax.drop(categorical_features, axis=1), one_hot_parquetdf], axis=1)



print(df_sklearn_encoded.shape)


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


df_sklearn_encoded.head()

(300, 20)
(300, 14)
(300, 31)


Unnamed: 0,step,X,Y,Z,enmo,anglez,light,battery_voltage,time_of_day,relative_date_PCIAT,id,non-wear_flag_0.0,non-wear_flag_0.16111112,non-wear_flag_0.17222223,non-wear_flag_0.38333333,non-wear_flag_0.45,non-wear_flag_0.6888889,non-wear_flag_0.87777776,non-wear_flag_0.9611111,non-wear_flag_1.0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,quarter_1,quarter_2,quarter_3,quarter_4
0,0.537537,1.0,0.491299,0.41255,0.15067,0.445982,0.0,0.478681,0.610435,0.819005,cfcf9dc9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.778475,0.521225,0.48094,0.026551,0.0,0.019291,0.0,0.178022,0.558144,0.864253,fdf4691f,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.224273,0.345113,0.853108,0.183662,0.216841,0.27983,0.438972,0.660549,0.660514,0.38914,06c8b2fb,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.136738,0.29896,0.536419,0.055829,0.040491,0.131144,0.050975,0.774066,0.821639,0.316742,e3bce7e1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.732429,0.57687,0.454035,0.980612,0.0,0.954582,0.0,0.308571,0.194608,0.556561,419d05c6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
