In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import missingno as msno

from sklearn.impute import SimpleImputer

In [None]:
# Load the raw, semicolon-separated export. low_memory=False makes pandas
# parse the file in one pass instead of in chunks, which avoids mixed-type
# column inference.
df_raw = pd.read_csv(
    'household_power_consumption.txt',
    sep=';',
    low_memory=False,
)

In [None]:
# Build the working frame from a copy of the raw data (df_raw stays untouched):
# discard the Voltage column and give the remaining measurement columns
# shorter / more descriptive names.
df = (
    df_raw.copy()
    .drop(columns=['Voltage'])
    .rename(
        columns={
            'Global_active_power': 'Active_power',
            'Global_reactive_power': 'Reactive_power',
            'Global_intensity': 'Intensity',
            'Sub_metering_1': 'Kitchen',
            'Sub_metering_2': 'Laundry room',
            'Sub_metering_3': 'Electric water-heater',
        }
    )
)

In [None]:
# change data type
# Every column except Date/Time is converted to a number; entries that cannot
# be parsed become NaN (errors='coerce').
columns_float = df.columns.drop(['Date','Time'])
df[columns_float] = df[columns_float].apply(pd.to_numeric, errors='coerce')

# Parse dates with an explicit day-first format. The UCI household power
# consumption file stores Date as dd/mm/yyyy (NOTE(review): confirm against the
# raw file). Without a format, pandas guesses, and depending on the version it
# can read ambiguous values such as 1/2/2007 month-first, silently shifting
# rows to the wrong day. An explicit format fails loudly instead.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
# Keep only the time-of-day part as datetime.time objects.
df['Time'] = pd.to_datetime(df['Time'],  format= '%H:%M:%S').dt.time

In [None]:
# Preview the first rows after the renames and type conversions above.
df.head()

In [None]:
# df['Year'] = pd.DatetimeIndex(df['Date']).year
# df['Month'] = pd.DatetimeIndex(df['Date']).month
# df['Day'] = pd.DatetimeIndex(df['Date']).day

In [None]:
# keep only rows with '2007-01-01' <= Date < '2010-11-01'
date_from = pd.to_datetime('2007-01-01')
date_until = pd.to_datetime('2010-11-01')
in_range = (df['Date'] >= date_from) & (df['Date'] < date_until)

# .copy() gives an independent frame, so the column assignments made in later
# cells do not act on a slice of the unfiltered data (SettingWithCopyWarning).
df = df.loc[in_range].copy()

In [None]:
# # validate that the date filter kept the expected months (needs the Year/Month columns, currently commented out above)
# df.value_counts(subset =['Year','Month']).sort_index(ascending = False)

In [None]:
# check missing values

def missing_values_table(dataframe, na_name=False):
    """Print a per-column summary of missing values.

    Only columns that contain at least one null are reported. The printed
    table holds the number of nulls (``n_miss``) and their share of all rows
    in percent (``ratio``, rounded to 2 decimals), both sorted descending.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    na_name : bool, default False
        When truthy, return the names of the columns that contain nulls.

    Returns
    -------
    list or None
        Names of the columns with missing values, in frame order, when
        ``na_name`` is truthy; otherwise ``None``.
    """
    # Count nulls once and reuse the result for both the filter and the table.
    null_counts = dataframe.isnull().sum()
    na_columns = [col for col in dataframe.columns if null_counts.loc[col] > 0]

    affected = null_counts.loc[na_columns]
    n_miss = affected.sort_values(ascending=False)
    ratio = (affected / dataframe.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    if na_name:
        return na_columns
# Print the missing-value summary for the working frame; the column list is not needed here.
missing_values_table(df, na_name=False)

In [None]:
# missingno matrix plot of the frame; the trailing ';' keeps the returned
# object's repr out of the cell output.
msno.matrix(df);

In [None]:
# change nan to most_frequent (strategy: 'mean', 'median', 'most_frequent', 'constant')
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Each measurement column is imputed on its own, in this order, so the imputer
# ends up fitted on the last column exactly as before.
columns_to_impute = [
    'Active_power',
    'Reactive_power',
    'Intensity',
    'Kitchen',
    'Laundry room',
    'Electric water-heater',
]

for column in columns_to_impute:
    # fit_transform on a one-column frame returns a 2-D (n_rows, 1) array;
    # take column 0 so a plain 1-D vector is assigned back.
    df[column] = imputer.fit_transform(df[[column]])[:, 0]


In [None]:
# Preview the first rows again, now after the imputation cell above.
df.head()

In [None]:
# Print column dtypes and non-null counts of the working frame.
df.info()

In [None]:
# Summary statistics with one row per column, rounded to two decimals.
summary_stats = df.describe().T
summary_stats.apply(lambda column: round(column, 2))

In [None]:
# Fraction of missing values per column (mean of the null mask == null count / row count).
df.isna().mean()

In [None]:
# Pairwise correlation of the numeric columns, rounded to 3 decimals.
# The frame still contains Date (datetime) and Time (datetime.time objects).
# pandas >= 2.0 no longer drops non-numeric columns silently in corr() and
# raises instead, so the numeric columns are selected explicitly.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr().round(3)
correlation_matrix

In [None]:
# Annotated, square-celled heatmap of the correlation matrix, drawn with a
# 10-colour diverging palette between hues 240 and 10 on a (10, 8) figure.
sns.set(rc={'figure.figsize': (10, 8)})
color_map = sns.diverging_palette(240, 10, n=10)
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    square=True,
    cmap=color_map,
);

In [None]:
# Horizontal box plots of the working frame on the "whitegrid" theme.
sns.set_theme(style="whitegrid")
sns.boxplot(
    data=df,
    palette="Set2",
    orient="h",
);
