In [None]:
! pip install optuna textstat category_encoders
! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
! unzip Data.zip

In [None]:
import os, random, optuna, textstat
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from pandas_profiling import ProfileReport

from scipy.stats import mode
from sklearn.model_selection import cross_validate, KFold, RepeatedKFold, train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, RobustScaler, OrdinalEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, balanced_accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.base import TransformerMixin

import category_encoders as ce

import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor, early_stopping, Dataset

import seaborn as sns
from matplotlib import pyplot as plt

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names; try the new name first and fall back
# to the legacy one on older installations.
try:
    plt.style.use("seaborn-v0_8-whitegrid")
except OSError:
    plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [1]:
! gdown "1d0XpolF_YMrXX_Wib6lMZVQMgBt2bpc0"

Downloading...
From: https://drive.google.com/uc?id=1d0XpolF_YMrXX_Wib6lMZVQMgBt2bpc0
To: /content/power.csv
  0% 0.00/14.2M [00:00<?, ?B/s]100% 14.2M/14.2M [00:00<00:00, 204MB/s]


In [None]:
# Set dataset directory
ROOT_DIR = "/content/Data"

# 1. Exploratory Data Analysis
- look at the dataset basics (size of the data, data types, look at a few examples etc.)
- look for any missing data
- look at target value
- look for any outliers in the data

## 1(a) Profile the dataset

In [None]:
df = pd.read_csv(os.path.join(ROOT_DIR, "power.csv"))

display(df.head())

In [None]:
print("df shape:", df.shape)

In [None]:
df.info()

In [None]:
ProfileReport(df)

From the initial dataset profiling:

## 1(b) look at the missing values
- look for any patterns in missing data
- look at some examples of missing data

In [None]:
df.isna().sum()

## 1(c) Look at the target variable

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement. Fill in the target column name below.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(df[''], kde=True, stat="density", ax=ax)

## 1(d) Look for Outliers

In [None]:
df.describe()

In [None]:
variable = ''

# Show the rows lying more than three standard deviations from the column mean.
column_values = df[variable]
center, spread = column_values.mean(), column_values.std()
above_limit = column_values > center + 3 * spread
below_limit = column_values < center - 3 * spread
df[above_limit | below_limit]

# 2. Import and Preprocess Data
- import some helper functions to do imputation and deal with data outliers
- use a function to do the data import and cleaning

In [None]:
def simple_impute(df):
    '''
    Fill missing values with simple per-column statistics.

    Numerical columns are imputed with their median; columns of dtype
    "category" are imputed with their most frequent value (mode). Either
    imputer can be swapped for another strategy (e.g. kNN or iterative).

    Returns a new frame. When the input has no "category" columns, only the
    imputed numerical columns are returned; otherwise the result is
    reindexed to the input's column order.
    '''
    source = df.copy()

    # Numerical data: median imputation
    numeric_part = source.select_dtypes("number")
    median_imputer = SimpleImputer(strategy='median')
    numeric_filled = pd.DataFrame(
        data=median_imputer.fit_transform(numeric_part),
        index=source.index,
        columns=numeric_part.columns,
    )

    category_part = source.select_dtypes("category")
    if category_part.shape[1] == 0:
        return numeric_filled

    # Categorical data: ordinal-encode, impute the mode, then decode again
    encoder = OrdinalEncoder()
    encoded = pd.DataFrame(
        data=encoder.fit_transform(category_part),
        columns=category_part.columns,
    )
    mode_imputer = SimpleImputer(strategy='most_frequent')
    decoded = encoder.inverse_transform(mode_imputer.fit_transform(encoded))
    category_filled = pd.DataFrame(
        data=decoded,
        index=source.index,
        columns=category_part.columns,
        dtype="category",
    )
    return category_filled.join(numeric_filled).reindex(columns=source.columns)

In [None]:
class ML_Impute(TransformerMixin):
    '''
    Impute missing values by treating imputation as a machine learning problem.

    Numerical columns are handled as a regression problem and categorical
    columns as a classification problem. Each column that contains missing
    values is, in turn, used as the target, with all other columns serving
    as predictors; one LightGBM model per such column is stored in
    ``self.models`` during ``fit`` and used to fill the gaps in ``transform``.

    Parameters
    ----------
    params : dict, optional
        Extra keyword arguments forwarded to every LightGBM estimator.
    '''

    def __init__(self, params=None):
        # Copy so neither a shared default nor the caller's dict is aliased.
        self.params = dict(params) if params else {}
        self.models = {}
        self.enc = None

    def _encode_categoricals(self, df, cat_cols, fit=False):
        '''Replace the categorical columns of ``df`` (in place) with ordinal codes.'''
        if not cat_cols:
            # The encoder cannot be applied to a zero-column selection.
            return
        if fit:
            encoded = self.enc.fit_transform(df[cat_cols])
        else:
            encoded = self.enc.transform(df[cat_cols])
        encoded = np.asarray(encoded)
        # Assign column by column so each categorical column is replaced
        # outright instead of being written into in place.
        for position, column in enumerate(cat_cols):
            df[column] = encoded[:, position]

    def fit(self, df):
        '''
        Train one imputation model for every column of ``df`` that has
        missing values. Returns ``self``.
        '''
        df = df.copy()
        # label encode categorical variables
        columns = df.columns.to_list()
        cat_cols = df.select_dtypes("category").columns.to_list()
        self.enc = OrdinalEncoder()
        self._encode_categoricals(df, cat_cols, fit=True)

        # Drop models from any previous fit so stale columns are not imputed.
        self.models = {}

        # Randomized column selection
        # (alternative: most null values first, np.argsort(-df.isnull().sum().values))
        for i in random.sample(range(len(columns)), len(columns)):
            column = columns[i]
            observed = df[column].notnull()
            # Check to make sure there are null values that need to be imputed
            if observed.all():
                continue
            # A train/validation split needs at least two observed rows.
            if observed.sum() < 2:
                print("Skipping Column {}: not enough observed values".format(column))
                continue

            print("Creating Imputation Model for Column: {}".format(column))

            # Train and validation data come from the rows where the column is observed
            X = df.loc[observed].drop(columns=column)
            y = df.loc[observed, column]
            X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1)

            # If we have more data, we use more estimators for the imputation model
            # (kept at >= 1 so tiny frames still produce a usable model)
            n_estimators = max(1, min(100, int(len(X_train) / 10)))
            stopping_rounds = max(1, min(20, int(len(X_train) / 10)))
            if column in cat_cols:
                model = LGBMClassifier(**self.params, verbose=-1, n_estimators=n_estimators)
            else:
                model = LGBMRegressor(**self.params, verbose=-1, n_estimators=n_estimators)

            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                      callbacks=[early_stopping(stopping_rounds)])
            print("Score of Column {} is {}".format(column, model.score(X, y)))
            self.models[column] = model

        return self

    def transform(self, df):
        '''
        Return a copy of ``df`` in which the missing values of every column
        that received a model during ``fit`` are replaced by predictions.
        '''
        df = df.copy()

        # label encode categorical variables
        cat_cols = df.select_dtypes("category").columns.to_list()
        self._encode_categoricals(df, cat_cols)

        for column, model in self.models.items():
            missing = df[column].isna()
            # Nothing to impute here; predicting on an empty frame would fail.
            if not missing.any():
                continue
            features = df.loc[missing].drop(columns=column)
            df.loc[missing, column] = model.predict(features).flatten()

        if len(cat_cols) > 0:
            # Map the ordinal codes back to the original category values.
            decoded = self.enc.inverse_transform(df[cat_cols])
            for position, column in enumerate(cat_cols):
                df[column] = decoded[:, position]

        return df

    def fit_transform(self, df):
        '''Fit the imputation models on ``df`` and return ``df`` imputed.'''
        return self.fit(df).transform(df)


In [None]:
def cap_outliers(df, variables=[None]):
    '''
    Cap extreme values at three standard deviations from the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    variables : list
        Names of the columns to cap. ``None`` entries (including the
        default ``[None]``) are ignored, so calling without column names
        returns an unchanged copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where, for each listed column, values above
        ``mean + 3*std`` or below ``mean - 3*std`` are replaced by the
        respective limit.
    '''
    df = df.copy()
    for variable in variables or []:
        if variable is None:
            continue
        center = df[variable].mean()
        spread = df[variable].std()
        df[variable] = df[variable].clip(
            lower=center - 3 * spread, upper=center + 3 * spread
        )
    return df

In [None]:
# Wrapper function to read in, encode and impute missing values for the data

def preprocess_data(df, cat_features=[None], outlier_features=[None],
                    test_size=0.2, random_state=None):
    '''
    Encode categoricals, split into train/test, impute and cap outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; it is not modified.
    cat_features : list
        Columns to convert to the "category" dtype. ``None`` entries are ignored.
    outlier_features : list
        Columns whose outliers are capped. ``None`` entries are ignored.
    test_size : float
        Fraction of rows held out as the test set.
    random_state : int, optional
        Seed for the train/test split; pass a value for a reproducible split.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    '''
    # Work on a copy so the caller's frame keeps its original dtypes.
    df = df.copy()
    cat_columns = [name for name in (cat_features or []) if name is not None]
    outlier_columns = [name for name in (outlier_features or []) if name is not None]

    # Specify categorical variables
    for name in cat_columns:
        df[name] = df[name].astype("category")
        # Add a None category for missing values. The result is assigned
        # back because add_categories returns a new object.
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")

    # create test data set
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)

    # Preprocessing: the imputation models are learned on the training rows only
    imputer = ML_Impute()
    train = imputer.fit_transform(train)
    train = cap_outliers(train, outlier_columns)

    test = imputer.transform(test)
    test = cap_outliers(test, outlier_columns)

    return train, test

In [None]:
# Now, load in and preprocess the data.
# Fill in the file name, index column and target column before running.

df = pd.read_csv(os.path.join(ROOT_DIR, ""), index_col='')
target_var = ""

# preprocess_data is the wrapper defined above (no load_data function exists).
train, test = preprocess_data(df, cat_features = [], outlier_features=[])

# 3. Create a Baseline
- specify a baseline scoring function
- create a baseline model. 
  - For regression use `LGBMRegressor` and `scoring='neg_root_mean_squared_error'` and multiply the result by `-1`
  - For classification use `LGBMClassifier` and `scoring='balanced_accuracy'`

In [None]:
def score_dataset(X, y, 
                  model=LGBMRegressor(n_estimators=1000, verbose=-1, random_state=42)
                 ):
    '''
    Cross-validate ``model`` on (X, y) with 5 folds.

    Scoring uses 'neg_root_mean_squared_error'; the sign is flipped so the
    returned numbers are positive RMSE values. Returns a dict holding the
    mean "Training" and mean "Validation" score across the folds.
    '''
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=5,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error',
        return_train_score=True,
    )
    train_rmse = -1 * np.mean(cv_results["train_score"])
    valid_rmse = -1 * np.mean(cv_results["test_score"])
    return {"Training": train_rmse, "Validation": valid_rmse}

In [None]:
X = train.copy()
y = X.pop(target_var)
X = ce.OrdinalEncoder().fit_transform(X)

score_dataset(X, y)

# 4. Featurize the Data
- remove uninformative features
- create interactions
- Binning
- Indicate Outliers
- Try different encodings for categorical variables

In [None]:
def make_mi_scores(X, y):
    '''
    Compute the mutual information between every feature in X and target y.

    Object and category columns are integer-coded with ``factorize``; every
    column with an integer dtype is then declared a discrete feature.
    Returns a Series named "MI Scores", sorted from highest to lowest.
    '''
    encoded = X.copy()
    for colname in encoded.select_dtypes(["object", "category"]).columns:
        codes, _ = encoded[colname].factorize()
        encoded[colname] = codes
    # All discrete features should now have integer dtypes
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)

In [None]:
X = train.copy()
y = X.pop(target_var)
mi_scores = make_mi_scores(X, y)
mi_scores

In [None]:
# Try removing some of the uninformative features to see if that improves scores
uninformative_features = [

]

X = train.copy()
y = X.pop(target_var)
X = X.loc[:,~X.columns.isin(uninformative_features)]
X = ce.OrdinalEncoder().fit_transform(X)

score_dataset(X, y)

For mathematical transforms, try things like log transforms:

`X['feature'] = np.log1p(X['feature'])`

For interactions, try multiplying or dividing features, especially between the levels
of a categorical feature and a continuous feature. Use subject matter expertise here.

`A_B_interaction = ce.OneHotEncoder().fit_transform(X[['A']]).mul(X['B'], axis=0)`

In [None]:
def mathematical_transforms(df):
    '''
    Placeholder for mathematically transformed features (e.g. log transforms).

    Currently adds no columns: it returns an empty DataFrame that shares
    the index of ``df``, so the result can be joined back onto ``df``.
    '''
    X = pd.DataFrame(index=df.index)

    return X

def interactions(df):
    '''
    Placeholder for interaction features between columns of ``df``.

    Currently adds no columns: it returns an empty DataFrame that shares
    the index of ``df``, so the result can be joined back onto ``df``.
    '''
    X = pd.DataFrame(index=df.index)

    return X

In [None]:
X = train.copy()
y = X.pop(target_var)
#X = X.join(mathematical_transforms(X))
#X = X.join(interactions(X))
X = ce.OrdinalEncoder().fit_transform(X)

score_dataset(X, y)

Notes:

In [None]:
def cluster_labels(df, features, n_clusters=10):
    '''
    Assign every row of ``df`` to a k-means cluster.

    The selected ``features`` are one-hot encoded and standardized before
    clustering. Returns a categorical Series named "Cluster", indexed like ``df``.
    '''
    frame = df.copy()
    encoded = ce.OneHotEncoder().fit_transform(frame.loc[:, features])
    # The small constant avoids dividing by zero for constant columns.
    standardized = (encoded - encoded.mean(axis=0)) / (encoded.std(axis=0) + 0.000001)
    clusterer = KMeans(n_clusters=n_clusters, n_init=50)
    assignments = pd.Series(
        clusterer.fit_predict(standardized), index=frame.index, name="Cluster"
    )
    return assignments.astype("category")


def cluster_distance(df, features, n_clusters=10):
    '''
    Compute the distance of every row of ``df`` to each k-means centroid.

    The selected ``features`` are one-hot encoded and standardized before
    clustering. Returns a DataFrame indexed like ``df`` with one column
    "Centroid_<i>" per cluster.
    '''
    frame = df.copy()
    encoded = ce.OneHotEncoder().fit_transform(frame.loc[:, features])
    # The small constant avoids dividing by zero for constant columns.
    standardized = (encoded - encoded.mean(axis=0)) / (encoded.std(axis=0) + 0.000001)
    clusterer = KMeans(n_clusters=n_clusters, n_init=50)
    distances = clusterer.fit_transform(standardized)
    # Label features so they can be joined to the dataset
    centroid_names = [f"Centroid_{i}" for i in range(distances.shape[1])]
    return pd.DataFrame(distances, columns=centroid_names, index=frame.index)

In [None]:
cluster_features = [
    
]

In [None]:
X = train.copy()
y = X.pop(target_var)
# Cluster on the columns listed in cluster_features (defined in the cell above).
X = X.join(cluster_distance(X, cluster_features, n_clusters=10))
X = ce.OrdinalEncoder().fit_transform(X)

score_dataset(X, y)

In [None]:
X = train.copy()
y = X.pop(target_var)
# Cluster on the columns listed in cluster_features (defined in the cell above).
X = X.join(cluster_labels(X, cluster_features, n_clusters=10))
X = ce.OrdinalEncoder().fit_transform(X)

score_dataset(X, y)

Notes:

In [None]:
def flag_outliers(df):
    '''
    Flag rows with LocalOutlierFactor.

    The frame is one-hot encoded first. Returns a Series named "Outlier"
    holding the value ``fit_predict`` assigns to each row.
    '''
    encoded = ce.OneHotEncoder().fit_transform(df.copy())
    detector = LocalOutlierFactor()
    flags = detector.fit_predict(encoded)
    return pd.Series(flags, index=encoded.index, name="Outlier")

In [None]:
X = train.copy()
y = X.pop(target_var)
X = X.join(flag_outliers(X))
X = ce.OrdinalEncoder().fit_transform(X)

score_dataset(X, y)

Notes:

# 5. Finalize Features for Final Model

In [None]:
def create_features(df, df_test=None):
    '''
    Build the final feature set.

    When ``df_test`` is given, train and test rows are stacked so the
    engineered features and the ordinal encoding are computed on both
    together, then separated again by index. Returns ``X`` alone, or the
    pair ``(X, X_test)`` when ``df_test`` is provided.
    '''
    has_test = df_test is not None

    combined = df.copy()
    if has_test:
        combined = pd.concat([combined, df_test.copy()])

    # Add in engineered features
    combined = combined.join(flag_outliers(combined))
    combined = ce.OrdinalEncoder().fit_transform(combined)

    if not has_test:
        return combined

    # Reform splits
    test_part = combined.loc[df_test.index, :]
    train_part = combined.drop(df_test.index)
    return train_part, test_part

In [None]:
X = train.copy()
X_test = test.copy()
y = X.pop(target_var)
y_test = X_test.pop(target_var)

# Build features from the frames that no longer contain the target;
# passing train/test directly would leak the target into the feature matrix.
X, X_test = create_features(X, X_test)

In [None]:
score_dataset(X, y)

# 6. Hyperparameter Tuning
- you need to specify the objective (e.g. `'regression'`, or `'multiclass'` together with `num_class`, etc.)

In [None]:
def objective(trial, X, y):
    '''
    Optuna objective: cross-validated L2 loss of LightGBM for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X, y : features and target passed to ``lgb.Dataset``.

    Returns
    -------
    float
        Mean L2 score across the folds at the last retained boosting round.
    '''
    # Specify a search space using distributions across plausible values of hyperparameters.
    param = {
        "objective": "",  # TODO: set the LightGBM objective before running
        "verbosity": -1,
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
    }

    # Run LightGBM for the hyperparameter values.
    # A fixed fold seed makes every trial score on the same splits; early
    # stopping goes through a callback, as elsewhere in this notebook.
    lgbcv = lgb.cv(param,
                   lgb.Dataset(X, label=y),
                   folds=KFold(n_splits=5, shuffle=True, random_state=42),
                   callbacks=[lgb.early_stopping(10, verbose=False)],
                   num_boost_round=100
                  )

    # The result key is 'l2-mean' or 'valid l2-mean' depending on the LightGBM version.
    metric_key = next(key for key in lgbcv if key.endswith('l2-mean'))
    cv_score = lgbcv[metric_key][-1]

    # Return metric of interest
    return cv_score

In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING) 
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, X, y), timeout=300, n_trials=5) 

In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
print(study.best_params)

In [None]:
print(study.best_value**0.5)

# 7. Fit final model and check predictions
- you need to specify the objective (e.g. `'regression'`, or `'multiclass'` together with `num_class`, etc.)

In [None]:
preds = []
skf = RepeatedKFold(n_splits=3, n_repeats=2)

# The parameters are identical for every fold, so build them once.
lgb_params = {
    'objective': '',  # TODO: set the LightGBM objective before running
    'verbose': -1,
    **study.best_params
}

for fold_idx, (train_index, valid_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    # The number of rounds is passed explicitly rather than through the
    # sklearn-style 'n_estimators' alias inside the params dict.
    model = lgb.train(lgb_params, lgb_train, num_boost_round=500,
                      valid_sets=[lgb_eval], callbacks=[lgb.early_stopping(10)])

    y_pred = model.predict(X_valid)
    # Root of the MSE: avoids the `squared=False` argument, which newer
    # scikit-learn releases no longer accept.
    score = np.sqrt(mean_squared_error(y_valid, y_pred))
    print("Fold {} RMSE Score: {}".format(fold_idx, score))
    print("----------------------")
    preds.append( model.predict(X_test))

In [None]:
# Use average for ensembling of the labels

final_preds = np.mean(preds, axis=0)

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
fig, axs = plt.subplots(1, 2, sharey=True, figsize=(16,8))
sns.histplot(y_test, kde=True, stat="density", ax=axs[0])
axs[0].set_title("Distribution of Test Target Variable")
sns.histplot(final_preds, kde=True, stat="density", ax=axs[1])
axs[1].set_title("Distribution of Predicted Target Variable")

In [None]:
# The reported value is a root mean squared error, not an accuracy.
print("Test RMSE: {}".format(np.sqrt(mean_squared_error(y_test, final_preds))))