__Libraries__

In [11]:
import numpy as np
import pandas as pd
import random
from lda import LDA
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing._data import _handle_zeros_in_scale
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score

__Reading CSV__

In [2]:
# Load the training and the test CSV files (training first) into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/aps_failure_training_set.csv',
        'data/aps_failure_test_set.csv',
    )
)

__Replace 'na' Placeholders With NaN And Encode Labels__

In [3]:
# Encode the target labels numerically: 'pos' -> 1, 'neg' -> 0.
df_train['class'] = df_train['class'].replace(['pos','neg'],[1,0])
# The literal string 'na' marks a missing value; turn it into a real NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_train = df_train.replace('na',np.nan)

# NOTE(review): label encoding is left disabled for the test frame --
# presumably the test CSV carries no 'class' column; confirm against the data.
#df_test['class'] = df_test['class'].replace(['pos','neg'],[1,0])
df_test = df_test.replace('na',np.nan)

__Deleting Features With Zero Variance__

In [4]:
# Cast both frames to float so that per-column statistics can be computed.
df_train = df_train.astype(float)
df_test = df_test.astype(float)

# Decide which columns are constant from the TRAINING data only and drop the
# same columns from both frames. Deciding separately per frame could remove
# different columns from train and test and leave the two misaligned.
train_std = df_train.std()
zero_variance_features = train_std.index[train_std == 0].tolist()
for feature in zero_variance_features:
    print('The feature with zero variance is : ',feature)

df_train = df_train.drop(columns=zero_variance_features)
# errors='ignore': a dropped column may exist only in the training frame.
df_test = df_test.drop(columns=zero_variance_features, errors='ignore')
df_test.shape

The feature with zero variance is :  cd_000
The feature with zero variance is :  cd_000


(16000, 170)

__Deleting Duplicates__

In [5]:
# Remove exact duplicate rows from the training data, keeping the first one.
df_train = df_train.drop_duplicates(keep = 'first')

# Detect duplicated columns on the TRAINING data only (first occurrence is
# kept) and drop them by name from both frames, so train and test keep the
# same feature set. Using `duplicated()` on the transpose also avoids
# transposing the whole frame back again.
duplicate_columns = df_train.columns[df_train.T.duplicated().to_numpy()].tolist()
df_train = df_train.drop(columns=duplicate_columns)
print(df_train.shape)

# NOTE(review): duplicate test rows are still removed, as before; any row
# removed here will not receive a prediction -- confirm this is intended.
df_test = df_test.drop_duplicates(keep = 'first')
# errors='ignore': a dropped column may exist only in the training frame.
df_test = df_test.drop(columns=duplicate_columns, errors='ignore')
print(df_test.shape)

(60000, 171)
(16000, 170)


__Calculating Missing Values__

In [6]:
# Number of missing entries per feature column (the 'class' label is left
# out), stored as a dict ordered from the most to the least incomplete.
null_counts = dict(df_train.drop(columns='class').isna().sum())
missing_feature_count = {
    feature: null_counts[feature]
    for feature in sorted(null_counts, key=null_counts.get, reverse=True)
}

In [7]:
# Partition the features by the share of training rows in which they are
# missing: above 20% they are marked for removal, otherwise they are kept
# for median imputation. `model_imp_features` is only initialised here.
total_rows = df_train.shape[0]
features_tobe_eliminated = []
median_imp_features = []
model_imp_features = []
for feature_name, n_missing in missing_feature_count.items():
    if n_missing/total_rows > 0.20:
        bucket = features_tobe_eliminated
    else:
        bucket = median_imp_features
    bucket.append(feature_name)

print("Features to be eliminated : ",features_tobe_eliminated)
print("Number of features to be eliminated : ",len(features_tobe_eliminated))
print("\nFeatures for median imputaton : ",median_imp_features)
print("Number of features for median imputaton : ",len(median_imp_features))

Features to be eliminated :  ['br_000', 'bq_000', 'bp_000', 'bo_000', 'ab_000', 'cr_000', 'bn_000', 'bm_000', 'bl_000', 'bk_000', 'ad_000', 'cf_000', 'cg_000', 'ch_000', 'co_000', 'ct_000', 'cu_000', 'cv_000', 'cx_000', 'cy_000', 'cz_000', 'da_000', 'db_000', 'dc_000']
Number of features to be eliminated :  24

Features for median imputaton :  ['ec_00', 'cm_000', 'cl_000', 'ed_000', 'ak_000', 'ca_000', 'dm_000', 'df_000', 'dg_000', 'dh_000', 'dl_000', 'dj_000', 'dk_000', 'eb_000', 'di_000', 'ac_000', 'bx_000', 'cc_000', 'bd_000', 'ds_000', 'dt_000', 'dp_000', 'dq_000', 'dr_000', 'du_000', 'dv_000', 'bc_000', 'cp_000', 'de_000', 'do_000', 'dy_000', 'ef_000', 'ar_000', 'bz_000', 'dx_000', 'dz_000', 'ea_000', 'eg_000', 'be_000', 'dd_000', 'ce_000', 'ax_000', 'ae_000', 'af_000', 'av_000', 'bf_000', 'bs_000', 'cb_000', 'bu_000', 'bv_000', 'cq_000', 'dn_000', 'ba_000', 'ba_001', 'ba_002', 'ba_003', 'ba_004', 'ba_005', 'ba_006', 'ba_007', 'ba_008', 'ba_009', 'cn_000', 'cn_001', 'cn_002', 'cn_

__Separating Features And Labels (Train & Test)__

In [8]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

# Separate the target column from the predictors of the training frame.
y_train = df_train['class']
X_train = df_train.drop(columns='class')

# X_test is the very same object as df_test (no copy is made), so the
# in-place index reset below renumbers df_test as well.
X_test = df_test

# Give all three objects a fresh 0..n-1 index.
for frame in (X_train, y_train, X_test):
    frame.reset_index(drop=True, inplace=True)

print(10*'='+" Train Data "+10*'=')
for shape in (X_train.shape, y_train.shape):
    print(shape)

print(10*'='+" Test Data "+10*'=')
print(X_test.shape)

(60000, 170)
(60000,)
(16000, 170)


In [9]:
# Remove the features listed in `features_tobe_eliminated` from both the
# training and the test predictors, then report the resulting shapes.
X_train_droped, X_test_droped = (
    frame.drop(columns=features_tobe_eliminated)
    for frame in (X_train, X_test)
)

print(10*'='+" Train Data "+10*'=')
print(X_train_droped.shape)
print(10*'='+" Test Data "+10*'=')
print(X_test_droped.shape)

(60000, 146)
(16000, 146)


__Scaling__

In [12]:
import numpy as np
from scipy import stats, sparse
from sklearn.utils.validation import check_array, FLOAT_DTYPES

class myRobustScaler:
    """Scale features with statistics that are robust to outliers.

    ``fit`` learns, for every column, the median (``center_``) and the
    distance between the two percentiles given by ``quantile_range``
    (``scale_``). ``transform`` subtracts the former and divides by the
    latter. NaN entries are accepted: they are ignored while fitting
    (``nanmedian`` / ``nanpercentile``).

    Parameters
    ----------
    with_centering : bool
        Subtract the per-column median. Not possible for sparse input.
    with_scaling : bool
        Divide by the per-column percentile range.
    quantile_range : tuple of two floats in [0, 100]
        Lower and upper percentile used to compute ``scale_``.
    copy : bool
        Forwarded to ``check_array`` in ``transform``.
    unit_variance : bool
        Additionally divide ``scale_`` by the width of the same percentile
        range of a standard normal distribution.
    """

    def __init__(self, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True, unit_variance=False):
        self.with_centering = with_centering
        self.with_scaling = with_scaling
        self.quantile_range = quantile_range
        self.unit_variance = unit_variance
        self.copy = copy
        self.center_ = None
        self.scale_ = None

    def fit(self, X):
        """Learn ``center_`` and ``scale_`` from ``X`` and return ``self``.

        Raises
        ------
        ValueError
            If ``quantile_range`` is not ordered within [0, 100], or if
            centering is requested for a sparse matrix.
        """
        # NOTE(review): `force_all_finite` is the spelling used by the file;
        # newer scikit-learn releases rename it -- confirm the pinned version.
        X = check_array(X, accept_sparse="csc", dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        q_min, q_max = self.quantile_range
        if not 0 <= q_min <= q_max <= 100:
            raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))

        is_sparse = sparse.issparse(X)

        if self.with_centering:
            if is_sparse:
                raise ValueError("Cannot center sparse matrices: use `with_centering=False` instead.")
            self.center_ = np.nanmedian(X, axis=0)
        else:
            self.center_ = None

        if self.with_scaling:
            quantiles = []
            for feature_idx in range(X.shape[1]):
                if is_sparse:
                    # Rebuild the dense column: stored values first, the
                    # remaining positions stay zero.
                    column_nnz_data = X.data[X.indptr[feature_idx] : X.indptr[feature_idx + 1]]
                    column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)
                    column_data[: len(column_nnz_data)] = column_nnz_data
                else:
                    column_data = X[:, feature_idx]

                quantiles.append(np.nanpercentile(column_data, self.quantile_range))

            # Shape (2, n_features): row 0 = lower, row 1 = upper percentile.
            quantiles = np.transpose(quantiles)

            self.scale_ = quantiles[1] - quantiles[0]
            # scikit-learn helper; presumably replaces zero ranges so that the
            # division in transform() is safe -- see its documentation.
            self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
            if self.unit_variance:
                adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0)
                self.scale_ = self.scale_ / adjust
        else:
            self.scale_ = None

        # Returning self allows `scaler.fit(X).transform(X)` chaining.
        return self

    def transform(self, X):
        """Return ``X`` centered and scaled with the fitted statistics.

        For sparse input only the scaling step is applied.

        Raises
        ------
        ValueError
            If a statistic required by the configuration has not been
            fitted yet.
        """
        needs_center = self.with_centering and self.center_ is None
        needs_scale = self.with_scaling and self.scale_ is None
        if needs_center or needs_scale:
            raise ValueError("Scaler has not been fitted. Call fit() first.")

        X = check_array(X, accept_sparse=("csr", "csc"), copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        if sparse.issparse(X):
            if self.with_scaling:
                # Imported here because the file never imports this helper,
                # which made the sparse path fail with a NameError.
                from sklearn.utils.sparsefuncs import inplace_column_scale

                inplace_column_scale(X, 1.0 / self.scale_)
        else:
            if self.with_centering:
                X -= self.center_
            if self.with_scaling:
                X /= self.scale_
        return X

    def fit_transform(self, X):
        """Fit on ``X`` and return the transformed ``X``."""
        return self.fit(X).transform(X)

# Learn the centering/scaling statistics from the training predictors only,
# then apply those same statistics to both the training and the test data.
scaler = myRobustScaler()
scaler.fit(X_train_droped)

X_train_scaled = scaler.transform(X_train_droped)
X_test_scaled = scaler.transform(X_test_droped)

__Median Imputation__

In [13]:
class MedianImputer:
    """Replace NaN entries column-wise with a statistic learned in ``fit``.

    Parameters
    ----------
    strategy : {'mean', 'median'}
        Statistic computed per column (ignoring NaNs) and used as the fill
        value. Note that the default is 'mean' despite the class name.

    Attributes
    ----------
    statistics_ : numpy.ndarray or None
        One fill value per column; ``None`` until ``fit`` has been called.
    """

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.statistics_ = None

    @staticmethod
    def _as_float_array(X):
        """Return ``X`` as an ndarray with a floating dtype.

        Accepts anything ``numpy.asarray`` understands (ndarray, DataFrame,
        nested lists). Floating dtypes are kept; others become float64.
        """
        X = np.asarray(X)
        if not np.issubdtype(X.dtype, np.floating):
            X = X.astype(float)
        return X

    def fit(self, X):
        """Compute the per-column fill values from ``X`` and return ``self``.

        Raises
        ------
        ValueError
            If ``strategy`` is neither 'mean' nor 'median'.
        """
        if self.strategy not in ('mean', 'median'):
            raise ValueError("Invalid strategy. Please use 'mean' or 'median'.")

        X = self._as_float_array(X)
        reducer = np.nanmean if self.strategy == 'mean' else np.nanmedian
        self.statistics_ = reducer(X, axis=0)
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose NaNs are replaced by the fill values.

        The input itself is never modified.

        Raises
        ------
        ValueError
            If the imputer has not been fitted, or if ``X`` is not 2-D with
            as many columns as the data seen in ``fit``.
        """
        if self.statistics_ is None:
            raise ValueError("Imputer has not been fitted. Call fit() first.")

        X = self._as_float_array(X)
        n_fitted = np.size(self.statistics_)
        if X.ndim != 2 or X.shape[1] != n_fitted:
            raise ValueError(
                "X has shape %s but the imputer was fitted on %d features."
                % (X.shape, n_fitted)
            )

        X_imputed = X.copy()
        # Broadcast the per-column statistics over every NaN position at once.
        np.copyto(X_imputed, self.statistics_, where=np.isnan(X_imputed))
        return X_imputed

    def fit_transform(self, X):
        """Fit on ``X`` and return the imputed copy of ``X``."""
        return self.fit(X).transform(X)

In [14]:
# Learn the per-column medians on the scaled training data and use them to
# fill the missing values of both the training and the test data.
median_imputer = MedianImputer(strategy='median')
X_train_median = median_imputer.fit_transform(X_train_scaled)
X_test_median = median_imputer.transform(X_test_scaled)

__LDA__

In [15]:
# Fit the LDA model (imported from the `lda` module) on the imputed training
# features and their labels, requesting 5 components.
lda = LDA(n_components=5)
lda.fit(X_train_median, y_train)

# Project both data sets with the fitted model and keep only the real part.
# NOTE(review): presumably transform() can return complex values -- confirm.
x_train_lda, x_test_lda = (
    np.real(lda.transform(features))
    for features in (X_train_median, X_test_median)
)

__Oversampling With SMOTE__

In [16]:
def nearest_neighbour(X, n_neighbors=100):
    """Return, for every row of ``X``, the indices of its nearest rows in ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points that are both indexed and queried.
    n_neighbors : int, default 100
        Number of neighbours requested per point. It is capped at the number
        of rows in ``X``, because asking for more neighbours than there are
        samples makes the query fail.

    Returns
    -------
    numpy.ndarray
        Neighbour indices as returned by ``NearestNeighbors.kneighbors``,
        found with Euclidean distance on a KD-tree.
    """
    k = min(n_neighbors, len(X))
    nbs = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='kd_tree').fit(X)
    # Only the indices are needed; the distances are discarded.
    _, indices = nbs.kneighbors(X)
    return indices

def SMOTE_100(X):
    """Create one synthetic row for every row of ``X``.

    For each row, the rows returned by ``nearest_neighbour`` form its
    neighbourhood; every feature of the synthetic row is drawn independently
    with ``random.choice`` from that feature's values in the neighbourhood.

    ``X`` is indexed with an integer index array, so it is expected to be a
    2-D NumPy array. Returns a list with one list of feature values per row.
    """
    synthetic_rows = []
    for neighbour_ids in nearest_neighbour(X):
        neighbourhood = X[neighbour_ids]
        synthetic_rows.append([
            random.choice(neighbourhood[:, column])
            for column in range(neighbourhood.shape[1])
        ])
    return synthetic_rows

def apply_SMOTE(X_train, Y_train, num_iterations=1):
    """Oversample the class labelled 1 by appending synthetic rows.

    In every iteration one synthetic row is generated (via ``SMOTE_100``) for
    each row currently labelled 1 -- including rows synthesised in earlier
    iterations -- so the number of label-1 rows doubles per iteration.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature matrix (ndarray, DataFrame or nested sequence).
    Y_train : array-like of shape (n_samples,)
        Labels; rows equal to 1 are the ones being oversampled.
    num_iterations : int, default 1
        Number of oversampling rounds. With 0 the inputs are returned as is.

    Returns
    -------
    tuple
        ``(X, y)`` as NumPy arrays with the synthetic rows and their labels
        (1.0) appended at the end.

    Raises
    ------
    KeyError
        If no row is labelled 1.
    """
    for _ in range(num_iterations):
        # Work on plain arrays so every array-like input is handled alike.
        X_train = np.asarray(X_train)
        Y_train = np.asarray(Y_train)

        minority_samples = X_train[Y_train == 1]
        minority_count = len(minority_samples)
        if minority_count == 0:
            # KeyError keeps the exception type of the former label lookup.
            raise KeyError("no samples with label 1 to oversample")

        sampled_instances = np.asarray(SMOTE_100(minority_samples))
        X_train = np.concatenate((X_train, sampled_instances), axis=0)
        Y_train = np.concatenate((Y_train, np.ones(minority_count)), axis=0)

    return X_train, Y_train

# Run six oversampling rounds on the LDA-projected training data, then show
# how many rows each class has afterwards.
x_train_lda_final, y_train_final = apply_SMOTE(
    X_train=x_train_lda,
    Y_train=y_train,
    num_iterations=6,
)
pd.DataFrame(y_train_final).value_counts()

1.0    64000
0.0    59000
Name: count, dtype: int64

__Best Model: SGDClassifier Tuned With Grid Search__

In [17]:
# Candidate hyper-parameters; every combination is evaluated by the search.
param_grid = dict(
    alpha=[0.0001, 0.001, 0.01, 0.1, 1],
    loss=['hinge', 'log_loss', 'modified_huber'],
    class_weight=[None, 'balanced'],
    n_jobs=[-1],
    random_state=[42],
    penalty=['l1', 'l2', 'elasticnet'],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the oversampled training data.
sgd_classifier = SGDClassifier()
grid_search = GridSearchCV(
    sgd_classifier,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(x_train_lda_final, y_train_final)

# Report the winning combination and its cross-validated accuracy.
for label, value in (
    ("Best Parameters: ", grid_search.best_params_),
    ("Best Accuracy: ", grid_search.best_score_),
):
    print(label, value)

# Predict the labels of the LDA-projected test data with the fitted search.
y_pred = grid_search.predict(x_test_lda)

Best Parameters:  {'alpha': 0.0001, 'class_weight': None, 'loss': 'hinge', 'n_jobs': -1, 'penalty': 'l1', 'random_state': 42}
Best Accuracy:  0.9589024390243901


In [18]:
# Assemble the submission: predicted class first, then the test-set id.
y_pred_df = pd.DataFrame({'class': y_pred})
# NOTE(review): df_test was cast to float earlier, so the ids are written as
# floats -- confirm that this is the expected format.
y_pred_df['id'] = df_test['id']
# Translate the numeric predictions back to the original string labels.
y_pred_df['class'] = y_pred_df['class'].replace(to_replace=[1,0], value=['pos','neg'])
y_pred_df.to_csv('predicted_labels.csv', index=False)