In [372]:
import pandas as pd
data = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQtBXo5cBnDsM2fmfHPm6u72KGUS5FjPHNGMxOfYjA9-CAhmnRpwkIw_rOR3sANJIToiUU__6fbBvig/pub?gid=572763137&single=true&output=csv")

In [373]:
#finding number of null values per column
data.isnull().sum()
#filling null values with median of each column
data.fillna(data.median(numeric_only=True), inplace=True)

#Seprating Inputs and Outputs
X=data.drop(columns=['Time','Pass/Fail'])
y=data['Pass/Fail']

X.shape

(1567, 590)

In [374]:
#Dropping Constant Columns before Stadardization
X = X.loc[:, X.std()!=0]
X.shape

(1567, 474)

In [375]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)


In [376]:
#Applying logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

In [377]:
#using error rate as evaluation metrics because it was ordered to do so but observe data is imbalanced so we should opt for precision, f1 score, roc auc curve
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.8821656050955414

In [378]:
print(X_train_scaled.shape)

(1253, 474)


**Now We will apply filter based feature selection techniques to reduce number of features and still get approximately same error **

**1. Deleting Duplicate Columns **



In [379]:
def get_duplicate_columns(df):

    duplicate_columns = {}
    seen_columns = {}

    for column in df.columns:
        current_column = df[column]

        # Convert column data to bytes
        try:
            current_column_hash = current_column.values.tobytes()
        except AttributeError:
            current_column_hash = current_column.to_string().encode()

        if current_column_hash in seen_columns:
            if seen_columns[current_column_hash] in duplicate_columns:
                duplicate_columns[seen_columns[current_column_hash]].append(column)
            else:
                duplicate_columns[seen_columns[current_column_hash]] = [column]
        else:
            seen_columns[current_column_hash] = column

    return duplicate_columns
duplicate_columns = get_duplicate_columns(X_train_scaled)

In [380]:
print(duplicate_columns) # key and its corresponding values are duplicate columns
print(X_train_scaled.shape)

{'74': ['209', '342'], '206': ['347', '478']}
(1253, 474)


In [381]:
#dropping duplicate columns from train and test data

for drop_list in duplicate_columns.values():
  X_train_scaled =  X_train_scaled.drop(columns=drop_list)
  X_test_scaled =  X_test_scaled.drop(columns=drop_list)


In [382]:
print(X_train_scaled.shape)

(1253, 470)


Till Now 4 features has been dropped.

**2. Variance Threshold Method**

In [383]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.01)
sel = selector.fit(X_train_scaled)
column=X_train_scaled.columns[sel.get_support()]

X_train_scaled = sel.transform(X_train_scaled)
X_test_scaled = sel.transform(X_test_scaled)

X_train_scaled = pd.DataFrame(X_train_scaled, columns = column)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=column)
X_train_scaled.shape

(1253, 470)

Variance Threshold Method didn't make any difference since all column passed this test.

**3. Correlation Method**

In [384]:
corr_matrix = X_train_scaled.corr()
columns= corr_matrix.columns
to_drop=[]
for i in range(len(columns)):
  for j in range(i+1, len(columns)):
    if abs(corr_matrix.loc[columns[i], columns[j]])>0.9:
      to_drop.append(columns[j])

to_drop = set(to_drop) #since repeating values

X_train_scaled.drop(columns=to_drop, axis=1, inplace=True)
X_test_scaled.drop(columns=to_drop, axis=1, inplace=True)

X_train_scaled.shape

(1253, 264)

**ANOVA Method**

In [385]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

sele = SelectKBest(f_classif, k=100).fit(X_train_scaled, y_train)

# display selected feature names
X_train_scaled.columns[sele.get_support()]

columns = X_train_scaled.columns[sele.get_support()]
X_train_scaled = sele.transform(X_train_scaled)
X_test_scaledd = sele.transform(X_test_scaled)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=columns)
print(X_train_scaled.shape)


(1253, 100)


***Now Moment Of Truth : To check if error remains approximately same or not ***

In [386]:
#Applying logistic Regression
lr1 = LogisticRegression()
lr1.fit(X_train_scaled, y_train)
y_pred = lr1.predict(X_test_scaled)

In [387]:
#using error rate as evaluation metrics because it was ordered to do so but observe data is imbalanced so we should opt for precision, f1 score, roc auc curve
accuracy1 = accuracy_score(y_test, y_pred)
error = 1-accuracy1
accuracy

0.8821656050955414