In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
import qnorm
from sklearn.decomposition import PCA
from scipy import stats
from scipy.linalg import norm

## Corelation function 

##pcimp = array of feature importance
#i1 = number of components
#X = the feature dataset
#x_pca (PCA-transformed data), and met (a method indicator).
# X1 is just normal normalised data and then we applying pca and then 
#obtaining feature importance and then we finding correlation between the both feature importance once obtained on apply
#random forest to without pca applied data and second on pca applied data


def corell(pcimp, i1, X, x_pca, met):
    r = X.shape[0]  # Number of samples (rows) in x
    corr_matrix = np.zeros((r, i1))

    for i in range(X.shape[1]):
        for j in range(i1):
            cosine = np.dot(X.iloc[:,i], x_pca[:,j]) / (norm(X.iloc[:,i]) * norm(x_pca[:,j]))
            corr_matrix[i][j] = cosine

    res = [0] * X.shape[1]

    if met == 1:
        for ii in range(X.shape[1]):
            feat1 = 0
            for j in range(i1):
                feat1 = feat1 + pcimp[j] * corr_matrix[ii][j]
            temp = 0
            for k in range(i1):
                temp = temp + corr_matrix[ii][k]
            res[ii] = abs(feat1) / temp

    if met == 2:
        for ii in range(X.shape[1]):
            feat1 = 0
            for j in range(i1):
                feat1 = feat1 + pcimp[j] * corr_matrix[ii][j]
            temp = 0
            for k in range(i1):
                temp = temp + corr_matrix[ii][k]
            res[ii] = abs(feat1) / temp

    return res

# Read and preprocess the data
df = pd.read_csv("divorce.csv")
# df = df.fillna(0)

X = df.drop(["Class"], axis=1)
X = qnorm.quantile_normalize(X)
y = df["Class"]

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

feature_scores1 = pd.Series(rf_model.feature_importances_, index=X.columns)
# print(feature_scores1)

arr = [0.1, 0.2, 0.4, 0.6, 0.8]
temp = 0

idf2 = df
df3 = df
y = idf2['Class']
X1 = idf2.drop(['Class'], axis=1)
X1 = qnorm.quantile_normalize(X1)

dftemp = pd.DataFrame(columns=['Sparseness', 'Components', 'Normal', 'BackNormal', 'BackSparse', 'Other'])

for spar in arr:

    #generates a sparse version of the original data by randomly setting some values to zero based on the specified sparseness level.
    data = idf2.values[:, :-1]
    mask = np.random.choice([True, False], size=data.shape, p=[1-spar, spar])
    data_sparse = np.where(mask, data, 0)
    df3.iloc[:, :-1] = data_sparse

    X = df3.drop(['Class'], axis=1)
    X = qnorm.quantile_normalize(X)
    y = df3['Class']
    rf_model.fit(X, y)
    sparse_x = pd.Series(rf_model.feature_importances_, index=X.columns)

    for i1 in range(2, 20, 4):
        scaled_data = X1
        pca = PCA(svd_solver='full', n_components=i1)
        pca.fit(scaled_data)
        x_pca = pca.transform(scaled_data)
        rfc = RandomForestRegressor(n_estimators=100, random_state=42)
        rfc.fit(x_pca, y)
        feature_importances = rfc.feature_importances_
        res = corell(feature_importances, i1, X1, x_pca, 1)
        # print(res)

        scaled_data1 = X
        pca.fit(scaled_data1)
        x_pca1 = pca.transform(scaled_data1)
        rfc1 = RandomForestRegressor(n_estimators=100, random_state=42)
        rfc1.fit(x_pca1, y)
        feature_importances1 = rfc1.feature_importances_
        res1 = corell(feature_importances1, i1, X, x_pca1, 1)
## corrcoef function computes the Pearson correlation coefficient
        normalsp = abs(np.corrcoef(sparse_x, feature_scores1)[0, 1])
        backn = np.corrcoef(feature_scores1, res)[0, 1]
        backsp = np.corrcoef(feature_scores1, res1)[0, 1]
        backs2 = np.corrcoef(res, res1)[0, 1]

        new_row = {'Sparseness': spar*100, 'Components': i1, 'Normal': normalsp, 'BackNormal': backn, 'BackSparse': backsp, 'Other': backs2}
        dftemp.loc[temp] = [spar*100, i1, normalsp, backn, backsp, backs2]

        temp += 1
# Print the results
print(dftemp)

    Sparseness  Components    Normal  BackNormal  BackSparse     Other
0         10.0         2.0  0.763366   -0.050189    0.004522 -0.919001
1         10.0         6.0  0.763366    0.014423   -0.049652 -0.150622
2         10.0        10.0  0.763366   -0.043012   -0.021776 -0.215758
3         10.0        14.0  0.763366    0.034009   -0.053629  0.091261
4         10.0        18.0  0.763366   -0.041555   -0.016149 -0.237108
5         20.0         2.0  0.519404   -0.050189   -0.015851 -0.094355
6         20.0         6.0  0.519404    0.014423    0.025816  0.054784
7         20.0        10.0  0.519404   -0.043012    0.057684  0.069827
8         20.0        14.0  0.519404    0.034009    0.086577  0.202069
9         20.0        18.0  0.519404   -0.041555   -0.032961  0.186521
10        40.0         2.0  0.535644   -0.050189   -0.061933 -0.026177
11        40.0         6.0  0.535644    0.014423   -0.025637 -0.133832
12        40.0        10.0  0.535644   -0.043012    0.037322  0.013260
13    