# Feature Engineering & Feature Extraction

## Attaching Resources

### Importing Necessary Libraries

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import warnings

warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (15,8)
plt.rcParams['figure.dpi'] = 250
sns.set_style(style='darkgrid')
plt.tight_layout()
%matplotlib inline

<Figure size 3750x2000 with 0 Axes>

### Adding Dataset

In [2]:
# Read EDA.csv from the project's GitHub repository into a DataFrame.
ep = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/kunal-mallick/Energy_Production/main/Data%20Preprocessing%20%26%20EDA/EDA.csv'
)

# Preview the loaded frame.
ep

Unnamed: 0,temperature,exhaust_vacuum,amb_pressure,r_humidity,energy_production,log_temp,sqrt_temp,log_ev,sqrt_ev,log_ap,sqrt_ap,log_rh,sqrt_rh
0,9.59,38.56,1017.01,60.10,481.30,2.260721,3.096773,3.652215,6.209670,6.924622,31.890594,4.096010,7.752419
1,12.04,42.34,1019.72,94.67,465.36,2.488234,3.469870,3.745732,6.506919,6.927283,31.933055,4.550397,9.729851
2,13.87,45.08,1024.42,81.69,465.48,2.629728,3.724245,3.808439,6.714164,6.931882,32.006562,4.402932,9.038252
3,13.72,54.30,1017.89,79.08,467.05,2.618855,3.704052,3.994524,7.368853,6.925487,31.904388,4.370460,8.892694
4,15.14,49.64,1023.78,75.00,463.58,2.717340,3.891015,3.904797,7.045566,6.931257,31.996562,4.317488,8.660254
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9522,17.10,49.69,1005.53,81.82,457.32,2.839078,4.135215,3.905804,7.049113,6.913270,31.710093,4.404522,9.045441
9523,24.73,65.34,1015.42,52.80,446.92,3.208017,4.972927,4.179604,8.083316,6.923058,31.865655,3.966511,7.266361
9524,30.44,56.24,1005.19,56.24,429.34,3.415758,5.517246,4.029628,7.499333,6.912932,31.704732,4.029628,7.499333
9525,23.00,66.05,1020.61,80.29,421.57,3.135494,4.795832,4.190412,8.127115,6.928156,31.946987,4.385645,8.960469


## Splitting The Data Into Target and Independent set

In [3]:
# `energy_production` is the target; every remaining column of `ep` is a feature.
y = ep.loc[:, 'energy_production']
x = ep.drop('energy_production', axis=1)

In [4]:
x

Unnamed: 0,temperature,exhaust_vacuum,amb_pressure,r_humidity,log_temp,sqrt_temp,log_ev,sqrt_ev,log_ap,sqrt_ap,log_rh,sqrt_rh
0,9.59,38.56,1017.01,60.10,2.260721,3.096773,3.652215,6.209670,6.924622,31.890594,4.096010,7.752419
1,12.04,42.34,1019.72,94.67,2.488234,3.469870,3.745732,6.506919,6.927283,31.933055,4.550397,9.729851
2,13.87,45.08,1024.42,81.69,2.629728,3.724245,3.808439,6.714164,6.931882,32.006562,4.402932,9.038252
3,13.72,54.30,1017.89,79.08,2.618855,3.704052,3.994524,7.368853,6.925487,31.904388,4.370460,8.892694
4,15.14,49.64,1023.78,75.00,2.717340,3.891015,3.904797,7.045566,6.931257,31.996562,4.317488,8.660254
...,...,...,...,...,...,...,...,...,...,...,...,...
9522,17.10,49.69,1005.53,81.82,2.839078,4.135215,3.905804,7.049113,6.913270,31.710093,4.404522,9.045441
9523,24.73,65.34,1015.42,52.80,3.208017,4.972927,4.179604,8.083316,6.923058,31.865655,3.966511,7.266361
9524,30.44,56.24,1005.19,56.24,3.415758,5.517246,4.029628,7.499333,6.912932,31.704732,4.029628,7.499333
9525,23.00,66.05,1020.61,80.29,3.135494,4.795832,4.190412,8.127115,6.928156,31.946987,4.385645,8.960469


## Performing Feature Selection

### Performing Mutual Info

In [8]:
# Mutual information between each feature column of `x` and the target `y`.
# The estimator adds small random noise to continuous features, so the seed is
# fixed to make the scores reproducible across kernel restarts.
mutual_scores = mutual_info_regression(x, y, random_state=42)
mutual_scores

array([1.16066482, 0.94671904, 0.27537857, 0.11639387, 1.16069129,
       1.1580981 , 0.95167558, 0.94804752, 0.27533268, 0.27519508,
       0.11489959, 0.11502426])

In [9]:
# Lay the scores out as a single row, one column per feature name.
mutual_info_test = pd.DataFrame(mutual_scores.reshape(1, -1), columns=x.columns)

mutual_info_test

Unnamed: 0,temperature,exhaust_vacuum,amb_pressure,r_humidity,log_temp,sqrt_temp,log_ev,sqrt_ev,log_ap,sqrt_ap,log_rh,sqrt_rh
0,1.160665,0.946719,0.275379,0.116394,1.160691,1.158098,0.951676,0.948048,0.275333,0.275195,0.1149,0.115024


### Performing Tree Based method

In [11]:
# Fit a regression tree on all features to read off impurity-based importances.
# The feature set holds raw, log and sqrt versions of the same variables, which
# produce equally good candidate splits; the tree breaks such ties using its
# random state, so the seed is fixed to keep the importances reproducible.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(x, y)

In [12]:
dt.feature_importances_

array([0.08519241, 0.00963103, 0.00728644, 0.00471706, 0.74121238,
       0.07802015, 0.01977381, 0.02967347, 0.00658193, 0.00613053,
       0.00475293, 0.00702787])

In [None]:
# One row per feature: its name, mutual-information score and tree importance.
# 'MI' must be the 1-D score array (`mutual_scores`); the 1x12 DataFrame
# `mutual_info_test` cannot be used as a single column.
imp_cols = pd.DataFrame({
    'Features' : x.columns,
    'MI' : mutual_scores,
    'IG' : dt.feature_importances_
})

imp_cols

In [None]:
#imp_cols.to_csv('Feature Selection.csv', index=False)

### Comparing All Results

In [None]:
# Transpose `chi_test`, name its single column 'chi_test', and show it sorted
# from highest to lowest.
# NOTE(review): `chi_test` is not defined in any cell visible in this notebook,
# so this cell presumably raises NameError on a fresh kernel — confirm where it
# should come from, or remove the cell.
r = chi_test.T
r.columns = ['chi_test']

r.sort_values(by='chi_test', ascending=False)

In [None]:
# Transpose `rfe_df`, name its single column 'rfe', and show it sorted in
# ascending order.
# NOTE(review): `rfe_df` is not defined in any cell visible in this notebook,
# so this cell presumably raises NameError on a fresh kernel — confirm where it
# should come from, or remove the cell.
r2 = rfe_df.T
r2.columns = ['rfe']

r2.sort_values(by='rfe')

In [None]:
# Bar chart of the 'chi_test' column of `r`, highest value first.
r.sort_values(by='chi_test', ascending=False).plot(kind='bar')

In [None]:
# Bar chart of the 'rfe' column of `r2`, lowest value first.
r2.sort_values(by='rfe').plot(kind='bar')

In [None]:
# Horizontal bars of tree importance ('IG'), most important feature on top.
imp_by_ig = imp_cols.sort_values(by='IG', ascending=False)
sns.barplot(x=imp_by_ig['IG'], y=imp_by_ig['Features'])

### Picking Features After Feature Engineering

In [None]:
# Load a second CSV into `ch` and list its column names.
# NOTE(review): this is an absolute, machine-specific Windows path into a
# `Churn_Prediction` checkout, whereas the dataset loaded at the top of this
# notebook comes from the `Energy_Production` repository — confirm this is the
# intended file and replace it with a portable path.
ch = pd.read_csv(r'C:\Users\Kunal Mallick\Documents\GitHub\Churn_Prediction\Data Preprocessing & EDA\eda.csv')

ch.columns

#### Filtering Columns

In [None]:
# Keep a fixed set of `ch` columns, selected by position. This rebinds `x`,
# replacing the feature frame built from `ep` earlier, while `y` still comes
# from `ep`.
# NOTE(review): the positions go up to 73, but `ep` has only 13 columns, so
# these indices presumably belong to a different dataset — confirm the intended
# columns and prefer selecting them by name.
x = ch.iloc[:,[1,2,3,5,7,8,14,70,71,72,73]]

x

## Normalizing The Dataset

In [None]:
# Learn per-column mean and standard deviation from `x`, then standardise it.
# The fitted scaler `sc` is kept for later cells.
sc = StandardScaler()
sc.fit(x)
xtrain = pd.DataFrame(sc.transform(x), columns=x.columns)

In [None]:
xtrain

In [None]:
# Scale the test features with the statistics already learned by `sc`.
# Calling fit_transform here would refit the scaler on the test data, so train
# and test would be standardised with different means and variances.
# NOTE(review): `xtest_raw` is not defined in any cell visible in this notebook —
# confirm where the train/test split happens.
xtest = pd.DataFrame(sc.transform(xtest_raw), columns=xtest_raw.columns)

In [None]:
xtest

## Performing Dimensionality Reduction

### Creating PCA With Default Value

In [None]:
# Fit a PCA with default settings on the scaled training features and display
# the projected data; the fitted `pca_def` is inspected in the next cells.
pca_def = PCA()
pcs_def = pca_def.fit_transform(xtrain)
pcs_def

In [None]:
pca_def.explained_variance_ratio_

In [None]:
np.cumsum(pca_def.explained_variance_ratio_)

- The cumulative explained variance above indicates that about 99% of the variance is captured by the first principal component alone; we will nevertheless keep 8 components (`n_components = 8`).

### Creating PCA With 8 Components For Xtrain

In [None]:
# Fit an 8-component PCA on the scaled training features and keep both the
# fitted model (`pca_8`) and the projected training data (`pcs_8`).
pca_8 = PCA(n_components = 8)
pcs_8 = pca_8.fit_transform(xtrain)
pcs_8

#### Creating DataFrame

In [None]:
def col_pc_n(n):
    """Return the column labels 'pc0', 'pc1', ..., 'pc<n-1>' as a list.

    Parameters
    ----------
    n : int
        Number of labels to generate.

    Returns
    -------
    list of str
        `n` labels, numbered from 0; empty when `n` is 0 or negative.
    """
    return ['pc' + str(idx) for idx in range(n)]

In [None]:
# Wrap the projected training data in a DataFrame with columns pc0..pc7.
wn_scaled_pca = pd.DataFrame(data=pcs_8, columns=col_pc_n(8))

wn_scaled_pca

### Creating PCA With 8 Components For Xtest

In [None]:
# Project the test set onto the components already learned from `xtrain`.
# Creating and fitting a new PCA on the test data would give different axes,
# so the pc0..pc7 columns of train and test would not be comparable.
pcs_8_xtest = pca_8.transform(xtest)
pcs_8_xtest

#### Creating DataFrame

In [None]:
def col_pc_n(n):
    """Build the list of column labels 'pc0' through 'pc<n-1>'.

    `n` is the number of labels wanted; the result is a list of `n` strings,
    numbered from 0.
    """
    # NOTE(review): identical in behaviour to the col_pc_n defined earlier in
    # this notebook; this redefinition is redundant.
    return list(map('pc{}'.format, range(0, n)))

In [None]:
# Wrap the projected test data in a DataFrame with columns pc0..pc7.
wn_scaled_pca_xtest = pd.DataFrame(data=pcs_8_xtest, columns=col_pc_n(8))

wn_scaled_pca_xtest

## Exporting Dataframe

In [None]:
# Write the PCA-projected feature frames and the targets to CSV files in the
# current working directory, without the index column.
# NOTE(review): `ytrain` and `ytest` are not defined in any cell visible in this
# notebook — confirm where the target split is created.
wn_scaled_pca.to_csv('xtrain.csv', index=False)
wn_scaled_pca_xtest.to_csv('xtest.csv', index=False)
ytrain.to_csv('ytrain.csv', index=False)
ytest.to_csv('ytest.csv', index=False)