Experiment 4: Feature Selection Techniques — Correlation-Based Feature Selection


In [None]:
# Import Dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Read the tweets CSV (latin-1 encoded) into a DataFrame and report its shape
df = pd.read_csv(
    '/content/Coronavirus Tweets.csv',
    encoding='latin-1',
)
df.shape

(41157, 6)

In [None]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16/03/20,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16/03/20,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16/03/20,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16/03/20,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16/03/20,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
# Collect the names of every integer / floating-point column in the dataset
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_features = df.select_dtypes(include=numerics).columns.tolist()

In [None]:
# Keep only the numerical columns
data = df.loc[:, numerical_features]

In [None]:
# (rows, columns) of the numerical subset
data.shape

(41157, 2)

In [None]:
# Preview the first rows of the numerical subset
data.head()

Unnamed: 0,UserName,ScreenName
0,3799,48751
1,3800,48752
2,3801,48753
3,3802,48754
4,3803,48755


In [None]:
# Feature matrix: the numerical frame without the 'UserName' / 'ScreenName' columns.
# NOTE(review): these are the only two columns in `data`, so X ends up with
# zero feature columns and every downstream correlation step is empty.
X = data.drop(columns=['UserName', 'ScreenName'])
X.shape

(41157, 0)

In [None]:
# Target vector: the 'UserName' column
y = data.loc[:, 'UserName']
y.shape

(41157,)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((28809, 0), (28809,), (12348, 0), (12348,))

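Observation: The feature matrix has zero columns, because both numerical columns (`UserName` and `ScreenName`) were dropped when building `X`. There is therefore no correlation matrix to plot, and a heatmap cannot be constructed. We now use a brute-force method to find correlated features instead.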

Brute Force Method to Find Correlated Features

In [None]:
# Brute Force Method to find Correlation between features
def correlation(data, threshold=None):
    """Return the names of columns that are highly correlated with an earlier column.

    The absolute pairwise correlation of every pair of columns in ``data``
    (as computed by ``data.corr()``) is compared with ``threshold``. When a
    pair exceeds it, the column that appears *later* in ``data`` is flagged,
    so the first column of each correlated pair is the one left unflagged.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix whose columns are compared with each other.
    threshold : float, optional
        Cut-off applied to the absolute correlation (strictly greater than).
        When omitted or ``None``, 0.8 is used.

    Returns
    -------
    set
        Names of the flagged columns; empty when no pair exceeds the threshold
        or when ``data`` has fewer than two columns.
    """
    if threshold is None:
        # Comparing a float with None raises TypeError, so fall back to the
        # 0.8 cut-off used throughout this notebook.
        threshold = 0.8

    corr_mat = data.corr()
    abs_corr = corr_mat.abs().to_numpy()

    # Only look strictly below the diagonal: each pair is examined once and a
    # column is never compared with itself. NaN entries compare as False.
    below_diagonal = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)
    flagged = ((abs_corr > threshold) & below_diagonal).any(axis=1)

    return set(corr_mat.columns[flagged])

In [None]:
# Number of training columns flagged as highly correlated (cut-off 0.8)
correlated_features = correlation(X_train, 0.8)
len(correlated_features)

0

In [None]:
# Remove the flagged columns from both splits.
# Reassign instead of using inplace=True: the splits may be views of X, and
# in-place mutation makes the cell fail when it is run a second time.
# errors='ignore' keeps a re-run a no-op once the columns are already gone.
X_train = X_train.drop(columns=list(correlated_features), errors='ignore')
X_test = X_test.drop(columns=list(correlated_features), errors='ignore')

X_train.shape, X_test.shape

((28809, 0), (12348, 0))

Calculating Groups of Highly Correlated Features

In [None]:
# Reload the tweets CSV (latin-1 encoded) so this section starts from the raw data
df = pd.read_csv(
    '/content/Coronavirus Tweets.csv',
    encoding='latin-1',
)
df.shape

(41157, 6)

In [None]:
# Collect the names of every integer / floating-point column in the dataset
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_features = df.select_dtypes(include=numerics).columns.tolist()

In [None]:
# Keep only the numerical columns
data1 = df.loc[:, numerical_features]

In [None]:
# Preview the first rows of the numerical subset
data1.head()

Unnamed: 0,UserName,ScreenName
0,3799,48751
1,3800,48752
2,3801,48753
3,3802,48754
4,3803,48755


In [None]:
# Feature matrix: the numerical frame without the 'UserName' / 'ScreenName' columns.
# NOTE(review): these are the only two columns in `data1`, so X ends up with
# zero feature columns and the correlation grouping below has nothing to work on.
X = data1.drop(columns=['UserName', 'ScreenName'])
X.shape

(41157, 0)

In [None]:
# Target vector: the 'UserName' column
y = data1.loc[:, 'UserName']
y.shape

(41157,)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((28809, 0), (28809,), (12348, 0), (12348,))

In [None]:
# Build a DataFrame with the pairwise correlation between features
corr_matrix = X_train.corr()
# Take absolute values of the correlation coefficients and flatten them to
# one entry per (feature1, feature2) pair, strongest first
corr_matrix = corr_matrix.abs().unstack()
corr_matrix = corr_matrix.sort_values(ascending=False)
# Keep only pairs with correlation at or above the 0.8 threshold
corr_matrix = corr_matrix[corr_matrix >= 0.8]
corr_matrix = pd.DataFrame(corr_matrix).reset_index()
corr_matrix.columns = ['feature1', 'feature2', 'Correlation']
# Remove each feature's correlation with itself by comparing names.
# Filtering on `Correlation < 1` would also discard distinct features that are
# perfectly correlated, and could keep a self-pair affected by float round-off.
corr_matrix = corr_matrix[corr_matrix['feature1'] != corr_matrix['feature2']]
corr_matrix = corr_matrix.reset_index(drop=True)
corr_matrix.head()

Unnamed: 0,feature1,feature2,Correlation


In [None]:
# Collect groups of features that are correlated among themselves
grouped_features = []
correlated_groups = []

for feature in corr_matrix['feature1'].unique():
    # A feature already placed in an earlier group does not start a new one
    if feature in grouped_features:
        continue

    # All rows that pair this feature with a correlated partner
    correlated_block = corr_matrix.loc[corr_matrix['feature1'] == feature]

    # Mark the partners and the feature itself as grouped
    grouped_features.extend(correlated_block['feature2'].unique())
    grouped_features.append(feature)

    # Keep the block as one correlated group
    correlated_groups.append(correlated_block)

print('Found {} correlated feature groups'.format(len(correlated_groups)))
print('out of {} total features.'.format(X_train.shape[1]))

Found 0 correlated feature groups
out of 0 total features.


In [None]:
# Print every correlated group, followed by two blank lines as a separator
for group in correlated_groups:
    print(group, '\n', sep='\n')