In [48]:
import numpy as np
import os
import glob
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from tqdm import tqdm
from scipy.stats import pearsonr
import pandas as pd


## 2 Task Outline
Instead of blindly throwing that data at machine learning algorithms, a good data scientist will first do some exploratory data analysis. In your team, we would like you to analyze the data with respect to the following aspects:

### 4. Feature/Label agreement

### Which features seem useful for classification? 

### Which ones are correlated with the labels?

In [6]:
# Load the Dataset: collect the paths of all label and feature arrays.

# Every file below Dataset/ whose name ends in "labels.npy", in sorted order.
label_files = sorted(glob.glob("Dataset/**/**labels.npy", recursive=True))

"""
'Dataset/comcuc/133067.labels.npy',
'Dataset/comcuc/182336.labels.npy',
'Dataset/comcuc/189031.labels.npy',
'Dataset/comcuc/240515.labels.npy',
'Dataset/comcuc/25627.labels.npy',
"""

# Every other .npy file below Dataset/ (paths containing ".label" are dropped),
# sorted the same way so that it can be zipped with label_files later on.
data_files = sorted(
    path
    for path in glob.glob("Dataset/**/**.npy", recursive=True)
    if ".label" not in path
)

"""
'Dataset/comcuc/133067.npy',
'Dataset/comcuc/182336.npy',
'Dataset/comcuc/189031.npy',
'Dataset/comcuc/240515.npy',
"""

# Bird identifiers; the other cells match them as substrings of the file paths.
birds = ['comcuc', 'cowpig1', 'eucdov', 'eueowl1', 'grswoo', 'tawowl1']

def create_feature_dict ():
    """Build an empty per-feature container keyed by feature name.

    Reads 'Dataset/feature_names.txt' (one feature name per line) and maps
    every name, in file order, to its own new empty list.

    Only the line terminator is removed from each line, so the last name is
    kept intact even when the file does not end with a newline. Blank lines
    are skipped so they do not become an empty-string feature.

    Returns:
        dict: feature name -> empty list (a distinct list per feature).

    Raises:
        OSError: if 'Dataset/feature_names.txt' cannot be opened.
    """
    with open('Dataset/feature_names.txt', mode='r') as f:
        names = [line.rstrip('\r\n') for line in f]

    return {name: [] for name in names if name.strip()}

def create_labels ():
    """Return the feature names listed in 'Dataset/feature_names.txt'.

    The file is read line by line. Only the line terminator is removed, so
    the last name is not truncated when the file lacks a trailing newline,
    and blank lines are skipped.

    Returns:
        list[str]: the feature names in file order.

    Raises:
        OSError: if 'Dataset/feature_names.txt' cannot be opened.
    """
    with open('Dataset/feature_names.txt', mode='r') as f:
        names = [line.rstrip('\r\n') for line in f]

    return [name for name in names if name.strip()]

def load_data_file (file):
    """Load one .npy feature file and regroup its values by feature name.

    Args:
        file: path (or anything np.load accepts) of the array to read.
            Assumes each row holds one value per feature, in the order of
            the feature-name file -- TODO confirm against the dataset.

    Returns:
        dict: feature name -> list with that feature's value from every row.
    """
    # start from an empty list per feature
    values_by_feature = create_feature_dict()

    rows = np.load(file)

    # the position of a name in the dict is the index of its value in a row
    for row in rows:
        for position, name in enumerate(values_by_feature):
            values_by_feature[name].append(row[position])

    return values_by_feature

def get_features_by_bird (bird):
    """Collect the per-feature values of every data file belonging to a bird.

    Args:
        bird: bird identifier; a data file is selected when this string
            occurs anywhere in its path.

    Returns:
        dict: feature name -> numpy array built from one entry per selected
        file, each entry being that file's list of values for the feature.
    """
    collected = create_feature_dict()

    # only the files whose path mentions the requested bird
    matching_files = [path for path in data_files if bird in path]

    for path in tqdm(matching_files):
        per_file = load_data_file(path)

        # add this file's values as one new entry per feature
        for name, values in per_file.items():
            collected[name].append(values)

    # turn the per-feature lists into arrays
    for name in collected:
        collected[name] = np.asarray(collected[name])

    return collected

def get_bird_features ():
    """Load the features of every bird in `birds`.

    Returns:
        dict: bird identifier -> result of get_features_by_bird for it.
        The integer placeholders in the initial dict are overwritten for
        every identifier that appears in `birds`.
    """
    bird_features = {
        'comcuc': 1,
        'cowpig1': 2,
        'eucdov': 3,
        'eueowl1': 4,
        'grswoo': 5,
        'tawowl1': 6,
    }

    for bird in tqdm(birds):
        bird_features[bird] = get_features_by_bird(bird)

    return bird_features


In [19]:
# Pearson correlation between the first label column and every feature,
# computed per file and grouped by bird.
correlations_by_bird = {
    'comcuc': [],
    'cowpig1': [],
    'eucdov': [],
    'eueowl1': [],
    'grswoo': [],
    'tawowl1': []}

# iterate through the birds
for bird in tqdm(birds):

    # get all files whose path contains the current bird identifier
    current_label_files = [file for file in label_files if bird in file]
    current_data_files = [file for file in data_files if bird in file]

    # create a list to store the correlations by file
    correlations_by_file = []

    # iterate through the data files and their respective label file
    # (both lists are sorted, which is what pairs them up)
    for data_f, label_f in zip(current_data_files, current_label_files):

        # create a list to store the correlation coefficients by feature
        correlations_by_feature = []

        # load the label array (first column only) and the data array
        label_arr = np.load(label_f)[:, 0]
        data_arr = np.load(data_f)

        # iterate through all the features of the data file
        for i in range(data_arr.shape[1]):
            feature_arr = data_arr[:, i]

            try:
                # compute the pearson correlation coefficient; indexing the
                # result avoids rebinding IPython's `_` at notebook top level
                corr = pearsonr(label_arr, feature_arr)[0]
            except ValueError:
                # pearsonr rejects unusable input (e.g. fewer than two
                # samples or mismatched lengths) with ValueError; count that
                # as "no correlation". Anything else (KeyboardInterrupt,
                # genuine bugs) is allowed to propagate instead of being
                # silently turned into 0.
                corr = 0

            # store the computed correlation
            correlations_by_feature.append(corr)

        correlations_by_file.append(correlations_by_feature)

    # NOTE: this appends the whole per-file list as a single element, so each
    # bird maps to a one-element list wrapping [file][feature].
    correlations_by_bird[bird].append(correlations_by_file)



100%|██████████| 6/6 [02:53<00:00, 28.93s/it]


In [31]:
# create a list to store the correlations by file
correlations_by_file = []

# iterate through the data files and their respective label file;
# zip() has no length, so tell tqdm the total to get a real progress bar
for data_f, label_f in tqdm(zip(data_files, label_files),
                            total=min(len(data_files), len(label_files))):

    # create a list to store the correlation coefficients by feature
    correlations_by_feature = []

    # load the label array (first column only) and the data array
    label_arr = np.load(label_f)[:, 0]
    data_arr = np.load(data_f)

    # iterate through all the features of the data file
    for i in range(data_arr.shape[1]):
        feature_arr = data_arr[:, i]

        # indexing the result avoids rebinding IPython's `_` at top level
        corr = pearsonr(label_arr, feature_arr)[0]

        # Keep exactly one entry per feature. Dropping an unusable result
        # would shift every later feature one column to the left and make the
        # rows ragged, so a NaN placeholder is stored instead.
        if not isinstance(corr, (float, np.floating)):
            corr = np.nan

        correlations_by_feature.append(corr)

    # files without any feature column contribute no row
    if correlations_by_feature:
        correlations_by_file.append(correlations_by_feature)

    

1200it [05:31,  3.62it/s]


In [46]:
# Stack the per-file rows into one (files x features) float array.
correlations_by_file = np.asarray(correlations_by_file, dtype=float)

if correlations_by_file.ndim != 2:
    raise ValueError(
        "expected one row of per-feature correlations per file, got an array "
        f"of shape {correlations_by_file.shape}"
    )

# Mean correlation of every feature over all files. NaN entries (pearsonr is
# undefined for e.g. a constant feature) are ignored, so a single undefined
# file no longer turns the whole feature into NaN. A feature that is NaN in
# every file stays NaN (numpy emits a RuntimeWarning for it).
overall_correlation_by_feature = list(np.nanmean(correlations_by_file, axis=0))

labels = create_labels()

# Sort values and labels in DESCENDING order of correlation. The key puts
# features whose mean is NaN at the end, because comparing NaN directly makes
# the resulting order undefined; ties are broken by label as before.
sorted_values, sorted_labels = zip(*sorted(
    zip(overall_correlation_by_feature, labels),
    key=lambda pair: (
        not np.isnan(pair[0]),
        0.0 if np.isnan(pair[0]) else pair[0],
        pair[1],
    ),
    reverse=True,
))

# Create the bar plot
fig = go.Figure([go.Bar(x=sorted_labels, y=sorted_values)])

# Update the layout
fig.update_layout(
    title='Pearson R Correlation over all Files for each Feature',
    xaxis_title='Labels',
    yaxis_title='Correlation',
    xaxis_tickangle=-45,
    height = 1000,
    width = 2000
)

# Show the plot
fig.show()

In [67]:
# Rank the features by their mean Pearson correlation, highest first.
# NOTE(review): the ranking uses the signed value, so strongly negative
# correlations end up at the bottom -- confirm whether ranking by absolute
# value was intended.
df = pd.DataFrame({'data': overall_correlation_by_feature, 'labels': labels})
df = df.sort_values(by='data', ascending=False)


print('The Top 20 best features to use accordingly to the Pearson correlation coefficient:\n')
# a named loop variable is used because assigning to `_` at notebook top
# level overwrites IPython's "last output" variable
for feature_name in df["labels"].head(20):
    print(feature_name)




The Top 20 best features to use accordingly to the Pearson correlation coefficient:

raw_energy_mean
raw_flux_mean
cln_contrast_mean_3
raw_contrast_mean_3
raw_energy_std
raw_flux_std
raw_power_mean
raw_mfcc_mean_0
cln_mfcc_mean_0
raw_melspect_mean_7
raw_power_std
raw_melspect_mean_8
cln_melspect_mean_7
cln_melspect_mean_8
raw_melspect_mean_9
cln_melspect_mean_9
raw_melspect_mean_6
cln_melspect_mean_6
cln_flux_mean
cln_energy_mean
