In [2]:
import numpy as np
import os
import glob
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from tqdm import tqdm


## 2 Task Outline
Instead of blindly throwing that data at machine learning algorithms, a good data scientist will first do some exploratory data analysis. In your team, we would like you to analyze the data with respect to the following aspects:

### 3. Feature characteristics: 
### How are the features distributed? 
### Are there any pairs or subsets of features that seem highly correlated or redundant?

In [3]:
# Load the Dataset: collect the label files and the feature files below Dataset/

# every file whose name ends in "labels.npy", in sorted order
label_files = sorted(glob.glob("Dataset/**/**labels.npy", recursive=True))

"""
'Dataset/comcuc/133067.labels.npy',
'Dataset/comcuc/182336.labels.npy',
'Dataset/comcuc/189031.labels.npy',
'Dataset/comcuc/240515.labels.npy',
'Dataset/comcuc/25627.labels.npy',
"""

# every remaining .npy file (paths containing ".label" are filtered out), in sorted order
data_files = sorted(
    path
    for path in glob.glob("Dataset/**/**.npy", recursive=True)
    if ".label" not in path
)

"""
'Dataset/comcuc/133067.npy',
'Dataset/comcuc/182336.npy',
'Dataset/comcuc/189031.npy',
'Dataset/comcuc/240515.npy',
"""

# names of the bird sub-directories used throughout the notebook
birds = [
    'comcuc',
    'cowpig1',
    'eucdov',
    'eueowl1',
    'grswoo',
    'tawowl1',
]

def create_feature_dict (path='Dataset/feature_names.txt'):
    """Build an empty per-feature container keyed by feature name.

    Reads one feature name per line from ``path`` and maps every name to its
    own fresh, empty list.

    Args:
        path: Text file holding one feature name per line. Defaults to the
            dataset's ``Dataset/feature_names.txt``.

    Returns:
        dict: ``{feature_name: []}`` in file order; each value is a distinct
        list object, so appending to one feature never affects another.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    with open(path, mode='r') as f:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would truncate the final feature name whenever the
        # file does not end with a newline.
        features = [line.rstrip('\n') for line in f]

    return {feature: [] for feature in features}

def create_labels (path='Dataset/feature_names.txt'):
    """Return the feature names listed in the feature-name file.

    Args:
        path: Text file holding one feature name per line. Defaults to the
            dataset's ``Dataset/feature_names.txt``.

    Returns:
        list[str]: The feature names in file order, without line terminators.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    with open(path, mode='r') as f:
        # Remove just the newline; dropping the last character blindly would
        # cut off the final name if the file lacks a trailing newline.
        return [line.rstrip('\n') for line in f]


In [4]:
def load_data_file (file):
    """Load one ``.npy`` file and split it into per-feature value lists.

    The loaded array is iterated row by row; the value at position ``i`` of
    each row is collected under the ``i``-th feature name returned by
    ``create_feature_dict``.

    Args:
        file: Path of the ``.npy`` file to load.

    Returns:
        dict: Feature name -> list with one value per row of the file.
    """
    # fresh name -> [] mapping for this file
    per_feature = create_feature_dict()

    # read the array from disk
    rows = np.load(file)

    # gather, for every feature position, that position's value from each row
    for position, name in enumerate(per_feature):
        per_feature[name].extend(row[position] for row in rows)

    return per_feature




# Feature to inspect, taken from a single example recording
feature = 'cln_melspect_mean_53'
# Index directly instead of .get(): an unknown feature name then fails here
# with a KeyError rather than silently producing an empty plot from None.
feature_data = load_data_file('Dataset/comcuc/133067.npy')[feature]


# create the plotly figure
fig = go.Figure()

# add the area graph trace
fig.add_trace(go.Scatter(
    # derive the x axis from the data so it always matches the number of
    # values in the file instead of assuming exactly 100 of them
    x=np.arange(len(feature_data)),
    y=feature_data,
    mode='lines',
    fill='tozeroy',
    line_color='#1f77b4',
    fillcolor='#1f77b4'
))

# update the layout
fig.update_layout(
    title=feature,
    xaxis_title='Index',
    yaxis_title='Value'
)

# show the plotly figure
fig.show()

In [5]:
# Bird name -> placeholder number (1..6); the loading loop further down
# replaces each placeholder with that bird's feature arrays.
bird_feautres = {
    name: number
    for number, name in enumerate(
        ['comcuc', 'cowpig1', 'eucdov', 'eueowl1', 'grswoo', 'tawowl1'],
        start=1,
    )
}


def get_features_by_bird (bird):
    """Collect the per-feature values of every data file belonging to a bird.

    A data file belongs to ``bird`` when the bird name occurs anywhere in its
    path (substring match against the module-level ``data_files``).

    Args:
        bird: Bird name to look for in the data file paths.

    Returns:
        dict: Feature name -> ``numpy`` array built from one value list per
        matching file, in the order of ``data_files``.
    """
    # paths of the files that belong to the requested bird
    matching_paths = [path for path in data_files if bird in path]

    # feature name -> list of per-file value lists
    collected = create_feature_dict()

    for path in tqdm(matching_paths):
        # add this file's values to the matching feature entry
        for name, values in load_data_file(path).items():
            collected[name].append(values)

    # turn every per-feature list of lists into an array
    return {name: np.asarray(per_file) for name, per_file in collected.items()}

# load the features of every bird / directory and store them under the bird's name
for bird in tqdm(birds):
    features_by_category = get_features_by_bird(bird)
    bird_feautres[bird] = features_by_category




#features_by_category = get_features_by_bird('comcuc')





100%|██████████| 200/200 [00:02<00:00, 92.38it/s]
100%|██████████| 200/200 [00:02<00:00, 90.54it/s]
100%|██████████| 200/200 [00:02<00:00, 96.14it/s] 
100%|██████████| 200/200 [00:02<00:00, 90.76it/s]
100%|██████████| 200/200 [00:02<00:00, 90.63it/s]
100%|██████████| 200/200 [00:02<00:00, 91.00it/s]
100%|██████████| 6/6 [00:16<00:00,  2.81s/it]


In [6]:


dicts = bird_feautres
n_components = 10
labels = create_labels()

# Create the figure with one subplot row per bird, titled with the bird name
fig = make_subplots(rows=len(dicts), cols=1, subplot_titles=list(dicts.keys()))

# Iterate over the birds and perform PCA on each bird's features.
# tqdm wraps the dict itself (not the enumerate object) so it knows the total.
for i, d in enumerate(tqdm(dicts)):
    # Stack the per-feature arrays of the current bird; the first axis then
    # indexes the features.
    arrays = list(dicts[d].values())
    data = np.stack(arrays)

    # Flatten everything except the feature axis. The row count is taken from
    # the data instead of being hard-coded to 548, so the cell keeps working
    # when the number of features changes.
    data = data.reshape(data.shape[0], -1)

    # Perform PCA; every row (feature) becomes one projected point
    pca = PCA(n_components=n_components)
    pca_data = pca.fit_transform(data)

    # Scatter plot of the first two principal components, one point per
    # feature, with the feature name as hover text
    fig.add_trace(
        go.Scatter(x=pca_data[:, 0], y=pca_data[:, 1], mode='markers', text=labels, name=d),
        row=i+1, col=1
    )

# Set the layout of the figure
fig.update_layout(height=2500, width=800, title_text="Dimensionality Reduction of all Features")

# Show the figure
fig.show()



6it [00:30,  5.03s/it]


In [24]:
labels = create_labels()

# Birds to include in the figure. Only the first bird is plotted, which is
# what the original early `break` achieved; widen the slice (e.g. drop `[:1]`)
# to plot more birds.
plotted_birds = list(bird_feautres)[:1]

# Height of a single subplot row in pixels (the original layout used 3000 px
# for a six-row grid).
row_height = 500

# Every bird occupies two rows: one for the means and one for the standard
# deviations. Titles are generated in exactly that order so they always match
# the traces added below.
fig = make_subplots(
    rows=2 * len(plotted_birds),
    cols=1,
    subplot_titles=[f"{bird}_{stat}" for bird in plotted_birds for stat in ("mean", "std")],
    vertical_spacing=0.03,
    horizontal_spacing=0.05
    )

# iterate through the selected birds
for i, d in enumerate(tqdm(plotted_birds)):

    means = []
    stds = []
    for feature_array in bird_feautres[d].values():

        # summarise each feature over all of its values (no normalisation is applied)
        mean = np.mean(feature_array)
        std = np.std(feature_array)

        means.append(mean)
        stds.append(std)

    # rows 2*i+1 and 2*i+2 belong to bird i, so birds never share a subplot
    fig.add_trace(
        go.Bar(y=means, x=labels, name=f"{d}_mean"),
        row=2*i+1, col=1
    )

    fig.add_trace(
        go.Bar(y=stds, x=labels, name=f"{d}_std"),
        row=2*i+2, col=1
    )

# Set the layout of the figure; the height scales with the number of rows
fig.update_layout(height=row_height * 2 * len(plotted_birds), width=2000, title_text="Feature Distribution")

fig.show()

0it [00:00, ?it/s]


In [9]:
# Show the per-feature means left in `means` by the last executed iteration of the plotting loop above
means

[6.713867e-08,
 3.3569336e-08,
 -5.9509276e-08,
 2.5939942e-08,
 3.2806398e-08,
 5.9509276e-08,
 -3.967285e-08,
 -1.8310548e-08,
 -2.4414062e-08,
 -2.746582e-08,
 1.5258789e-09,
 -6.1035155e-09,
 -1.373291e-08,
 6.2561035e-08,
 2.822876e-08,
 -2.4414062e-08,
 5.1879884e-08,
 1.6479493e-07,
 -4.577637e-09,
 1.7623901e-07,
 -7.095337e-08,
 -1.7776489e-07,
 -3.6621095e-08,
 -3.3721923e-07,
 -9.994507e-08,
 -9.384155e-08,
 1.02233884e-07,
 -2.532959e-07,
 -1.1444092e-08,
 -3.051758e-08,
 7.6293943e-10,
 2.7542114e-07,
 1.8005372e-07,
 6.2561035e-08,
 -1.4038086e-07,
 9.1552735e-08,
 -2.609253e-07,
 -2.746582e-08,
 1.586914e-07,
 3.326416e-07,
 -2.9907227e-07,
 2.5634765e-07,
 -1.4648438e-07,
 3.4179686e-07,
 -1.525879e-08,
 -8.239746e-08,
 2.8381348e-07,
 -7.93457e-08,
 -2.5939943e-07,
 -2.4414062e-08,
 -4.2724608e-08,
 1.6479493e-07,
 2.4414062e-08,
 -1.8005372e-07,
 3.6621094e-07,
 -9.4604495e-08,
 -1.5563965e-07,
 4.5776368e-08,
 -1.373291e-08,
 0.0,
 2.0446777e-07,
 3.051758e-08,
 -1.8