In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Input table for the whole notebook.
# NOTE(review): this is an absolute path on one specific machine; anyone else
# running the notebook has to point DATA_CSV at their own copy of the file.
DATA_CSV = "/Users/henrysun_1/Desktop/Duke/2024-2025/Summer 2024/fishics/data/data_for_henry_2024.csv"
data = pd.read_csv(DATA_CSV)
data

In [None]:
# Feature matrix for PCA: the fish ASV columns (spreadsheet columns L to CH,
# i.e. everything from positional column 11 onward).
#
# The embedding columns this notebook appends to `data` (PCA1/PCA2 below,
# tSNE1/tSNE2 later) land at the end of the frame, so a plain
# `data.iloc[:, 11:]` would pick them up as features whenever this cell is
# re-run. Drop them first so the cell gives the same result on every run.
derived_columns = ['PCA1', 'PCA2', 'tSNE1', 'tSNE2']
fish_asvs = data.drop(columns=derived_columns, errors='ignore').iloc[:, 11:]

# Project the samples onto the first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(fish_asvs)

# Store the coordinates next to the sample metadata for plotting and export.
data['PCA1'] = pca_result[:, 0]
data['PCA2'] = pca_result[:, 1]

# Scatter of all samples in PCA space, one colour per value of `date`.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=data, x='PCA1', y='PCA2', hue='date', palette='tab10', s=100, ax=ax)

# Optional (disabled): label every point with its station number.
# for i, station in enumerate(data['station']):
#     ax.annotate(station, (data['PCA1'].iloc[i], data['PCA2'].iloc[i]), fontsize=8, color='black', ha='right', va='bottom')

ax.set_title('PCA of Fish ASVs colored by date')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
# Place the legend outside the axes so it does not cover any points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
plt.show()

# Outlier station 6

Station 6 is an outlier in both plots. It appears to be much more similar to the summer data.

- NorSeaRob: 0.028368794
- Am_butterfish: 0.822695035
- FourSpot Flounder: 0.14893617

In [None]:
# Export the PCA coordinates together with each sample's date.
pca_export_columns = ['PCA1', 'PCA2', 'date']
selected_columns = data.loc[:, pca_export_columns]
selected_columns.to_csv('pca_latent_space.csv', index=False)


In [None]:
# t-SNE embedding of the same fish ASV matrix used for the PCA above.
# (TSNE is imported in the notebook's import cell.)
#
# Guard against embedding columns having leaked into the feature matrix
# (this happens if the PCA cell is re-run after PCA1/PCA2 were added to `data`).
tsne_features = fish_asvs.drop(columns=['PCA1', 'PCA2', 'tSNE1', 'tSNE2'], errors='ignore')

# The iteration count is left at scikit-learn's default of 1000. The old
# `n_iter=` keyword was deprecated in favour of `max_iter` (scikit-learn 1.5)
# and is rejected by newer releases, so it is not passed explicitly.
# TODO: test with different perplexity values.
tsne = TSNE(n_components=2, random_state=30, perplexity=20, learning_rate=10)
tsne_result = tsne.fit_transform(tsne_features)

# Add t-SNE results to the dataframe
data['tSNE1'] = tsne_result[:, 0]
data['tSNE2'] = tsne_result[:, 1]

# Scatter of all samples in t-SNE space, one colour per value of `date`.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=data, x='tSNE1', y='tSNE2', hue='date', palette='tab10', s=100, ax=ax)

# Optional (disabled): label every point with its station number.
# for i, station in enumerate(data['station']):
#     ax.annotate(station, (data['tSNE1'].iloc[i], data['tSNE2'].iloc[i]), fontsize=8, color='black', ha='right', va='bottom')

ax.set_title('tSNE of Fish ASVs colored by date')
ax.set_xlabel('tSNE Component 1')
ax.set_ylabel('tSNE Component 2')
# Place the legend outside the axes so it does not cover any points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
plt.show()



In [None]:
# Export the t-SNE coordinates together with each sample's date.
tsne_export_columns = ['tSNE1', 'tSNE2', 'date']
selected_columns = data.loc[:, tsne_export_columns]
selected_columns.to_csv('tsne_latent_space.csv', index=False)


In [None]:
# Build a presence/absence table for all samples.

# Strip the embedding coordinates this notebook appended, leaving only the
# columns that came from the input file.
columns_to_remove = ['PCA1', 'PCA2', 'tSNE1', 'tSNE2']
save_data = data.drop(columns=columns_to_remove, errors='ignore')

# Species columns are everything to the right of the "primers" column. The
# position is looked up on save_data itself so the slice cannot drift if the
# two frames ever differ in layout.
species_columns = save_data.columns[save_data.columns.get_loc("primers") + 1:]

# Convert relative abundance to presence/absence (1 if > 0, else 0).
# Vectorised comparison instead of the deprecated elementwise `applymap`;
# NaN compares False, so missing values still map to 0.
Data_presence = (save_data[species_columns] > 0).astype(int)

# Export presence/absence data
Data_presence.to_csv("presence_absence_data.csv", index=False)


## PCA Seasonal Analysis

In [None]:

def _fit_seasonal_pca(bout):
    """Fit a 2-component PCA on the fish ASV columns of one sampling bout.

    Parameters
    ----------
    bout : str
        Value of the ``sampling_bout`` column selecting the rows to use.

    Returns
    -------
    tuple
        ``(season_data, season_fish_asvs, season_pca, season_pca_result)``:
        the bout's rows with fresh ``PCA1``/``PCA2`` columns, the feature
        frame the PCA was fitted on, the fitted ``PCA`` object, and the
        transformed coordinates.
    """
    # By this point `data` may already carry the whole-dataset embedding
    # columns at its end; without dropping them, `iloc[:, 11:]` would feed
    # those coordinates into the seasonal PCA as if they were ASV features.
    # Working on an independent copy also avoids pandas' SettingWithCopy
    # warning when the new columns are assigned below.
    season_data = (
        data.loc[data['sampling_bout'] == bout]
        .drop(columns=['PCA1', 'PCA2', 'tSNE1', 'tSNE2'], errors='ignore')
        .copy()
    )
    # Fish ASV columns start at positional column 11, as in the global PCA.
    season_fish_asvs = season_data.iloc[:, 11:]

    season_pca = PCA(n_components=2)
    season_pca_result = season_pca.fit_transform(season_fish_asvs)

    season_data['PCA1'] = season_pca_result[:, 0]
    season_data['PCA2'] = season_pca_result[:, 1]
    return season_data, season_fish_asvs, season_pca, season_pca_result


# Winter
winter_data, winter_fish_asvs, winter_pca, winter_pca_result = _fit_seasonal_pca('Feb_24')

# Spring
spring_data, spring_fish_asvs, spring_pca, spring_pca_result = _fit_seasonal_pca('Jun_23')

# Summer
summer_data, summer_fish_asvs, summer_pca, summer_pca_result = _fit_seasonal_pca('Aug_23')


In [None]:
# Scatter of the winter samples in their own 2-component PCA space.
plt.figure(figsize=(10, 8))
# No hue is mapped here, so no palette is passed and no legend is drawn:
# both would only produce warnings (unused palette, legend with no entries).
sns.scatterplot(x='PCA1', y='PCA2', data=winter_data, s=100)

# Optional (disabled): label every point with its station number.
# for i, station in enumerate(winter_data['station']):
#     plt.annotate(station, (winter_data['PCA1'].iloc[i], winter_data['PCA2'].iloc[i]), fontsize=8, color='black', ha='right', va='bottom')

plt.title('PCA of Winter Fish ASVs')
plt.xlabel('PCA Component 1')
# The y axis shows PCA2; it was previously mislabelled as a tSNE component.
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.show()

In [None]:
# Scatter of the spring samples in their own 2-component PCA space.
plt.figure(figsize=(10, 8))
# No hue is mapped here, so no palette is passed and no legend is drawn:
# both would only produce warnings (unused palette, legend with no entries).
sns.scatterplot(x='PCA1', y='PCA2', data=spring_data, s=100)

# Optional (disabled): label every point with its station number.
# for i, station in enumerate(spring_data['station']):
#     plt.annotate(station, (spring_data['PCA1'].iloc[i], spring_data['PCA2'].iloc[i]), fontsize=8, color='black', ha='right', va='bottom')

plt.title('PCA of Spring Fish ASVs')
plt.xlabel('PCA Component 1')
# The y axis shows PCA2; it was previously mislabelled as a tSNE component.
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.show()

In [None]:
# Scatter of the summer samples in their own 2-component PCA space.
plt.figure(figsize=(10, 8))
# No hue is mapped here, so no palette is passed and no legend is drawn:
# both would only produce warnings (unused palette, legend with no entries).
sns.scatterplot(x='PCA1', y='PCA2', data=summer_data, s=100)

# Optional (disabled): label every point with its station number.
# for i, station in enumerate(summer_data['station']):
#     plt.annotate(station, (summer_data['PCA1'].iloc[i], summer_data['PCA2'].iloc[i]), fontsize=8, color='black', ha='right', va='bottom')

plt.title('PCA of Summer Fish ASVs')
plt.xlabel('PCA Component 1')
# The y axis shows PCA2; it was previously mislabelled as a tSNE component.
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.show()

In [None]:
# Write each season's PCA coordinates (plus the sample date) to its own CSV.
pca_latent_columns = ['PCA1', 'PCA2', 'date']

w_selected_columns = winter_data.loc[:, pca_latent_columns]
w_selected_columns.to_csv('winter_pca_latent_space.csv', index=False)

sp_selected_columns = spring_data.loc[:, pca_latent_columns]
sp_selected_columns.to_csv('spring_pca_latent_space.csv', index=False)

su_selected_columns = summer_data.loc[:, pca_latent_columns]
su_selected_columns.to_csv('summer_pca_latent_space.csv', index=False)


## tSNE Seasonal Analysis

In [None]:
def _fit_seasonal_tsne(fish_asvs):
    """Fit a 2-D t-SNE on one season's fish ASV feature frame.

    Parameters
    ----------
    fish_asvs : pandas.DataFrame
        Feature columns for the samples of one sampling bout.

    Returns
    -------
    tuple
        ``(season_tsne, season_tsne_result)``: the fitted ``TSNE`` object and
        the embedded coordinates, one row per sample.
    """
    # Make sure no previously computed embedding coordinates are used as
    # features (they end up in the frame when it was sliced from a table that
    # already carried PCA/t-SNE columns).
    features = fish_asvs.drop(columns=['PCA1', 'PCA2', 'tSNE1', 'tSNE2'], errors='ignore')

    # The iteration count is left at scikit-learn's default of 1000. The old
    # `n_iter=` keyword was deprecated in favour of `max_iter` (scikit-learn
    # 1.5) and is rejected by newer releases, so it is not passed explicitly.
    # NOTE: scikit-learn requires perplexity < number of samples; lower it if
    # a bout has 20 or fewer rows. TODO: test with different perplexity values.
    season_tsne = TSNE(n_components=2, random_state=30, perplexity=20, learning_rate=10)
    return season_tsne, season_tsne.fit_transform(features)


# `assign` returns a new frame, so this works (and stays warning-free) whether
# or not the seasonal frames are independent copies of `data`.
winter_tsne, winter_tsne_result = _fit_seasonal_tsne(winter_fish_asvs)
winter_data = winter_data.assign(tSNE1=winter_tsne_result[:, 0], tSNE2=winter_tsne_result[:, 1])

spring_tsne, spring_tsne_result = _fit_seasonal_tsne(spring_fish_asvs)
spring_data = spring_data.assign(tSNE1=spring_tsne_result[:, 0], tSNE2=spring_tsne_result[:, 1])

summer_tsne, summer_tsne_result = _fit_seasonal_tsne(summer_fish_asvs)
summer_data = summer_data.assign(tSNE1=summer_tsne_result[:, 0], tSNE2=summer_tsne_result[:, 1])

# Display the last season's table as the cell output.
summer_data


In [None]:
# Disabled: per-season t-SNE scatter plots (winter, spring, summer).
# Uncomment a block to draw that season's plot.

# plt.figure(figsize=(10, 8))
# sns.scatterplot(x='tSNE1', y='tSNE2', data=winter_data, s=100)
# plt.title('tSNE of Winter Fish ASVs')
# plt.xlabel('tSNE Component 1')
# plt.ylabel('tSNE Component 2')
# plt.grid(True)
# plt.show()

# plt.figure(figsize=(10, 8))
# sns.scatterplot(x='tSNE1', y='tSNE2', data=spring_data, s=100)
# plt.title('tSNE of Spring Fish ASVs')
# plt.xlabel('tSNE Component 1')
# plt.ylabel('tSNE Component 2')
# plt.grid(True)
# plt.show()

# plt.figure(figsize=(10, 8))
# sns.scatterplot(x='tSNE1', y='tSNE2', data=summer_data, s=100)
# plt.title('tSNE of Summer Fish ASVs')
# plt.xlabel('tSNE Component 1')
# plt.ylabel('tSNE Component 2')
# plt.grid(True)
# plt.show()

In [19]:
# Write each season's t-SNE coordinates (plus the sample date) to its own CSV.
tsne_latent_columns = ['tSNE1', 'tSNE2', 'date']

w_selected_columns = winter_data.loc[:, tsne_latent_columns]
w_selected_columns.to_csv('winter_tsne_latent_space.csv', index=False)

sp_selected_columns = spring_data.loc[:, tsne_latent_columns]
sp_selected_columns.to_csv('spring_tsne_latent_space.csv', index=False)

su_selected_columns = summer_data.loc[:, tsne_latent_columns]
su_selected_columns.to_csv('summer_tsne_latent_space.csv', index=False)

In [20]:
# Build a presence/absence table for the winter samples.

# Strip the embedding coordinates, leaving only the columns from the input file.
columns_to_remove = ['PCA1', 'PCA2', 'tSNE1', 'tSNE2']
save_data = winter_data.drop(columns=columns_to_remove, errors='ignore')

# Species columns are everything to the right of the "primers" column. The
# position is looked up on save_data itself rather than on a different frame.
species_columns = save_data.columns[save_data.columns.get_loc("primers") + 1:]

# Convert relative abundance to presence/absence (1 if > 0, else 0).
# Vectorised comparison instead of the deprecated elementwise `applymap`;
# NaN compares False, so missing values still map to 0.
Data_presence = (save_data[species_columns] > 0).astype(int)

# Export presence/absence data
Data_presence.to_csv("winter_presence_absence_data.csv", index=False)


In [21]:
# Build a presence/absence table for the spring samples.

# Strip the embedding coordinates, leaving only the columns from the input file.
columns_to_remove = ['PCA1', 'PCA2', 'tSNE1', 'tSNE2']
save_data = spring_data.drop(columns=columns_to_remove, errors='ignore')

# Species columns are everything to the right of the "primers" column. The
# position is looked up on save_data itself rather than on a different frame.
species_columns = save_data.columns[save_data.columns.get_loc("primers") + 1:]

# Convert relative abundance to presence/absence (1 if > 0, else 0).
# Vectorised comparison instead of the deprecated elementwise `applymap`;
# NaN compares False, so missing values still map to 0.
Data_presence = (save_data[species_columns] > 0).astype(int)

# Export presence/absence data
Data_presence.to_csv("spring_presence_absence_data.csv", index=False)

In [22]:
# Build a presence/absence table for the summer samples.

# Strip the embedding coordinates, leaving only the columns from the input file.
columns_to_remove = ['PCA1', 'PCA2', 'tSNE1', 'tSNE2']
save_data = summer_data.drop(columns=columns_to_remove, errors='ignore')

# Species columns are everything to the right of the "primers" column. The
# position is looked up on save_data itself rather than on a different frame.
species_columns = save_data.columns[save_data.columns.get_loc("primers") + 1:]

# Convert relative abundance to presence/absence (1 if > 0, else 0).
# Vectorised comparison instead of the deprecated elementwise `applymap`;
# NaN compares False, so missing values still map to 0.
Data_presence = (save_data[species_columns] > 0).astype(int)

# Export presence/absence data
Data_presence.to_csv("summer_presence_absence_data.csv", index=False)