In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the all-seasons table and discard the 'Unnamed: 0' column
# (presumably an index column saved with the CSV — confirm).
df = pd.read_csv('allseasons.csv').drop(columns=['Unnamed: 0'])
df

## Exploratory Data Analysis

#### Part 1: Temperature, salinity, and stratification avg by season

In [None]:
# Oceanographic variables summarised for one sampling bout.
columns = ['Surface_Temp', 'Surface_Salinity', 'Bottom_Temp', 'Bottom_Salinity', 'temp_strat', 'salt_strat']

# Sampling bout to summarise; change this value to switch season.
# Bouts referenced in this notebook: 'Jun_23', 'Aug_23', 'Feb_24'.
# Only one bout is sliced per run, so no intermediate slice is computed and
# then thrown away.
season = 'Feb_24'

# Rows belonging to the chosen bout, then count/mean/std/min/quartiles/max
# for each oceanographic variable.
sliced_df = df.loc[df['sampling_bout'] == season]
summary_stats = sliced_df[columns].describe()
summary_stats

Average Surface Temp (Feb, June, Aug): 5.88, 16.62, 22.23  
Average Surface Salinity (Feb, June, Aug): 28.815, 29.741, 30.581  
Average Temperature Stratification (Feb, June, Aug): 0.281, 2.518, 4.739

#### Part 2: Dimensionality Reduction using PCA/tSNE on Dataset (*with Oceanographic Data*)

In [None]:
import inspect

from sklearn.manifold import TSNE

# Columns from position 11 onward are the t-SNE input. Any 'tSNE1'/'tSNE2'
# columns left on `df` by an earlier run of this cell are excluded, so a
# re-run does not feed the previous embedding back in as features.
fish_asvs = df.iloc[:, 11:].drop(columns=['tSNE1', 'tSNE2'], errors='ignore')

# NOTE(review): `tsne_df` is the same object as `df`, not a copy, so the
# embedding columns added below also appear on `df`. Later cells drop
# 'tSNE1'/'tSNE2' from a slice of `df`, so the aliasing is kept as-is.
tsne_df = df

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter`; pass the
# iteration count under whichever name the installed version accepts.
iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=20, perplexity=20, learning_rate=5,
            **{iter_kwarg: 300})  # test with diff perplexity
tsne_result = tsne.fit_transform(fish_asvs)

# Add t-SNE results to the dataframe
tsne_df['tSNE1'] = tsne_result[:, 0]
tsne_df['tSNE2'] = tsne_result[:, 1]
tsne_df


In [None]:
import seaborn as sns

# Scatter the two t-SNE components, one colour per value of 'date'.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=tsne_df, x='tSNE1', y='tSNE2', hue='date', palette='tab10', s=100, ax=ax)

ax.set_title('tSNE of Fish ASVs colored by date')
ax.set_xlabel('tSNE Component 1')
ax.set_ylabel('tSNE Component 2')
# Anchor the legend just outside the right-hand edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
plt.show()

## Build Model

Specifications for model: keep date, oceanographic variables.
If date is a strong predictor, influence of migratory species may be too high! Remove.

In [None]:
# Relocate 'date': take the column out of the frame, then put it back at
# column position 92.
# NOTE(review): 92 is a hard-coded position — presumably the end of the frame
# once the t-SNE columns exist; confirm against the frame's width.
column_to_move = df.pop("date")
df.insert(loc=92, column="date", value=column_to_move)
df


In [None]:
## Keep date - modify existing code
## Plot feature importance
# Modelling frame: every column from position 10 onward, minus the two
# t-SNE embedding columns.
ml_df = df.iloc[:, 10:].drop(columns=['tSNE1', 'tSNE2'])
ml_df

In [None]:
# Parse 'date' (month/day/two-digit year) into datetimes.
ml_df['date'] = pd.to_datetime(ml_df['date'], format='%m/%d/%y')

# Express each date as the whole number of days since 1970-01-01.
unix_epoch = pd.Timestamp('1970-01-01')
one_day = pd.Timedelta('1D')
ml_df['date_numeric'] = (ml_df['date'] - unix_epoch) // one_day
ml_df

In [None]:

from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
import numpy as np

## MULTI OUTPUT CLASSIFIER
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# deprecation notices included; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Predictors: salinity, temperature and stratification columns.
features = ml_df[['Surface_Salinity','Bottom_Salinity','salt_strat', 'Surface_Temp','Bottom_Temp','temp_strat']]

# Targets: one 0/1 column per species (1 where the value is > 0, else 0).
# A vectorised comparison is used instead of the element-wise `applymap`,
# which pandas deprecated in 2.1.
labels = (ml_df.loc[:, 'Atl_croaker_(nibea98)':'Atl_salmon'] > 0).astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

# Initialize and train the multi-output Random Forest classifier
# (one forest is fitted per label column).
rf_classifier = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Per-species accuracy plus a full classification report for each species.
accuracies = []
for i, species in enumerate(labels.columns):
    accuracy = accuracy_score(y_test.iloc[:, i], y_pred[:, i])
    accuracies.append(accuracy)

    print(f"Classification report for {species}:\n", classification_report(y_test.iloc[:, i], y_pred[:, i]))

# Calculate overall average accuracy across all species
average_accuracy = np.mean(accuracies)
average_accuracy

# # Evaluate the model
# for i, species in enumerate(labels.columns):
#     print(f"Classification report for {species}:\n", classification_report(y_test.iloc[:, i], y_pred[:, i]))

In [None]:
## Plot
feature_importance_df = pd.DataFrame() # type: ignore
for i, species in enumerate(labels.columns):
    importances = rf_classifier.estimators_[i].feature_importances_
    feature_importance_df[species] = importances

# Calculate mean feature importance across all species
feature_importance_df['mean_importance'] = feature_importance_df.mean(axis=1)

# Plot feature importances
plt.figure(figsize=(10, 8))
plt.barh(features.columns, feature_importance_df['mean_importance'])
plt.xlabel('Feature Importance')
plt.title('Mean Feature Importance Across All Species')
plt.gca().invert_yaxis()
plt.show()

### Re-run without rare species (detected in fewer than 20% of samples)

In [None]:
# Number of trailing columns treated as non-species columns.
# Adjust this if the non-species columns are not the last ten.
n_non_species = 10
species_columns = ml_df.columns[:-n_non_species]
non_species_columns = ml_df.columns[-n_non_species:]

# A species is kept when it has a non-zero value in at least 20% of the rows.
threshold = len(ml_df) * 0.2
species_counts = ml_df[species_columns].gt(0).sum()
filtered_species = species_counts.index[(species_counts >= threshold).to_numpy()]

# Kept species first, then the non-species columns; rows with any missing
# value are dropped.
filtered_df = pd.concat([ml_df[filtered_species], ml_df[non_species_columns]], axis=1).dropna()

filtered_ml_df = filtered_df
filtered_ml_df.to_csv('test')

In [None]:
## MULTI OUTPUT CLASSIFIER
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# deprecation notices included; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Predictors: temperature, salinity and stratification columns of the
# rare-species-filtered frame.
features = filtered_ml_df[['Surface_Temp','Bottom_Temp','Surface_Salinity','Bottom_Salinity','temp_strat','salt_strat']]
## with date
# features = ml_df[['Surface_Salinity','Bottom_Salinity','salt_strat', 'Surface_Temp','Bottom_Temp','temp_strat','date_numeric']]

# Targets: one 0/1 column per retained species (1 where the value is > 0, else 0).
# A vectorised comparison is used instead of the element-wise `applymap`,
# which pandas deprecated in 2.1.
labels = (filtered_ml_df.loc[:, 'Atl_croaker_(nibea98)':'Winter_or_Yellowtail_flounder'] > 0).astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Initialize and train the multi-output Random Forest classifier
# (one forest is fitted per label column).
rf_classifier = MultiOutputClassifier(
    RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        min_samples_split=5,
        min_samples_leaf=2,
        max_depth=None,
        bootstrap=True,
    )
)
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Per-species accuracy plus a full classification report for each species.
accuracies = []
for i, species in enumerate(labels.columns):
    accuracy = accuracy_score(y_test.iloc[:, i], y_pred[:, i])
    accuracies.append(accuracy)

    print(f"Classification report for {species}:\n", classification_report(y_test.iloc[:, i], y_pred[:, i]))

# Calculate overall average accuracy across all species
average_accuracy = np.mean(accuracies)
average_accuracy

# # Evaluate the model
# for i, species in enumerate(labels.columns):
#     print(f"Classification report for {species}:\n", classification_report(y_test.iloc[:, i], y_pred[:, i]))

In [None]:

# Collect one column of feature importances per species, taken from the
# forest fitted for that species, then build the frame in one go.
importance_by_species = {}
for species, fitted_forest in zip(labels.columns, rf_classifier.estimators_):
    importances = fitted_forest.feature_importances_
    importance_by_species[species] = importances
feature_importance_df = pd.DataFrame(importance_by_species)

# Calculate mean feature importance across all species
feature_importance_df['mean_importance'] = feature_importance_df.mean(axis=1)

# Plot feature importances as horizontal bars, first feature at the top.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(features.columns, feature_importance_df['mean_importance'])
ax.set_xlabel('Feature Importance')
ax.set_title('Mean Feature Importance Across All Species')
ax.invert_yaxis()
plt.show()
# NOTE(review): `importances` holds only the values for the last species
# visited by the loop above, not the mean — confirm this is the intended output.
importances
# feature_importance_df


## Hyperparameter tuning
Finding the best combination of hyperparameters for the model (TBD 7/19)

In [None]:
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split 
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

#### Randomizedsearch

In [None]:
# Candidate settings for the random forest wrapped by MultiOutputClassifier;
# the 'estimator__' prefix routes each setting to the wrapped forest.
param_dist = {
    'estimator__n_estimators' : [50,100,200,500],
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__bootstrap': [True, False]
}

# NOTE(review): X_train / y_train / X_test / y_test are whatever the most
# recently run modelling cell produced — confirm which split is intended.

# The search clones and refits this estimator for every candidate, so it is
# not fitted beforehand. Seeding the forest makes the search repeatable.
rf_classifier = MultiOutputClassifier(RandomForestClassifier(random_state=42))

# Sample 20 parameter combinations, each scored with 3-fold cross-validation.
randomized_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist, n_iter=20, cv=3, random_state=42)
randomized_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params_rand = randomized_search.best_params_
best_model_rand = randomized_search.best_estimator_

# Evaluate the best model
y_pred_best_rand = best_model_rand.predict(X_test)
# With a 2-D multi-label target, accuracy_score is exact-match accuracy: a
# sample counts as correct only if every species is predicted correctly.
accuracy_best_rand = accuracy_score(y_test, y_pred_best_rand)
print(f"Best RF Accuracy: {accuracy_best_rand:.2f}")
print(f"Best Hyperparameters: {best_params_rand}")

# Mean per-species accuracy, comparable with `average_accuracy` in the other
# modelling cells (all label columns have the same number of rows, so the
# element-wise mean equals the mean of the per-column accuracies).
mean_label_accuracy_best_rand = (np.asarray(y_test) == y_pred_best_rand).mean()
print(f"Mean per-species accuracy: {mean_label_accuracy_best_rand:.2f}")


#### Gridsearch

Best parameters according to previous grid search: {'estimator__bootstrap': True, 'estimator__max_depth': None, 'estimator__max_features': 'auto', 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2}

Best Hyperparameters from Random Search: {'estimator__min_samples_split': 5, 'estimator__min_samples_leaf': 2, 'estimator__max_depth': None, 'estimator__bootstrap': True}

Next step: define a new grid around the Random Search best parameters above. The cell below refits a single model using those parameters.

In [None]:
# Predictors: temperature, salinity and stratification columns of the full
# (unfiltered) modelling frame.
features = ml_df[['Surface_Temp','Bottom_Temp','Surface_Salinity','Bottom_Salinity','temp_strat','salt_strat']]
# Targets: one 0/1 column per species (1 where the value is > 0, else 0).
# A vectorised comparison is used instead of the element-wise `applymap`,
# which pandas deprecated in 2.1.
labels = (ml_df.loc[:, 'Atl_croaker_(nibea98)':'Atl_salmon'] > 0).astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

# Multi-output random forest with explicitly chosen tree settings
# (one forest is fitted per label column).
rf_classifier = MultiOutputClassifier(
    RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=2,
        min_samples_split=5,
        max_depth=None,
        bootstrap=True,
        random_state=42,
    )
)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Per-species accuracy plus a full classification report for each species.
accuracies = []
for i, species in enumerate(labels.columns):
    accuracy = accuracy_score(y_test.iloc[:, i], y_pred[:, i])
    accuracies.append(accuracy)

    print(f"Classification report for {species}:\n", classification_report(y_test.iloc[:, i], y_pred[:, i]))
    # print(f"Accuracy for {species}: {accuracy}")

# Calculate overall average accuracy across all species
average_accuracy = np.mean(accuracies)
average_accuracy
## avg accuracy of 88.9%, improvement of 0.1% from default