# Transfer of GI knowledge from Yeast to Arabidopsis for the following scenarios:
1. Using only the SMF data
2. Using all the neutrality functions
3. Using only the established neutrality functions (Mani et al. 2008, PNAS)

### Goal:
The established Arabidopsis models serve as the performance baseline. The idea is to
use the yeast model, which performed very well, to improve prediction accuracy
on the much smaller Arabidopsis dataset.

### Scenario 1. Using only the SMF data

In [8]:
import joblib
import pandas as pd
import xgboost as xgb

# Read the Arabidopsis feature table (first column becomes the index) and the
# header-less file whose column 0 lists the held-out test instances.
ara = pd.read_csv(
    '../ara_data/1_feature_tables/20240923_melissa_ara_data_features/W_TSC_emmean_feature_table.tsv',
    sep='\t',
    index_col=0,
)
ara_test_ids = pd.read_csv(
    '../ara_data/1_feature_tables/20240923_melissa_ara_data_features/W_TSC_emmean_test_instances.txt',
    sep='\t',
    header=None,
)[0]

# Rename the two Arabidopsis input columns in place.
# NOTE(review): presumably these are the feature names the yeast model was
# trained on -- confirm against the yeast training table.
ara.rename(columns={'MA_new': 'Query_SMF_new', 'MB_new': 'Array_SMF_new'}, inplace=True)

# Split rows into train/test by index membership in the test-instance list;
# the label column is kept separate from the two feature columns.
ara_label_col = 'W_TSC_emmean'
ara_feature_cols = ['Query_SMF_new', 'Array_SMF_new']
ara_in_test = ara.index.isin(ara_test_ids)

ara_train = ara.loc[~ara_in_test, ara_feature_cols]
ara_y_train = ara.loc[~ara_in_test, ara_label_col]

ara_test = ara.loc[ara_in_test, ara_feature_cols]
ara_y_test = ara.loc[ara_in_test, ara_label_col]

# Load the saved yeast model. It is applied as-is here (no refit on the
# Arabidopsis training rows) to predict the Arabidopsis test rows.
yeast_mod = joblib.load('../output/1_xgb_regression_yeast/20241016_results/DMF_from_SMF_only_model_rep_9.pkl')
ara_y_test_preds = yeast_mod.predict(ara_test)

In [9]:
from sklearn.metrics import r2_score

# R^2 on the Arabidopsis test rows for the predictions made by the loaded
# yeast model without any refitting.
r2_score(y_true=ara_y_test, y_pred=ara_y_test_preds)

0.5503288575800729

In [10]:
# Refit the loaded model object on the Arabidopsis training rows, then score
# its predictions on the Arabidopsis test rows.
# NOTE(review): if yeast_mod is an xgboost scikit-learn estimator, fit() called
# without `xgb_model=` presumably trains a new model from scratch (reusing only
# the hyperparameters) instead of continuing from the yeast trees -- confirm
# this is the comparison intended for the "transfer" scenario.
yeast_mod.fit(ara_train, ara_y_train)
ara_y_test_preds_refit = yeast_mod.predict(ara_test)
r2_score(ara_y_test, ara_y_test_preds_refit)

0.20292576608715895