In [66]:
import glob
import os

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

pd.options.mode.chained_assignment = None

In [2]:
# Collect the input CSV paths with glob instead of `!ls`: glob needs no shell,
# and it returns an empty list when nothing matches, whereas `!ls` captures the
# shell's error text as if it were a path. Sorting keeps the order deterministic.
output_files = sorted(glob.glob('./amplicon_outputs/*.csv'))

primer_files = sorted(glob.glob('./primer_inputs/*.csv'))

In [102]:
# Build one table of primer statistics across all experiments.
#
# Each primer file is named "<experiment>_primer_stats.csv". The experiment name
# is stored in an `exp` column, and three families of per-target columns
# (selected by column-name suffix) are collapsed into row-wise max / mean /
# median summaries.

# File-name suffix that follows the experiment name.
PRIMER_STATS_SUFFIX = "_primer_stats"
# Column-name suffixes of the metric families that get summarised per row.
HYBRID_METRICS = ('hybrid_score', 'percent_hybrid', 'hybrid_max_run')

primer_frames = []

for primer_file in primer_files:
    exp_name = os.path.splitext(os.path.basename(primer_file))[0]
    # Remove the suffix explicitly: str.rstrip() strips any trailing characters
    # from the given *set*, so it can also eat into the experiment name itself
    # (e.g. "Ba_primer_stats" would become "B").
    if exp_name.endswith(PRIMER_STATS_SUFFIX):
        exp_name = exp_name[:-len(PRIMER_STATS_SUFFIX)]

    df = pd.read_csv(primer_file)
    df['exp'] = exp_name

    for metric in HYBRID_METRICS:
        # The summary columns added below end in _max/_mean/_median, so they
        # never match the suffix filter themselves.
        metric_values = df.loc[:, df.columns.str.endswith(metric)]
        # Vectorised .max() skips NaN like .mean()/.median() do; the builtin
        # max() applied row by row handles NaN depending on its position.
        df[metric + '_max'] = metric_values.max(axis='columns')
        df[metric + '_mean'] = metric_values.mean(axis='columns')
        df[metric + '_median'] = metric_values.median(axis='columns')

    primer_frames.append(df)

# Concatenate once: DataFrame.append in a loop copies the accumulated frame on
# every iteration and no longer exists in pandas >= 2.0.
primer_df = pd.concat(primer_frames) if primer_frames else pd.DataFrame([])

primer_df = primer_df[['Primer_Name', 'Seq', 'Amplicon',
                       'Primer_Length', 'Tm', 'Delta Tm', 'exp',
                       'GC_Content', 'Length_Longest_Homopolymer', 'Percent_Homopolymer',
                       'hybrid_score_max', 'hybrid_score_mean', 'hybrid_score_median',
                       'percent_hybrid_max', 'percent_hybrid_mean', 'percent_hybrid_median',
                       'hybrid_max_run_max', 'hybrid_max_run_mean', 'hybrid_max_run_median']]
primer_df.head()

Unnamed: 0,Primer_Name,Seq,Amplicon,Primer_Length,Tm,Delta Tm,exp,GC_Content,Length_Longest_Homopolymer,Percent_Homopolymer,hybrid_score_max,hybrid_score_mean,hybrid_score_median,percent_hybrid_max,percent_hybrid_mean,percent_hybrid_median,hybrid_max_run_max,hybrid_max_run_mean,hybrid_max_run_median
0,48_F,CACTACCGATTTCTGGATAAAAA,48,23,48.911218,4.07491,Ba_1,34.782609,5,0.347826,26.4,16.624561,16.4,100.0,52.915117,42.261905,7.0,4.078947,4.0
1,11_R,TATACCTTCTATACTCATGTTAA,11,23,43.522139,1.314169,Ba_1,26.086957,2,0.0,31.0,16.231579,15.4,50.0,40.62074,40.0,9.0,4.096491,4.0
2,48_R,TCATAGTATTCTTAGATATGTGT,48,23,43.897529,0.938779,Ba_1,26.086957,2,0.0,27.6,15.507018,15.2,75.0,46.291815,50.0,7.0,4.157895,4.0
3,2_F,AGACAACGAGTTGCATTT,2,18,46.747627,1.911319,Ba_1,38.888889,3,0.166667,23.0,15.770175,15.6,75.0,46.279431,50.0,6.0,3.780702,4.0
4,4_F,TTATCGTAACGGAATTAC,4,18,40.937899,3.898409,Ba_1,33.333333,2,0.0,24.0,15.389474,15.1,50.0,40.436201,40.0,8.0,3.991228,4.0


In [103]:
# The flank label is whatever follows the last underscore of the primer name
# (the names shown above end in "_F" or "_R").
primer_df['flank'] = [name.split("_")[-1] for name in primer_df['Primer_Name']]

In [104]:
# Split the primer table by flank so the two primers of an amplicon can be
# joined side by side.
forward = primer_df.loc[primer_df['flank'].eq("F")]
reverse = primer_df.loc[primer_df['flank'].eq("R")]

In [105]:
# One row per (Amplicon, exp) pair: forward-primer columns get the _L suffix,
# reverse-primer columns get _R.
primers = pd.merge(
    forward,
    reverse,
    on=["Amplicon", "exp"],
    suffixes=('_L', '_R'),
)

In [106]:
# Stack every amplicon output table into one frame.
# The frames are collected in a list and concatenated once: DataFrame.append in
# a loop copies the accumulated frame on each iteration and no longer exists in
# pandas >= 2.0.
output_frames = []

for output_file in output_files:
    df = pd.read_csv(output_file)
    output_frames.append(df)

output_df = pd.concat(output_frames) if output_frames else pd.DataFrame([])

# Align the key column name with the primer table so the two can be merged.
output_df = output_df.rename(columns={'amplicon_id': 'Amplicon'})


In [107]:
# Attach read counts to each primer pair, order by amplicon, and keep only the
# rows whose `strain` value is missing.
# NOTE(review): the join key is `Amplicon` alone. The displayed result carries a
# single un-suffixed `exp` column, so the output tables appear to have no
# experiment column and each output row is paired with the primers of every
# experiment that shares the amplicon id — confirm this is intended.
merged_df = (
    primers
    .merge(output_df, on='Amplicon')
    .sort_values('Amplicon')
    .loc[lambda frame: frame['strain'].isna()]
)

merged_df.head()

Unnamed: 0,Primer_Name_L,Seq_L,Amplicon,Primer_Length_L,Tm_L,Delta Tm_L,exp,GC_Content_L,Length_Longest_Homopolymer_L,Percent_Homopolymer_L,...,percent_hybrid_median_R,hybrid_max_run_max_R,hybrid_max_run_mean_R,hybrid_max_run_median_R,flank_R,sample_id,sample_name,species,strain,n_reads
54272,1_F,TACGTGGGATGCAAATAAA,1,19,46.445263,0.76986,Ba_V45,36.842105,3,0.473684,...,40.0,6.0,4.025424,4.0,R,S5,A0615,Ba,,19
54108,1_F,TACGTGGGATGCAAATAAA,1,19,46.445263,0.7978,Ba_6_V4,36.842105,3,0.473684,...,40.0,6.0,4.025424,4.0,R,S6,A0706,Ba,,8
54109,1_F,TACGTGGGATGCAAATAAA,1,19,46.445263,0.7978,Ba_6_V4,36.842105,3,0.473684,...,40.0,6.0,4.025424,4.0,R,S2,A0330,Ba,,12
54110,1_F,TACGTGGGATGCAAATAAA,1,19,46.445263,0.7978,Ba_6_V4,36.842105,3,0.473684,...,40.0,6.0,4.025424,4.0,R,S4,A0605,Ba,,3986
54111,1_F,TACGTGGGATGCAAATAAA,1,19,46.445263,0.7978,Ba_6_V4,36.842105,3,0.473684,...,40.0,6.0,4.025424,4.0,R,S5,A0615,Ba,,19


In [108]:
# Features are taken by position, the target is the read count.
# NOTE(review): with the column order built above, the first 23 columns are the
# 20 key/forward-primer columns plus Primer_Name_R, Seq_R and Primer_Length_R —
# confirm that this cut-off is the intended feature set.
N_FEATURE_COLUMNS = 23
X = merged_df.iloc[:, :N_FEATURE_COLUMNS]
y = merged_df['n_reads']

In [10]:
# Keep only the numeric feature columns; string columns such as the primer
# names and sequences are dropped.
X_number = X.select_dtypes(include='number')

In [11]:
# Rescale every numeric feature to the [0, 1] range.
# Note that X_number is rebound to the transformer's output here, so it no
# longer refers to the DataFrame selected in the previous cell.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_number)
X_number = scaler.transform(X_number)

In [12]:
# NOTE(review): disabled draft of a 10-fold cross-validation of an RBF-kernel
# SVR; nothing in this cell executes. Two things to check before re-enabling it:
#   * the last three statements are not indented under the `for` loop, so as
#     written they would run once, after the loop, on the final split only;
#   * `y` is taken from merged_df after sorting and filtering, so it keeps those
#     row labels, while cv.split presumably yields positional indices —
#     `y.iloc[...]` is probably needed instead of `y[...]` (verify).
# scores = []
# best_svr = SVR(kernel='rbf')
# cv = KFold(n_splits=10, random_state=42, shuffle=True)
# for train_index, test_index in cv.split(X_number):
#     print("Train Index: ", train_index, "\n")
#     print("Test Index: ", test_index)

# X_train, X_test, y_train, y_test = X_number[train_index], X_number[test_index], y[train_index], y[test_index]
# best_svr.fit(X_train, y_train)
# scores.append(best_svr.score(X_test, y_test))