In [33]:
import glob
import os

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

In [2]:
# Discover the input tables with the standard library instead of shelling out to `ls`,
# so the cell also runs outside IPython and on platforms without a POSIX shell.
# Each result is a plain list of paths shaped like './<dir>/<name>.csv'; it is sorted so
# the load order is the same on every run, and it is empty when nothing matches.
output_files = sorted(glob.glob('./amplicon_outputs/*.csv'))

primer_files = sorted(glob.glob('./primer_inputs/*.csv'))

In [3]:
# Build `primer_df`: one table holding the selected columns of every file in
# `primer_files`, with an extra 'exp' column carrying the experiment name taken
# from the file name (file stem minus the "_primer_stats" suffix).
PRIMER_SUFFIX = "_primer_stats"
PRIMER_COLUMNS = ['Primer_Name', 'Seq', 'Amplicon',
                  'Primer_Length', 'Tm', 'Delta Tm',
                  'GC_Content', 'Length_Longest_Homopolymer', 'Percent_Homopolymer']

primer_frames = []
for primer_file in primer_files:
    exp_name = os.path.splitext(os.path.basename(primer_file))[0]
    # Remove the literal suffix only. str.rstrip() treats its argument as a *set of
    # characters*, so it would keep stripping into the experiment name itself
    # (e.g. the stem "exp_mart_primer_stats" would come out as "ex").
    if exp_name.endswith(PRIMER_SUFFIX):
        exp_name = exp_name[:-len(PRIMER_SUFFIX)]
    df = pd.read_csv(primer_file, usecols=PRIMER_COLUMNS)
    df['exp'] = exp_name
    primer_frames.append(df)

# Concatenate once at the end: DataFrame.append copied the whole table on every
# iteration and no longer exists in pandas >= 2.0. Row labels are kept as read from
# each file. With no input files the result is an empty frame.
primer_df = pd.concat(primer_frames) if primer_frames else pd.DataFrame([])

In [4]:
# Label every primer row with the text that follows the last underscore of its
# 'Primer_Name'; the next cell compares this label against "F" and "R".
primer_names = primer_df['Primer_Name']
primer_df['flank'] = primer_names.map(lambda name: name.split("_")[-1])

In [5]:
# Split the primer table by its 'flank' label: rows labelled "F" go to `forward`,
# rows labelled "R" go to `reverse`. Rows with any other label end up in neither frame.
is_forward = primer_df['flank'] == "F"
is_reverse = primer_df['flank'] == "R"

forward = primer_df.loc[is_forward]
reverse = primer_df.loc[is_reverse]

In [6]:
# Join each "F" row with the "R" row(s) sharing the same amplicon and experiment.
# This is pandas' default inner join, so rows without a partner on the other side are
# dropped. Non-key columns that exist on both sides get the suffix '_L' (from `forward`)
# or '_R' (from `reverse`).
pair_keys = ["Amplicon", "exp"]
primers = pd.merge(
    forward,
    reverse,
    left_on=list(pair_keys),
    right_on=list(pair_keys),
    suffixes=('_L', '_R'),
)


In [7]:
# Build `output_df`: every CSV listed in `output_files`, stacked into one table.
output_frames = []
for output_file in output_files:
    df = pd.read_csv(output_file)
    output_frames.append(df)

# Concatenate once at the end: DataFrame.append copied the whole table on every
# iteration and no longer exists in pandas >= 2.0. Row labels are kept as read from
# each file. With no input files the result is an empty frame.
output_df = pd.concat(output_frames) if output_frames else pd.DataFrame([])

# Rename the key column so it carries the same name ('Amplicon') as in the primer tables.
output_df = output_df.rename(columns={'amplicon_id': 'Amplicon'})


In [52]:
# Attach the amplicon output rows to the paired primers (inner join on 'Amplicon'),
# ordered by amplicon.
merged_df = (
    pd.merge(primers, output_df, left_on='Amplicon', right_on='Amplicon')
    .sort_values('Amplicon')
)

# Keep only the rows whose 'strain' value is missing.
strain_missing = merged_df['strain'].isna()
merged_df = merged_df.loc[strain_missing]



In [43]:
# Features: the first 23 columns of `merged_df`, selected by position.
# NOTE(review): which columns those are depends on the merge output order — confirm
# that the target column is not among them.
X = merged_df.iloc[:, :23]

# Target: the 'n_reads' column. It keeps the row labels of `merged_df`.
y = merged_df['n_reads']

In [41]:
# Restrict the feature table to its numeric columns; text columns are left out.
X_number = X.select_dtypes(include='number')

In [42]:
# Rescale every numeric feature column to the [0, 1] range.
# The scaler is fitted on all rows of `X_number`; afterwards `X_number` is rebound to
# the transformed values returned by the scaler.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_number)
X_number = scaler.transform(X_number)

In [1]:
# NOTE(review): disabled draft of a 10-fold cross-validation of an RBF-kernel SVR.
# Two things to fix before re-enabling it:
#   * the last three statements (train/test split, fit, score) are not indented under
#     the `for` loop, so as written they would run once, after the loop, with the
#     indices of the final fold only;
#   * `y` is a pandas Series taken from the sorted and filtered `merged_df`, while
#     `train_index` / `test_index` are positions, so `y[train_index]` would look them
#     up as index labels — `y.iloc[...]` is presumably what is intended (confirm).
# scores = []
# best_svr = SVR(kernel='rbf')
# cv = KFold(n_splits=10, random_state=42, shuffle=True)
# for train_index, test_index in cv.split(X_number):
#     print("Train Index: ", train_index, "\n")
#     print("Test Index: ", test_index)

# X_train, X_test, y_train, y_test = X_number[train_index], X_number[test_index], y[train_index], y[test_index]
# best_svr.fit(X_train, y_train)
# scores.append(best_svr.score(X_test, y_test))