In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
from scipy.signal import argrelextrema
import numpy as np
from sklearn import cluster
import seaborn; seaborn.set()

In [2]:
def find_peaks(sample_path, plot=False, threshold_factor=4):
    """
    finding the peaks of chromatography
    :param sample_path: full path of the tic front file to analyze
    :param plot: boolean - True if you want to plot the graph with the peaks
    :param threshold_factor: larger threshold factor will cause to function to be more sensitive to local peaks
    :return: filtered_iloc_max - list of the indexes of the peaks, df1 - the dataframe of the chromatography.
    """
    df1 = pd.read_csv(sample_path, names=['time', 'value']).iloc[1:]
    ilocs_max = argrelextrema(df1.value.values, np.greater_equal, order=10)[0]
    if len(ilocs_max) > 0:
        if ilocs_max[0] == 0:
            ilocs_max = ilocs_max[1:]
    iloc_max_list = list(ilocs_max)
    df_median = df1['value'][ilocs_max].median()
    max_val = df1['value'][ilocs_max].max()
    threshold_factor = max_val ** (1 / threshold_factor) / 2
    filtered_iloc_max = list(filter(lambda x: df1['value'][x] > df_median * threshold_factor, iloc_max_list))
    # plot the chromatography with the peaks
    if plot:
        df1.value.plot(figsize=(20,8), alpha=.3)
        df1.iloc[filtered_iloc_max].value.plot(style='.', lw=10, color='red')
        title_name = sample_path.split('/')
        plt.title(title_name[2] + "_" + title_name[3])
    return filtered_iloc_max, df1

In [3]:
# creating a dataframe of all the samples of the Drug - auto method

methods_dict = pd.read_csv("df_dic.csv")
methods_dict.drop(methods_dict.columns[0], axis=1, inplace=True)
drug_auto_df = pd.DataFrame()
drug_auto_df["sample"] = methods_dict["DRUG-AUTO.M"]
num_of_peaks = []
peak_locations = []
peak_values = []
sample_list = drug_auto_df['sample'].to_list()
samples = 0

In [4]:
for sample in sample_list:
    tic_front_path = sample[1:] + 'tic_front.csv'
    peaks, df = find_peaks(tic_front_path)
    num_of_peaks.append(len(peaks))
    peak_locations.append(peaks)
    peak_values.append(df['value'][peaks].values)
    samples += 1

In [5]:
drug_auto_df["num_of_peaks"] = num_of_peaks
peak1_index = []
peak1_value = []
peak2_index = []
peak2_value = []
peak3_index = []
peak3_value = []
peak4_index = []
peak4_value = []
peak5_index = []
peak5_value = []
peak6_index = []
peak6_value = []
new_columns = [(peak1_index, peak1_value), (peak2_index, peak2_value), (peak3_index, peak3_value), (peak4_index, peak4_value), (peak5_index, peak5_value), (peak6_index, peak6_value)]


for i in range(6):
    # print(new_columns[i][0])
    for j in range(samples):
        if len(peak_locations[j]) > i:
            new_columns[i][0].append(peak_locations[j][i])
        else:
            new_columns[i][0].append(-1)
        if len(peak_values[j]) > i:
            new_columns[i][1].append(peak_values[j][i])
        else:
            new_columns[i][1].append(-1)


In [6]:
for i in range(6):
    name = "peak_" + str(i+1) + "_index"
    drug_auto_df[name] = new_columns[i][0]
    name = "peak_" + str(i+1) + "_value"
    drug_auto_df[name] = new_columns[i][1]

# drug_auto_df.to_csv("drug_auto_df_times.csv")


In [7]:
one_df = pd.DataFrame()
two_df = pd.DataFrame()
three_df = pd.DataFrame()
four_df = pd.DataFrame()
five_df = pd.DataFrame()
dfs = [one_df, two_df, three_df, four_df, five_df]

for i in range(5):
    dfs[i] = drug_auto_df[drug_auto_df['num_of_peaks'] == i + 1]
    dfs[i] = dfs[i].drop([col for col in dfs[i].columns if dfs[i][col].eq(-1).any()], axis=1)
    dfs[i] = dfs[i].loc[:, dfs[i].columns != 'num_of_peaks']


# for i in range(len(dfs)):
#     name = 'drug_auto_' + str(i+1) + '_peaks.csv'
#     dfs[i].to_csv(name)
#

In [8]:
for i in range(5):
    for j in range(i):
        name = 'difference_' + str(j+1)
        peak1 = 'peak_' + str(j+1) + '_index'
        peak2 = 'peak_' + str(j+2) + '_index'
        dfs[i][name] = dfs[i][peak2] - dfs[i][peak1]



# dfs[1]['difference'] = dfs[1]['peak_2_index'] - dfs[1]['peak_1_index']
# print(dfs[1]['peak_2_index'])
def scale_data(df):
  for col in df.columns:
    df[col] = absolute_maximum_scale(df[col])
  return df


def absolute_maximum_scale(series):
    return series / series.max()

# cols_for_norm = dfs[1][['peak_1_value','peak_2_value']].transpose()
# cols_for_norm=scale_data(cols_for_norm).transpose()
#
# dfs[1]['peak_1_value']=cols_for_norm['peak_1_value']
# dfs[1]['peak_2_value']=cols_for_norm['peak_2_value']

In [9]:
# new_df = dfs[1].loc[:, dfs[1].columns != 'sample']
# new_df = dfs[1]['difference'].values.reshape(-1, 1)
# new_df = dfs[1][['peak_1_index','difference_1']]
# clustering = cluster.KMeans(n_clusters=32)
# fitted_data = clustering.fit_predict(new_df)
# dfs[1]['cluster_omer'] = fitted_data
# dfs[1].to_csv('drug_auto_2_peaks.csv')

new_df = dfs[2][['peak_1_index','difference_1', 'difference_2']]
clustering = cluster.KMeans(n_clusters=32)
fitted_data = clustering.fit_predict(new_df)
dfs[2]['cluster_omer'] = fitted_data
# dfs[2].to_csv('drug_auto_3_peaks.csv')


[ 0  0 10  0  2  0  0  2  2 13  2  0  0  0  9  0 23  0 25  8  2 25  8  8
  2 11  2  2 23 11 11 31  2  2 25 11 23  0  0  0  0  2 27  8 12  0 19 18
  7 18  9  0  2  0 17 28  8  0  0  0  0  0 23  9  0  0  0  0  0  0 16  0
 16  9 17  0  2  2  0  0  6  6  6 14  1 14 14 14  4  1  1  6 20 26 20  6
 28  2 28 28 28 28  6 21 15  6  4 30  6  4 24  1  1 22 22  5 22 28 28  3
 28 28 28 28 29]


In [10]:
# printing examples of the basic cluster
# index = 0
# for sample in dfs[1]['sample'].to_list():
#     tic_front_path = sample[1:] + 'tic_front.csv'
#     if fitted_data[index] == 8:
#         if index < 550:
#             find_peaks(tic_front_path, True)
#             plt.show()
#     index += 1

In [None]:
def plot_samples_from_same_cluster(df, cluster_number):
    plots = 0
    relevant_samples = df[df['cluster_omer'] == cluster_number]
    print("number of samples in cluster " + str(cluster_number) + ": " + str(len(relevant_samples)))
    for sample in relevant_samples['sample'].to_list():
        tic_front_path = sample[1:] + 'tic_front.csv'
        if plots < 7:
            # find_peaks(tic_front_path, True)
            data = pd.read_csv(tic_front_path, names=['x', 'y']).iloc[1:]
            title_name = sample[1:].split('/')
            title_ = "cluster: " + str(cluster_number) + ", sample: " + title_name[2] + "/" + title_name[3]
            data.plot(x='x', y='y', title=title_)
            file_name = "clusters_plot_omer/cluster_" + str(cluster_number) + "_graph_" + str(plots) + ".png"
            # plt.savefig(file_name)
            plt.show()
        plots += 1

# plot_samples_from_same_cluster(three_peaks_df, 18)

# for i in range(32):
#     plot_samples_from_same_cluster(two_peaks_df, i)


In [None]:
two_peaks_df = pd.read_csv('drug_auto_2_peaks.csv')
three_peaks_df = pd.read_csv('drug_auto_3_peaks.csv')

index_list = []
for i in range(len(two_peaks_df)):
    if i % 7 == 0:
        index_list.append(i)
#
index_list = index_list[:50]

matrix_df = two_peaks_df.iloc[index_list]


In [None]:
omer_cluster_matrix = []
yaara_clusters = [13, 11, 2, 14, 14, 12, 2, 5, 14, 7, 15, 13, 2, 13, 7, 14, 2, 8, 13, 2, 9, 2, 15, 2,
                  2, 16, 2, 14, 2, 13, 7, 15, 2, 13, 11, 10, 7, 7, 6, 17, 1, 1, 4, 1, 3, 2, 3, 1, 18, 19]
yaara_cluster_matrix = []


for i in range(50):
    line = []
    for j in range(50):
        if yaara_clusters[i] == yaara_clusters[j]:
            line.append(1)
        else:
            line.append(0)
    yaara_cluster_matrix.append(line)



for i in range(50):
    line = []
    for j in range(50):
        if matrix_df['cluster_omer'][index_list[i]] == matrix_df['cluster_omer'][index_list[j]]:
            line.append(1)
        else:
            line.append(0)
    omer_cluster_matrix.append(line)


samples_matrix = []

for i in range(50):
    line = []
    for j in range(50):
        line.append((matrix_df['sample'][index_list[i]], matrix_df['sample'][index_list[j]]))
    samples_matrix.append(line)


In [None]:
np_omer_matrix = np.array(omer_cluster_matrix)

np_yaara_matrix = np.array(yaara_cluster_matrix)

tensor_method = np.stack([np_omer_matrix, np_yaara_matrix])

tensor_sum = np.sum(tensor_method, axis=0)

print(tensor_sum)

# interesting_samples = np.where((tensor_sum == 1) | (tensor_sum == 2))[0]

# print(interesting_samples)

samples_np = np.array(samples_matrix)

interesting_samples = samples_np[(tensor_sum == 1) | (tensor_sum == 2)]

# print(interesting_samples.shape)

# print(np.sum(tensor_sum == 2))