In [89]:
## Import libraries
import math
import matplotlib
import matplotlib.pyplot as plt
## remove this line when running script from terminal, keep it when running notebooks
%matplotlib inline 

import numpy as np
import pandas as pd

import os # to join pathrs, etc..
import pickle # to store the models

In [90]:
results_folder = 'results'

if not os.path.exists(results_folder):
    os.makedirs(results_folder)

    models_folder = os.path.join("../", "part-2", "results")
peaks_path = "output"
peaks_files =  [os.path.join(peaks_path, file) for file in os.listdir(peaks_path) if file.endswith('csv') ]
print peaks_files

['output/BD18_1711291249_ims.csv', 'output/BD18_1711291317_ims.csv', 'output/BD18_1711291251_ims.csv', 'output/BD18_1711291328_ims.csv', 'output/BD18_1711291320_ims.csv', 'output/BD18_1711291245_ims.csv']


In [91]:
def read_model( path ):
    """Helper function which loads models from pickle python files"""
    path = os.path.join(models_folder,path + '.pck' )
    with open( path  , 'r') as f:
        model = pickle.load(f)
    return model

def read_files( paths ):
    # reads all the files
    raw_peaks = [pd.read_csv(path, sep="\t") for path in paths ]
    # combines them all in one dataframe
    df = pd.concat(raw_peaks).reset_index(drop=True)
    # converts peak_name column into integers
    peaks = [ int(peak.replace('p','')) for peak in df['peak_name']] 
    df['peak_name'] = peaks
    return df

def create_train_matrix( data , n_clusters):
    total_clusters = n_clusters
    d = { colname: [0] * total_clusters for colname in data['measurement_name'].unique() }
    matrix = pd.DataFrame(data=d)
    ## fill the matrix
    for name in data['measurement_name'].unique():
        patient = data[data['measurement_name'] == name]
        clusters = patient['cluster_id']
        #for cluster in clusters:
        matrix[name][clusters] = 1
    
    return matrix.transpose()

def write_csv( names, labels, filename):
    path = os.path.join(results_folder, filename +".csv")
    with open(path, "w") as f:
        f.write("file,candy\n")
        for name, label, in zip(names, labels):
            f.write("%s,%s\n"%(name, label))
            


In [101]:
#training kmeans again with all the data files of 3 parts
peaks_files =  [
    os.path.join(peaks_path, file) 
    for file in os.listdir(peaks_path)
        if file.endswith('csv') 
] + [
    os.path.join("../part-1/output", file) 
    for file in  os.listdir("../part-1/output")
        if file.endswith('csv') 
] + [
    os.path.join("../part-2/output", file) 
    for file in  os.listdir("../part-2/output")
        if file.endswith('csv') 
]

def align_peaks( data, peaks=90):
    KM = KMeans(n_clusters=peaks, n_init=100)
    KM.fit(data[['t', 'r']])
    data['cluster_id'] = KM.labels_
    return KM, data  


print peaks_files

['output/BD18_1711291249_ims.csv', 'output/BD18_1711291317_ims.csv', 'output/BD18_1711291251_ims.csv', 'output/BD18_1711291328_ims.csv', 'output/BD18_1711291320_ims.csv', 'output/BD18_1711291245_ims.csv', '../part-1/output/BD18_1408280841_ims.csv', '../part-1/output/BD18_1408280826_ims.csv', '../part-1/output/BD18_1408280844_ims.csv', '../part-1/output/BD18_1408280851_ims.csv', '../part-1/output/BD18_1408280834_ims.csv', '../part-1/output/BD18_1408280838_ims.csv', '../part-2/output/BD18_1711291800_ims.csv', '../part-2/output/BD18_1711291732_ims.csv', '../part-2/output/BD18_1711291722_ims.csv', '../part-2/output/BD18_1711291649_m2_ims.csv', '../part-2/output/BD18_1711291712_ims.csv', '../part-2/output/BD18_1711291725_ims.csv', '../part-2/output/BD18_1711291709_ims.csv', '../part-2/output/BD18_1711291756_ims.csv', '../part-2/output/BD18_1711291652_ims.csv', '../part-2/output/BD18_1711291746_ims.csv', '../part-2/output/BD18_1711291656_ims.csv', '../part-2/output/BD18_1711291743_ims.csv', 

In [103]:
from sklearn.cluster import KMeans
all_df = read_files( peaks_files )
CLUSTERS = int(np.average([ all_df[all_df['measurement_name'] == name].shape[0] for name in all_df['measurement_name'].unique()]) * 1.5)
print "Guessing for  ", CLUSTERS, " clusters"
k_means, aligned_all_df = align_peaks( all_df ,peaks = CLUSTERS)

Guessing for   92  clusters


In [None]:
KM = k_means

In [118]:
# training random forests with all the data with the previous 2 parts, and testing with the data of the part 3
train_files = [ file for file in  os.listdir(peaks_path) if file.endswith('csv') ] 
#aligned_all_df[[aligned_all_df['measurement_name'] in train_files]]

# select only the elements that are in the list of files of the part 3
test_df = aligned_all_df[aligned_all_df['measurement_name'].isin(train_files)]

# now, select the elements that are not in that list
train_df = aligned_all_df[~aligned_all_df['measurement_name'].isin(train_files)]

In [161]:
# create matrices for both
train_matrix_df = create_train_matrix(train_df, KM.n_clusters)
test_matrix_df = create_train_matrix(test_df, KM.n_clusters)

In [162]:
# now, create the list of labels for the training
# the first part labels need some cleaning
part_1_df = pd.read_csv("../part-1/files categories.txt", sep=",")
part_1_df = part_1_df.apply(lambda x: x.str.strip())
part_1_df = part_1_df.rename(columns=lambda x: x.strip())
part_1_df['file'] = part_1_df['file'].apply(lambda x: x +'_ims.csv')
part_1_df['candy'][part_1_df['candy'] == 'halls_citruzzz'] = 'citrus'

# load labels second part
part_2_df = pd.read_csv("../part-2/all_labels.txt", sep="\t")

# merge labels
all_labels = pd.concat([part_1_df, part_2_df]).reset_index(drop=True)

# set file column as index
labels_df = all_labels[['file', 'candy']].set_index('file')

In [163]:
labels_df

Unnamed: 0_level_0,candy
file,Unnamed: 1_level_1
BD18_1408280826_ims.csv,halls
BD18_1408280841_ims.csv,halls
BD18_1408280844_ims.csv,halls
BD18_1408280834_ims.csv,citrus
BD18_1408280838_ims.csv,citrus
BD18_1408280851_ims.csv,citrus
BD18_1711291646_ims.csv,citrus
BD18_1711291649_m2_ims.csv,citrus
BD18_1711291652_ims.csv,citrus
BD18_1711291652_ims.csv,citrus


In [164]:
train_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
BD18_1408280826_ims.csv,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
BD18_1408280834_ims.csv,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
BD18_1408280838_ims.csv,1,0,1,0,0,0,1,1,0,0,...,0,0,1,0,1,0,0,0,0,0
BD18_1408280841_ims.csv,1,1,1,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
BD18_1408280844_ims.csv,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
BD18_1408280851_ims.csv,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
BD18_1711291646_ims.csv,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
BD18_1711291649_m2_ims.csv,1,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
BD18_1711291652_ims.csv,1,0,1,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
BD18_1711291656_ims.csv,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF = RF.fit(train_matrix_df.as_matrix(), labels_df['candy'])

In [92]:
# LOAD MODELS TRAINED IN PART 2
# KM = read_model("model_k_means")
# RF = read_model("model_random_forests")
DT = read_model("model_decision_tree")

## Step 1

Use the code from part I to process and align this data set to create a second matrix M: peaks x volunteers , which will serve as test data

In [93]:
# create a dataframe with everything, same as usual
df = read_files( peaks_files )

# predict peaks for new data
df['cluster_id'] = KM.predict(df[['t', 'r']])

# get the matrix
test_matrix_df = create_train_matrix(df, KM.n_clusters)


In [94]:
test_matrix_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
BD18_1711291245_ims.csv,0,1,0,0,0,1,1,0,0,1,...,0,0,0,0,0,1,0,0,1,0
BD18_1711291249_ims.csv,1,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
BD18_1711291251_ims.csv,0,1,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
BD18_1711291317_ims.csv,0,1,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
BD18_1711291320_ims.csv,0,1,1,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
BD18_1711291328_ims.csv,0,1,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Step 2
Apply your classifier learned in part II to this new data
sets.

In [166]:
prediction = RF.predict(test_matrix_df.as_matrix())
print prediction

['halls' 'halls' 'halls' 'citrus' 'citrus' 'citrus']


In [167]:
write_csv(test_matrix_df.index, prediction, "random_forests_prediction")

In [82]:
test_matrix_df.index

Index([u'BD18_1711291245_ims.csv', u'BD18_1711291249_ims.csv',
       u'BD18_1711291251_ims.csv', u'BD18_1711291317_ims.csv',
       u'BD18_1711291320_ims.csv', u'BD18_1711291328_ims.csv'],
      dtype='object')

In [84]:
best_features = [13, 8, 63, 74, 32] # copy pasted from part 2
dt_prediction = DT.predict(test_matrix_df[best_features].as_matrix())
print dt_prediction

['halls' 'halls' 'citrus' 'citrus' 'citrus' 'citrus']


In [86]:
prediction

array(['halls', 'halls', 'citrus', 'citrus', 'citrus', 'citrus'], dtype=object)