In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [7]:
class RandomForest:
    """Train a random forest on pairs of MFCC vectors and score it on several splits.

    Every pairs CSV is read with the columns ``sample_1``, ``sample_2`` and
    ``label``. The feature vector of a pair is the MFCC row of ``sample_1``
    followed by the MFCC row of ``sample_2``; both rows are looked up in the
    MFCC CSV, whose first column holds the sample identifier.

    Parameters
    ----------
    path_data_pairs : str
        Prefix prepended verbatim to the pairs file names (e.g. ``"pairs/"``).
    path_mfccs : str
        Path of the MFCC CSV.
    path_predictions : str
        Prefix prepended verbatim to every output file name
        (e.g. ``"matching_predictions/rf/"``).
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier``. The default ``None``
        leaves the forest unseeded, exactly as before this parameter existed.
    """

    METRIC_COLUMNS = ["precision", "recall", "f1_score", "accuracy"]

    def __init__(self, path_data_pairs, path_mfccs, path_predictions, random_state=None):
        self.path_data_pairs = path_data_pairs
        # The prefix is concatenated as-is, so it must already end with a path
        # separator when it denotes a directory.
        self.p_trn = self.path_data_pairs + "n1_train_pairs.csv"
        self.p_val = self.path_data_pairs + "n1_validation_pairs.csv"
        self.n1_test = self.path_data_pairs + "n1_test_pairs.csv"
        self.n2_test = self.path_data_pairs + "n2_test_pairs.csv"
        self.n3_test = self.path_data_pairs + "n3_test_pairs.csv"
        self.n1_n2_test = self.path_data_pairs + "n1_n2_test_pairs.csv"
        self.n1_n3_test = self.path_data_pairs + "n1_n3_test_pairs.csv"
        self.n2_n3_test = self.path_data_pairs + "n2_n3_test_pairs.csv"
        self.path_mfccs = path_mfccs
        self.path_predictions = path_predictions
        self.random_state = random_state

    def _ensure_output_dir(self):
        """Create the directory part of ``path_predictions`` if it is missing."""
        out_dir = os.path.dirname(self.path_predictions)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator`` rounded to 4 digits, or 0.0 if undefined."""
        if not denominator:
            return 0.0
        return round(numerator / denominator, ndigits=4)

    def samples_labels(self, path):
        """Read a pairs CSV.

        Returns
        -------
        tuple
            ``(df, sample_1, sample_2, labels)`` where the last three are the
            ``.values`` of the columns of the same name (``label`` for labels).
        """
        df = pd.read_csv(path)
        sample_1 = df.sample_1.values
        sample_2 = df.sample_2.values
        labels = df.label.values

        return df, sample_1, sample_2, labels

    def sample_to_mfccs(self, path):
        """Read the MFCC CSV and map each first-column value to the rest of its row.

        If an identifier occurs more than once, the last row wins.
        """
        mfccs_df = pd.read_csv(path)
        # Column-wise extraction instead of iterrows(): one pass, and the
        # feature rows keep their numeric dtype rather than becoming object.
        sample_ids = mfccs_df.iloc[:, 0].to_numpy()
        features = mfccs_df.iloc[:, 1:].to_numpy()
        return dict(zip(sample_ids, features))

    def lst_samples_to_mfccs(self, lst_samples, sample_2_mfccs):
        """Pair every sample with its MFCC row.

        Returns an iterator that yields two tuples: the samples and, in the
        same order, their MFCC rows. Raises ``KeyError`` for a sample missing
        from ``sample_2_mfccs``.
        """
        return zip(*[(sample, sample_2_mfccs[sample]) for sample in lst_samples])

    def combine(self, sample_2_mfccs, p_d_pairs):
        """Build the feature matrix and labels for one pairs CSV.

        Returns
        -------
        tuple
            ``(df_data_pairs, mfcc_fs, y)``; row ``i`` of ``mfcc_fs`` is the
            MFCC row of ``sample_1[i]`` followed by that of ``sample_2[i]``.
        """
        df_data_pairs, sam1, sam2, y = self.samples_labels(p_d_pairs)
        _, mfccs_1 = self.lst_samples_to_mfccs(sam1, sample_2_mfccs)
        _, mfccs_2 = self.lst_samples_to_mfccs(sam2, sample_2_mfccs)

        fs_1 = np.array(mfccs_1)
        fs_2 = np.array(mfccs_2)
        mfcc_fs = np.hstack([fs_1, fs_2])

        assert mfcc_fs.shape[0] == y.shape[0]
        assert mfcc_fs.shape[1] == fs_1.shape[1] * 2

        return df_data_pairs, mfcc_fs, y

    def evaluation_metrics(self, Y, P):
        """Compute binary precision, recall, F1 and accuracy, each rounded to 4 digits.

        Parameters
        ----------
        Y, P : array-like
            True labels and predictions; only the values 0 and 1 are counted.

        Returns
        -------
        tuple
            ``(Precision, Recall, F1s, Accuracy)``. A metric whose denominator
            is zero (e.g. no positive labels or no positive predictions) is
            reported as 0.0 instead of raising ``ZeroDivisionError``.
        """
        # asarray so that plain lists compare element-wise instead of as a whole.
        y_true = np.asarray(Y)
        y_pred = np.asarray(P)

        TP = int(np.count_nonzero((y_true == 1) & (y_pred == 1)))
        TN = int(np.count_nonzero((y_true == 0) & (y_pred == 0)))
        FP = int(np.count_nonzero((y_true == 0) & (y_pred == 1)))
        FN = int(np.count_nonzero((y_true == 1) & (y_pred == 0)))

        Precision = self._safe_ratio(TP, TP + FP)
        Recall = self._safe_ratio(TP, TP + FN)
        # F1 straight from the counts, so the rounding already applied to
        # precision and recall does not leak into it.
        F1s = self._safe_ratio(2 * TP, 2 * TP + FP + FN)
        Accuracy = self._safe_ratio(TP + TN, TP + TN + FP + FN)

        return Precision, Recall, F1s, Accuracy

    def random_forest_model(self, sample_2_mfccs, p_d_pairs):
        """Fit and return a ``RandomForestClassifier`` on the pairs in ``p_d_pairs``."""
        _, t_fs, t_y = self.combine(sample_2_mfccs, p_d_pairs)
        rf = RandomForestClassifier(
            n_estimators=1000,
            max_depth=None,
            max_features=5,
            random_state=self.random_state,
        )
        rf.fit(t_fs, t_y)

        return rf

    def prediction(self, model, sample_2_mfccs, pdata, night=None):
        """Predict one pairs CSV, optionally save the predictions, and score them.

        Parameters
        ----------
        model : object
            Anything with a ``predict(features)`` method.
        sample_2_mfccs : dict
            Lookup produced by ``sample_to_mfccs``.
        pdata : str
            Path of the pairs CSV.
        night : str or None, optional
            When given, the pairs plus a ``predictions`` column are written to
            ``path_predictions + night + ".csv"``. When ``None`` nothing is
            written (previously this raised ``TypeError``).

        Returns
        -------
        tuple
            ``(precision, recall, f1, accuracy)`` from ``evaluation_metrics``.
        """
        df, fs, y = self.combine(sample_2_mfccs, pdata)
        p = model.predict(fs)
        df["predictions"] = p
        if night is not None:
            self._ensure_output_dir()
            df.to_csv(self.path_predictions + night + ".csv", index=False)

        return self.evaluation_metrics(y, p)

    def Execution(self):
        """Train on the training pairs, score every split, and save the metrics table.

        Writes one predictions CSV per split plus ``one_metrics.csv`` under the
        ``path_predictions`` prefix and returns the metrics as a DataFrame
        indexed by ``nights``.
        """
        # Fail on an unusable output location before the expensive fit,
        # not after it.
        self._ensure_output_dir()

        sample_2_mfccs = self.sample_to_mfccs(self.path_mfccs)
        model = self.random_forest_model(sample_2_mfccs, self.p_trn)

        # (row label in the metrics table, pairs CSV, predictions file stem)
        splits = [
            ("n1v", self.p_val, "n1_val"),
            ("n11", self.n1_test, "n1_test"),
            ("n22", self.n2_test, "n2_test"),
            ("n33", self.n3_test, "n3_test"),
            ("n12", self.n1_n2_test, "n1_n2_test"),
            ("n13", self.n1_n3_test, "n1_n3_test"),
            ("n23", self.n2_n3_test, "n2_n3_test"),
        ]
        metrics = {
            label: list(self.prediction(model, sample_2_mfccs, pairs_path, stem))
            for label, pairs_path, stem in splits
        }

        df = pd.DataFrame.from_dict(metrics, orient="index", columns=self.METRIC_COLUMNS)
        df = df.rename_axis("nights")
        df.to_csv(self.path_predictions + "one_metrics.csv")

        return df
        
    
               

In [12]:
# Inputs and outputs for the RandomForest pipeline. p_d and p_p are prefixes
# that the class concatenates verbatim with file names, hence the trailing "/".
p_d = "pairs/"  # prefix of the *_pairs.csv files
p_m = "1_s_mfccs.csv"  # MFCC CSV read by sample_to_mfccs
p_p = "matching_predictions/rf/"  # prefix for prediction and metrics CSVs

In [13]:
# Only stores the paths; nothing is read or trained until Execution() is called.
pr = RandomForest(p_d, p_m, p_p)

In [14]:
# Trains on the n1 training pairs, scores all seven splits and writes the CSVs.
df = pr.Execution()

In [15]:
# Metrics table returned by Execution(): one row per split, indexed by "nights".
df

nights,precision,recall,f1_score,accuracy
n1v,0.9282,0.9578,0.9428,0.9418
n11,0.9172,0.962,0.9391,0.9376
n22,0.9092,0.7749,0.8367,0.8487
n33,0.9056,0.8868,0.8961,0.8972
n12,0.8581,0.3712,0.5182,0.6549
n13,0.7349,0.3454,0.4699,0.6104
n23,0.9214,0.5278,0.6711,0.7414
