In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
class SampleData:
    """Load the segment/ID table and the predefined sample splits, and label them.

    The class reads one CSV holding ``Segment`` and ``ID`` columns plus five
    split files (``n1_train.csv``, ``n1_validation.csv``, ``n1_test.csv``,
    ``n2_test.csv``, ``n3_test.csv``), each with a ``Segment`` column, and turns
    every split into a pair of shuffled arrays ``(samples, labels)``.
    """

    def __init__(self, path_csv, path_samples):
        """Store the input locations; no file is read here.

        Parameters
        ----------
        path_csv : str
            CSV file containing at least the ``Segment`` and ``ID`` columns.
        path_samples : str
            Prefix that is concatenated (not path-joined) with the split file
            names, so a directory must be given with its trailing separator.
        """
        self.path_csv = path_csv
        self.n1_trn = path_samples + "n1_train.csv"
        self.n1_val = path_samples + "n1_validation.csv"
        self.n1_tst = path_samples + "n1_test.csv"
        self.n2_tst = path_samples + "n2_test.csv"
        self.n3_tst = path_samples + "n3_test.csv"

    def slice_data_frame(self, path):
        """Read the CSV at ``path`` and keep only the Segment and ID columns."""
        df = pd.read_csv(path)
        return df[["Segment", "ID"]]

    def sample_to_ID(self, df):
        """Map each sample to its ID given a two-column (sample, ID) data frame.

        If a sample appears more than once, the last row wins.
        """
        return {sample: ID for sample, ID in df.to_records(index=False)}

    def ID_to_samples(self, sample_2_ID_dict):
        """Group samples per ID, preserving first-seen order of IDs and samples."""
        ID_2_samples = {}
        for sample, ID in sample_2_ID_dict.items():
            ID_2_samples.setdefault(ID, []).append(sample)
        return ID_2_samples

    def get_samples_labels(self, lst_samples, ID_2_samples_dict):
        """Return ``(samples, labels)`` arrays for the given samples.

        The label of a sample is the position of its ID group in
        ``ID_2_samples_dict`` (0 for the first group, 1 for the second, ...).

        Parameters
        ----------
        lst_samples : iterable
            Samples to label; may be empty.
        ID_2_samples_dict : dict
            Mapping ID -> list of samples, as built by ``ID_to_samples``.

        Raises
        ------
        KeyError
            If a sample does not belong to any group.
        """
        samp_2_numericID = {
            sample: label
            for label, group in enumerate(ID_2_samples_dict.values())
            for sample in group
        }
        samples = list(lst_samples)
        labels = [samp_2_numericID[sample] for sample in samples]
        # Built from plain lists so an empty split yields two empty arrays
        # instead of failing while unpacking an empty zip().
        return np.array(samples), np.array(labels, dtype=int)

    def get_samples(self, path):
        """Return the values of the ``Segment`` column of the CSV at ``path``."""
        return pd.read_csv(path).Segment.values

    def labelled_samples(self, seed=None):
        """Load, label and shuffle every split.

        Parameters
        ----------
        seed : int, optional
            When given, shuffling uses a private ``np.random.RandomState(seed)``
            so the result is reproducible. When ``None`` (default) the global
            NumPy random state is used, as before.

        Returns
        -------
        tuple
            Ten arrays in the order: train samples, train labels, validation
            samples, validation labels, n1 test samples, n1 test labels,
            n2 test samples, n2 test labels, n3 test samples, n3 test labels.
            Samples and labels of a split are shuffled with the same permutation.
        """
        df = self.slice_data_frame(self.path_csv)
        ID_2_samples = self.ID_to_samples(self.sample_to_ID(df))

        rng = np.random if seed is None else np.random.RandomState(seed)

        shuffled = []
        # Splits are processed in the fixed order train, validation, n1 test,
        # n2 test, n3 test, so the random draws happen in that same order.
        for path in (self.n1_trn, self.n1_val, self.n1_tst, self.n2_tst, self.n3_tst):
            samples, labels = self.get_samples_labels(self.get_samples(path), ID_2_samples)
            order = np.arange(len(samples))
            rng.shuffle(order)
            shuffled.append(samples[order])
            shuffled.append(labels[order])

        return tuple(shuffled)
    
class RFClassifier(SampleData):
    """Train a random forest on per-sample feature vectors and score each split.

    Feature vectors are read from a CSV whose first column is the sample name
    and whose remaining columns are the features (presumably MFCCs, going by
    the parameter names). Splits and labels come from ``SampleData``.
    """

    def __init__(self, p_mfccs, p_predictions, p_seg_chrs, p_samples):
        """Store the paths and initialise the ``SampleData`` base.

        Parameters
        ----------
        p_mfccs : str
            CSV with the sample name in the first column and features after it.
        p_predictions : str
            Prefix concatenated with output file names (a directory must carry
            its trailing separator).
        p_seg_chrs : str
            CSV passed to ``SampleData`` as ``path_csv``.
        p_samples : str
            Prefix passed to ``SampleData`` as ``path_samples``.
        """
        self.p_mfccs = p_mfccs
        self.p_predictions = p_predictions
        self.p_seg_chrs = p_seg_chrs
        self.p_samples = p_samples
        super().__init__(self.p_seg_chrs, self.p_samples)

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def sample_to_mfccs(self, path):
        """Map each sample name (first CSV column) to its feature row.

        The feature block is converted to one array in a single step instead of
        iterating row by row, so purely numeric feature columns stay numeric
        rather than being upcast to ``object``. If a sample name is repeated,
        the last row wins.
        """
        mfccs_df = pd.read_csv(path)
        names = mfccs_df.iloc[:, 0].tolist()
        features = mfccs_df.iloc[:, 1:].to_numpy()
        return dict(zip(names, features))

    def lst_samples_to_mfccs(self, lst_samples, lst_labels, sample_2_mfccs_dict):
        """Look up the features of every sample.

        Returns
        -------
        tuple
            ``(df, lst_labels, features)`` where ``df`` has the columns
            ``samples`` and ``labels``, ``lst_labels`` is returned unchanged and
            ``features`` stacks the looked-up rows in sample order.

        Raises
        ------
        KeyError
            If a sample has no entry in ``sample_2_mfccs_dict``.
        """
        sams = list(lst_samples)
        # Plain list building keeps an empty input from failing on zip(*[]).
        mfccs = [sample_2_mfccs_dict[sample] for sample in sams]
        df = pd.DataFrame({"samples": sams, "labels": lst_labels})
        return df, lst_labels, np.array(mfccs)

    def random_forest_model(self, samp_2_mfccs_dict, lst_samples, lst_labels,
                            n_estimators=1000, max_features=5, random_state=None):
        """Fit and return a ``RandomForestClassifier`` on the given samples.

        Parameters
        ----------
        samp_2_mfccs_dict : dict
            Mapping sample -> feature row.
        lst_samples, lst_labels : array-like
            Training samples and their labels, aligned by position.
        n_estimators : int, default 1000
            Number of trees.
        max_features : int, default 5
            Number of features considered at each split.
        random_state : int or None, default None
            Seed forwarded to the forest; ``None`` keeps the fit unseeded.
        """
        _, t_y, t_fs = self.lst_samples_to_mfccs(lst_samples, lst_labels, samp_2_mfccs_dict)

        rf = RandomForestClassifier(n_estimators=n_estimators,
                                    max_features=max_features,
                                    random_state=random_state)
        rf.fit(t_fs, t_y)
        return rf

    # Alternative (disabled) grid-search version of random_forest_model. It is
    # the code that writes "best_parameters.npy".
    #
    # def random_forest_model(self, samp_2_mfccs_dict, lst_samples, lst_labels):
    #
    #     _, t_y, t_fs = self.lst_samples_to_mfccs(lst_samples,  lst_labels, samp_2_mfccs_dict)
    #
    #     rf = RandomForestClassifier()
    #
    #     parameter_grid = {"n_estimators":[100, 200, 500, 1000],
    #              "max_features":[6,7,8,10],
    #              "max_depth":[1,2,5,20]}
    #
    #     grid_search = GridSearchCV(estimator = rf, param_grid = parameter_grid, cv = 3)
    #
    #     grid_search.fit(t_fs, t_y)
    #     print("best parameters are: ", grid_search.best_params_)
    #     best_pars = grid_search.best_params_
    #     np.save("best_parameters.npy",best_pars)
    #
    #     return grid_search.best_estimator_

    def predict(self, model, lst_samples, lst_labels, samp_2_mfcc_dict, night=None):
        """Predict the given samples, optionally save the predictions, return accuracy.

        Parameters
        ----------
        model : fitted estimator exposing ``predict``
        lst_samples, lst_labels : array-like
            Samples to score and their true labels, aligned by position.
        samp_2_mfcc_dict : dict
            Mapping sample -> feature row.
        night : str or None, default None
            Output name; predictions are written to
            ``self.p_predictions + night + ".csv"``. With ``None`` nothing is
            written (previously the default crashed on ``str + None``).

        Returns
        -------
        float
            Fraction of samples whose prediction equals the true label.
        """
        df, y, fs = self.lst_samples_to_mfccs(lst_samples, lst_labels, samp_2_mfcc_dict)
        y = np.asarray(y)
        p = model.predict(fs)
        accuracy = np.count_nonzero(y == p) / y.shape[0]
        df["predictions"] = p
        if night is not None:
            out_path = self.p_predictions + night + ".csv"
            self._ensure_parent_dir(out_path)
            df.to_csv(out_path, index=False)
        return accuracy

    def execution(self, random_state=None):
        """Train on the training split, score the other splits, save and return accuracies.

        Writes one predictions CSV per scored split plus
        ``single_night_data_accuracy.csv`` under ``self.p_predictions``.

        Parameters
        ----------
        random_state : int or None, default None
            Seed forwarded to the forest. The split shuffling done by
            ``labelled_samples`` is not affected by this argument.

        Returns
        -------
        pandas.DataFrame
            One row (``accuracy``) with the columns ``night1v``, ``night1``,
            ``night2`` and ``night3``.
        """
        sample_2_mfccs = self.sample_to_mfccs(self.p_mfccs)
        Ts, Ty, Vs, Vy, N1s, N1y, N2s, N2y, N3s, N3y = super().labelled_samples()
        rf = self.random_forest_model(sample_2_mfccs, Ts, Ty, random_state=random_state)

        scored_splits = [
            ("night1v", Vs, Vy),
            ("night1", N1s, N1y),
            ("night2", N2s, N2y),
            ("night3", N3s, N3y),
        ]
        names = [name for name, _, _ in scored_splits]
        accuracies = [
            self.predict(rf, samples, labels, sample_2_mfccs, "single_night_data_" + name)
            for name, samples, labels in scored_splits
        ]

        df = pd.DataFrame.from_dict({"accuracy": accuracies}, orient="index", columns=names)
        df = df.rename_axis("accuracy")
        out_path = self.p_predictions + "single_night_data_accuracy.csv"
        self._ensure_parent_dir(out_path)
        df.to_csv(out_path)
        return df
        

        
    

In [3]:
# Input/output locations handed to RFClassifier(p1, p2, p3, p4) below.
# p1: feature CSV (first column = sample name, remaining columns = features).
p1 = "1_s_mfccs.csv"
# Alternative feature file, currently disabled:
#p1 = "1_s_f_MFCCs.csv"
# p2: prefix for every output CSV; it is string-concatenated with the file
# name, so the trailing "/" is required.
p2 = "classification_predictions/rf/"
# p3: table read for its "Segment" and "ID" columns.
# NOTE(review): the extension is ".cvs", not ".csv" -- confirm it matches the file on disk.
p3 = "segment_2_characts.cvs"
# p4: prefix of the split files (n1_train.csv, n1_validation.csv, n1_test.csv,
# n2_test.csv, n3_test.csv); also concatenated, so keep the trailing "/".
p4 = "all_tr_val_tst/"

In [4]:
# Build the classifier wrapper; the constructor only stores the paths, no file is read yet.
RFC = RFClassifier(p1, p2, p3, p4)

In [5]:
# Fit the forest on the training split, score the validation and three test
# splits, and write the prediction and accuracy CSVs under the p2 prefix.
df = RFC.execution()

In [6]:
# Show the one-row accuracy table (columns night1v, night1, night2, night3).
df

Unnamed: 0_level_0,night1v,night1,night2,night3
accuracy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,0.987327,0.987018,0.194693,0.193597


In [None]:
# Load the best hyperparameters saved by the grid search.
# SECURITY: allow_pickle=True unpickles the file contents, which can execute
# arbitrary code -- only load a "best_parameters.npy" you produced yourself.
pars = np.load("best_parameters.npy", allow_pickle=True)
# np.save wraps a dict in a 0-d object array; unwrap it so `pars` is the dict itself.
if pars.ndim == 0:
    pars = pars.item()

In [None]:
# Display the loaded hyperparameters.
pars

The usual hyperparameters for random forests are: the number of trees (n_estimators), the number of attributes that are randomly selected as candidates at each split when constructing each tree (max_features), and a limit on the depth of each tree (max_depth). In general, the higher the number of trees, the better the performance, although the gains level off while the training cost keeps growing. The number of features to select is by default the square root of the number of attributes (for classification). The randomForest package controls the depth through the minimum number of cases required to perform a split in the tree-construction algorithm, and for classification they suggest 1, that is, no constraint on the depth of the tree. Scikit-learn uses 2 as the default for this parameter (min_samples_split). If you plan to search over this hyperparameter, I think it is wiser to control the minimum number of samples required to split a node, and 1, 2 or 5 seem reasonable values (note that scikit-learn's min_samples_split does not accept integer values below 2).

In [None]:
# Hyperparameter grid explored by GridSearchCV: 4 x 4 x 4 = 64 combinations.
# Every key must be a valid RandomForestClassifier parameter name; the
# original "max_detph" was a typo for "max_depth" and would be rejected.
parameter_grid = {"n_estimators":[100, 200, 500, 1000],
                 "max_features":[6,7,8,10],
                 "max_depth":[1,2,5,20]}

In [None]:
# Base estimator for the grid search; its hyperparameters are overridden by
# each combination taken from parameter_grid.
rf = RandomForestClassifier()

In [None]:
# Build the training matrix the same way RFClassifier.execution does, so this
# cell runs on a fresh kernel instead of relying on `train_features` /
# `train_labels` left over from an earlier session.
sample_2_mfccs = RFC.sample_to_mfccs(RFC.p_mfccs)
labelled_splits = RFC.labelled_samples()
train_samples, train_targets = labelled_splits[0], labelled_splits[1]
train_table, train_labels, train_features = RFC.lst_samples_to_mfccs(
    train_samples, train_targets, sample_2_mfccs
)

# Exhaustive search over parameter_grid with 3-fold cross-validation.
grid_search = GridSearchCV(estimator = rf, param_grid = parameter_grid, cv = 3)
grid_search.fit(train_features, train_labels)
best_grid = grid_search.best_estimator_
# Last expression of the cell, so the winning combination is displayed.
grid_search.best_params_