## Priority Queue

Builds a priority queue of time series for follow-up analysis, ordered by each series' predicted probability of being an astrophysical source.

#### Import packages and read in pre-processed dataframes/arrays

In [1]:
import sklearn.ensemble
import sklearn.multiclass
import sklearn.model_selection
import numpy as np
import pandas as pd
import os

# Labelled set: features and labels that the classifier below is fitted on.
preProcLabels = np.load('./processed.files/labelled/labels.npy')
preProcFeats = np.load('./processed.files/labelled/features.npy')

# Unlabelled set: features that get scored, plus the dataframe the scores are
# attached to.
# NOTE(review): the later cell assigns one probability per row of `rawfeats`
# onto `unlabelledDf`, so the two are assumed to be row-aligned -- confirm.
rawfeats = np.load('./processed.files/raw/features.npy')
# Unpickling can execute arbitrary code; only read files produced by this pipeline.
unlabelledDf = pd.read_pickle('./processed.files/raw/unlabelledDf')

#### Fit a one-vs-rest random forest classifier on all of the labelled data

In [5]:
# One-vs-rest random forest with library-default hyperparameters.
# NOTE(review): no visible cell reads `randForest`; it is kept only so that any
# cell outside this view that refers to it keeps working.
randForest = sklearn.multiclass.OneVsRestClassifier(sklearn.ensemble.RandomForestClassifier())  # classifier for multiple labels

# Classifier that is actually fitted and used to build the priority queue.
# Hyperparameters are pinned explicitly so the model does not drift if library
# defaults change. Two adjustments keep the call valid on newer scikit-learn
# releases without changing the model:
#   * max_features='sqrt' is what 'auto' meant for a forest *classifier*;
#     'auto' was later removed.
#   * min_impurity_split (previously passed as None, i.e. unused) is omitted
#     because the parameter was later removed.
# random_state is fixed so refitting gives the same forest.
clf = sklearn.multiclass.OneVsRestClassifier(
    estimator=sklearn.ensemble.RandomForestClassifier(
        bootstrap=True,
        class_weight=None,
        criterion='gini',
        max_depth=None,
        max_features='sqrt',
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        n_estimators=80,
        n_jobs=-1,          # parallelise inside each forest ...
        oob_score=False,
        random_state=420,
        verbose=0,
        warm_start=False,
    ),
    n_jobs=1,               # ... and fit the per-class forests one after another
)

# Fit on the complete labelled set (no train/test split in this notebook).
fitmodel = clf.fit(preProcFeats, preProcLabels)

In [3]:
# List the dataframe's column names. The call form with a single argument
# prints the same text under Python 2 and Python 3, whereas the bare print
# statement is a SyntaxError on Python 3.
print(unlabelledDf.columns.values)

['datfile' 'Beam' 'TSID' 'Buffer' 'MJDstart' 'bestDM' 'bestSNR' 'BinFactor'
 'Events' 'DMmax' 'DMmin' 'DMmean' 'DMmedian' 'DMstd' 'SNRmean' 'SNRmedian'
 'SNRstd' 'MJDmax' 'MJDmin' 'MJDstd' 'MJDmean' 'MJDmedian' 'Label'
 'predictLabel' 'filterbank' 'pctZero' 'pctZeroDeriv' 'ofCount' 'ofPct'
 'longestRun0' 'longestRun1' 'longestRun2' 'globtsStatsStd'
 'globtsStatsMax' 'globtsStatsPosCnt' 'globtsStatsMin' 'globtsStatsNegPct'
 'globtsStatsMedian' 'globtsStatsRatio0' 'globtsStatsPosPct'
 'globtsStatsNegCnt' 'globtsStatsRatio1' 'globtsStatsMean'
 'globDedisptsStatsStd' 'globDedisptsStatsMax' 'globDedisptsStatsPosCnt'
 'globDedisptsStatsMin' 'globDedisptsStatsNegPct' 'globDedisptsStatsMedian'
 'globDedisptsStatsRatio0' 'globDedisptsStatsPosPct'
 'globDedisptsStatsNegCnt' 'globDedisptsStatsRatio1'
 'globDedisptsStatsMean' 'windTimeStatsStd0' 'windTimeStatsStd1'
 'windTimeStatsStd2' 'windTimeStatsStd3' 'windTimeStatsStd4'
 'windTimeStatsStd5' 'windTimeStatsStd6' 'windTimeStatsStd7'
 'windTimeSt

#### Predict each time series' probability of belonging to class 9, add it to the dataframe, sort by descending probability, then export the result as a pickle and as a text file

In [4]:
# Score every unlabelled feature row with the fitted model.
pred = fitmodel.predict_proba(rawfeats)

# NOTE(review): the column at index 7 is used as "class 9". Whether index 7
# really corresponds to label 9 depends on how the training labels were
# encoded, which is not visible here -- confirm against the fitted model.
pred9 = pred[:, 7]

# Attach the score to the dataframe (adds/overwrites the column in place).
unlabelledDf['P(class9)'] = pred9

# Highest-probability candidates first.
sortedDf = unlabelledDf.sort_values(by='P(class9)', ascending=False)

# Keep only the columns written to the queue files.
filterbanks = sortedDf[[
    'filterbank',
    'datfile',
    'BinFactor',
    'bestDM',
    'P(class9)',
]]

# Export twice: a pickle of the dataframe and a plain-text table.
filterbanks.to_pickle('./processed.files/raw/priorityqueue')

base_filename = 'priorityqueue.txt'
with open(os.path.join('./processed.files/raw/', base_filename), 'w') as outfile:
    filterbanks.to_string(buf=outfile)