# Feature Selection with Naive Bayes


In [248]:
import os
import pandas as pd
import numpy as np 
def load_chants(test_chants_file="test-chants.csv",
                train_chants_file="train-chants.csv",
                test_repr_pitch_file="test-representation-pitch.csv",
                train_repr_pitch_file="train-representation-pitch.csv"):
    """Read the chant and pitch-representation CSV splits and merge them.

    Every file is read with its ``id`` column as the index. For both kinds
    of table the train rows are placed before the test rows.

    Returns:
        A ``(chants, pitch_representations)`` pair of DataFrames.
    """
    def _read_by_id(csv_path):
        # All four files share the same layout convention: indexed by 'id'.
        return pd.read_csv(csv_path, index_col='id')

    chants_test = _read_by_id(test_chants_file)
    chants_train = _read_by_id(train_chants_file)
    pitch_test = _read_by_id(test_repr_pitch_file)
    pitch_train = _read_by_id(train_repr_pitch_file)

    merged_chants = pd.concat([chants_train, chants_test])
    merged_pitch = pd.concat([pitch_train, pitch_test])
    return merged_chants, merged_pitch

def prepare_dataset(representation_type="syllables", chants=None, pitch_repr=None):
    """Build the (segments, mode) arrays for one pitch representation.

    Args:
        representation_type: name of the column of the pitch-representation
            table to use as the feature text of each chant.
        chants: optional preloaded chants table (needs a ``mode`` column).
            When omitted it is loaded with ``load_chants()``.
        pitch_repr: optional preloaded pitch-representation table. When
            omitted it is loaded with ``load_chants()``. Passing both tables
            avoids re-reading the CSV files on every call.

    Returns:
        ``(X, y)`` as numpy arrays: ``X`` holds the selected representation
        of every chant, ``y`` the corresponding mode converted to ``str``.

    Raises:
        ValueError: if the two tables differ in length or their IDs do not
            match row by row.
    """
    if chants is None or pitch_repr is None:
        loaded_chants, loaded_pitch_repr = load_chants()
        if chants is None:
            chants = loaded_chants
        if pitch_repr is None:
            pitch_repr = loaded_pitch_repr

    segments_column = pitch_repr[representation_type]
    modes_column = chants['mode']

    # zip() stops at the shorter input, so a length mismatch would otherwise
    # drop rows silently and slip past the row-by-row ID check below.
    if len(pitch_repr) != len(chants):
        raise ValueError(
            "Number of features ({}) and modes ({}) are not equal!".format(
                len(pitch_repr), len(chants)))

    for id_pitches, id_chant in zip(pitch_repr.index, chants.index):
        if id_pitches != id_chant:
            raise ValueError("IDs of features and modes are not equal!")

    X = list(segments_column)
    y = [str(mode) for mode in modes_column]
    return np.array(X), np.array(y)

In [249]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd


def convert_log_prob_to_df_prob(log_prob, class_names, feat_names):
    """Turn a (classes x features) log-probability table into a DataFrame.

    The result has one row per feature (index named ``feature``), one column
    per class, and holds plain probabilities (``exp`` of the input values).
    """
    # Flip the table so that classes become the columns.
    per_feature = np.transpose(np.array(log_prob))

    # Undo the logarithm column by column.
    table = pd.DataFrame(per_feature, columns=class_names).apply(np.exp)

    # Attach the feature names and promote them to the index.
    table['feature'] = feat_names
    return table.set_index('feature')



def print_feature_selections(X_prepared, y_prepared, treshhold = 0.1):
    """Print the per-mode likelihood of the most prominent segments.

    A ``CountVectorizer`` (default settings) followed by ``MultinomialNB`` is
    fitted on all given chants. For each vocabulary entry the estimated
    P(segment | mode) for the modes "1" to "8" is printed as one table row,
    but only when those eight likelihoods sum to more than ``treshhold``.

    Args:
        X_prepared: one string per chant containing its segments.
        y_prepared: the mode label of every chant; the labels "1" to "8"
            must all occur.
        treshhold: minimum sum of the eight likelihoods a segment needs in
            order to be printed. (The misspelt name is kept so existing
            keyword callers keep working.)

    Raises:
        KeyError: if any of the labels "1".."8" is missing from the classes.

    NOTE: with the default ``CountVectorizer`` token pattern only tokens of
    two or more word characters are counted, so single-character segments
    never appear in the table.
    """
    # Flatten both inputs to 1-d. Handing scikit-learn a column-shaped y
    # triggers a DataConversionWarning on every fit.
    segments = pd.Series(np.ravel(X_prepared), name="segment")
    modes = np.ravel(y_prepared)

    # alpha=0 is rejected by older scikit-learn (warning + clip to 1e-10) and
    # may be applied literally by newer releases; state the effective value
    # explicitly so the result is the same everywhere and no warning is
    # raised.
    pipe = Pipeline([('count', CountVectorizer()),
                     ('clf', MultinomialNB(alpha=1e-10))])
    pipe.fit(segments, modes)

    feat_names = pipe['count'].get_feature_names_out()
    probs = convert_log_prob_to_df_prob(pipe['clf'].feature_log_prob_,
                                        pipe['clf'].classes_,
                                        feat_names)

    mode_labels = [str(mode) for mode in range(1, 9)]
    mode_probs = probs[mode_labels]

    print("{:<10}   {}".format("Segment", "     ".join(mode_labels)))

    # Keep only the segments whose likelihood mass exceeds the threshold.
    selected = mode_probs[mode_probs.sum(axis=1) > treshhold]
    for segment, values in zip(selected.index, selected.to_numpy()):
        formatted = " ".join("{:.3f}".format(value) for value in values)
        print("{:<10}:  {}".format(segment, formatted))

### words

In [250]:
# Fit on the "words" representation and print the segment table (default threshold 0.1).
X_prepared, y_prepared = prepare_dataset("words")
print_feature_selections(X_prepared, y_prepared)

Segment      1     2     3     4     5     6     7     8
ff        :  0.007 0.015 0.001 0.008 0.024 0.058 0.001 0.004
gg        :  0.006 0.001 0.013 0.007 0.004 0.005 0.048 0.056
gh        :  0.009 0.002 0.025 0.012 0.013 0.034 0.012 0.015
hg        :  0.024 0.008 0.020 0.012 0.024 0.009 0.009 0.031


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### syllables

In [251]:
# Fit on the "syllables" representation and print the segment table (default threshold 0.1).
X_prepared, y_prepared = prepare_dataset("syllables")
print_feature_selections(X_prepared, y_prepared)

Segment      1     2     3     4     5     6     7     8
cd        :  0.050 0.137 0.001 0.018 0.000 0.011 0.001 0.002
dc        :  0.074 0.079 0.004 0.033 0.002 0.044 0.001 0.002
de        :  0.011 0.026 0.025 0.077 0.001 0.004 0.000 0.001
df        :  0.026 0.057 0.002 0.028 0.003 0.034 0.000 0.003
ed        :  0.017 0.035 0.029 0.038 0.000 0.003 0.001 0.002
ef        :  0.037 0.050 0.028 0.038 0.002 0.011 0.001 0.013
fe        :  0.160 0.169 0.040 0.099 0.004 0.060 0.003 0.032
fg        :  0.049 0.045 0.018 0.028 0.012 0.048 0.020 0.079
gf        :  0.057 0.022 0.104 0.098 0.048 0.106 0.016 0.063
gh        :  0.130 0.033 0.075 0.092 0.076 0.159 0.047 0.127
ghg       :  0.016 0.005 0.018 0.021 0.015 0.033 0.002 0.018
hg        :  0.051 0.012 0.071 0.039 0.066 0.060 0.050 0.099
hk        :  0.011 0.002 0.094 0.012 0.042 0.019 0.032 0.044
jh        :  0.002 0.001 0.029 0.015 0.004 0.004 0.057 0.015
jk        :  0.002 0.000 0.027 0.003 0.017 0.002 0.052 0.022
kj        :  0.009 0.002 0.0

  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### neumes

In [252]:
# Fit on the "neumes" representation and print the segment table (default threshold 0.1).
X_prepared, y_prepared = prepare_dataset("neumes")
print_feature_selections(X_prepared, y_prepared)

Segment      1     2     3     4     5     6     7     8
cd        :  0.049 0.134 0.001 0.018 0.000 0.011 0.001 0.002
dc        :  0.075 0.080 0.004 0.034 0.002 0.052 0.000 0.003
de        :  0.011 0.027 0.024 0.078 0.001 0.005 0.000 0.001
df        :  0.026 0.060 0.002 0.029 0.003 0.033 0.000 0.004
ed        :  0.018 0.036 0.029 0.040 0.001 0.003 0.001 0.003
ef        :  0.036 0.048 0.027 0.039 0.002 0.011 0.001 0.013
fe        :  0.155 0.168 0.039 0.097 0.005 0.057 0.003 0.032
fg        :  0.051 0.046 0.018 0.028 0.015 0.055 0.019 0.077
gf        :  0.059 0.024 0.102 0.095 0.052 0.106 0.016 0.065
gh        :  0.128 0.033 0.079 0.089 0.072 0.150 0.046 0.126
ghg       :  0.017 0.010 0.018 0.021 0.015 0.031 0.002 0.018
hg        :  0.058 0.015 0.073 0.041 0.068 0.062 0.050 0.100
hk        :  0.012 0.002 0.092 0.012 0.042 0.018 0.032 0.045
jh        :  0.003 0.001 0.030 0.015 0.004 0.004 0.056 0.015
jk        :  0.002 0.000 0.028 0.003 0.016 0.001 0.052 0.022
kj        :  0.010 0.002 0.0

  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 2-mer

In [253]:
# Fit on the "2-mer" representation and print the segment table (default threshold 0.1).
X_prepared, y_prepared = prepare_dataset("2-mer")
print_feature_selections(X_prepared, y_prepared)

Segment      1     2     3     4     5     6     7     8
cd        :  0.035 0.085 0.002 0.025 0.000 0.011 0.000 0.002
dc        :  0.037 0.071 0.002 0.023 0.001 0.022 0.000 0.002
dd        :  0.066 0.104 0.004 0.019 0.001 0.009 0.000 0.003
de        :  0.015 0.031 0.012 0.052 0.000 0.005 0.001 0.007
df        :  0.039 0.081 0.004 0.024 0.001 0.024 0.000 0.009
ed        :  0.053 0.066 0.021 0.061 0.002 0.014 0.001 0.007
ef        :  0.031 0.039 0.023 0.045 0.002 0.013 0.001 0.017
fd        :  0.020 0.038 0.007 0.020 0.001 0.019 0.001 0.006
fe        :  0.066 0.087 0.035 0.072 0.002 0.035 0.001 0.013
ff        :  0.038 0.084 0.004 0.032 0.037 0.130 0.001 0.014
fg        :  0.089 0.044 0.034 0.055 0.028 0.094 0.011 0.052
gf        :  0.084 0.040 0.047 0.069 0.046 0.087 0.014 0.054
gg        :  0.030 0.012 0.066 0.049 0.017 0.043 0.053 0.110
gh        :  0.070 0.025 0.092 0.070 0.051 0.084 0.032 0.086
hg        :  0.081 0.024 0.082 0.076 0.073 0.074 0.047 0.105
hh        :  0.065 0.014 0.0

  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 3-mer

In [254]:
# Fit on the "3-mer" representation and print the segment table (default threshold 0.1).
X_prepared, y_prepared = prepare_dataset("3-mer")
print_feature_selections(X_prepared, y_prepared)

Segment      1     2     3     4     5     6     7     8
fed       :  0.048 0.050 0.012 0.040 0.002 0.013 0.001 0.007
fff       :  0.010 0.035 0.001 0.011 0.006 0.057 0.000 0.002
fgf       :  0.032 0.019 0.006 0.019 0.003 0.025 0.000 0.004
fgh       :  0.041 0.011 0.011 0.015 0.016 0.045 0.005 0.028
gfe       :  0.028 0.019 0.023 0.037 0.002 0.008 0.001 0.007
gfg       :  0.033 0.009 0.015 0.016 0.012 0.028 0.008 0.025
ggh       :  0.009 0.004 0.026 0.016 0.006 0.016 0.005 0.026
ghg       :  0.034 0.011 0.031 0.026 0.023 0.039 0.007 0.037
hgf       :  0.038 0.014 0.025 0.028 0.035 0.043 0.009 0.029
hgg       :  0.011 0.002 0.020 0.012 0.007 0.012 0.020 0.035
hgh       :  0.019 0.007 0.023 0.030 0.025 0.017 0.012 0.025
hhg       :  0.029 0.007 0.009 0.015 0.012 0.012 0.006 0.024
kjh       :  0.003 0.001 0.037 0.006 0.012 0.002 0.028 0.021
kkk       :  0.001 0.001 0.037 0.001 0.034 0.003 0.018 0.024


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 4-mer

In [296]:
# Fit on the "4-mer" representation; the threshold is lowered to 0.04 for this run.
X_prepared, y_prepared = prepare_dataset("4-mer")
print_feature_selections(X_prepared, y_prepared, 0.04)

Segment      1     2     3     4     5     6     7     8
efgf      :  0.013 0.009 0.003 0.013 0.000 0.001 0.000 0.001
fedc      :  0.011 0.014 0.001 0.006 0.000 0.007 0.000 0.000
fedd      :  0.020 0.015 0.003 0.005 0.000 0.002 0.000 0.002
fede      :  0.005 0.008 0.006 0.021 0.000 0.001 0.000 0.001
ffed      :  0.007 0.014 0.002 0.012 0.000 0.006 0.000 0.001
fffe      :  0.003 0.018 0.000 0.005 0.000 0.018 0.000 0.000
fgfe      :  0.015 0.011 0.003 0.011 0.000 0.004 0.000 0.001
fghg      :  0.021 0.006 0.008 0.011 0.005 0.022 0.000 0.007
fghh      :  0.013 0.002 0.001 0.003 0.003 0.009 0.002 0.013
gfed      :  0.018 0.011 0.006 0.016 0.000 0.003 0.000 0.003
gfgg      :  0.005 0.001 0.008 0.005 0.002 0.008 0.003 0.009
gfgh      :  0.023 0.003 0.005 0.007 0.007 0.009 0.003 0.016
gghg      :  0.004 0.002 0.009 0.007 0.002 0.008 0.002 0.013
ghgf      :  0.017 0.008 0.013 0.014 0.014 0.025 0.004 0.018
hgfe      :  0.008 0.004 0.011 0.018 0.000 0.003 0.000 0.002
hgff      :  0.005 0.002 0.0

  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 5-mer

In [294]:
# Fit on the "5-mer" representation; the threshold is lowered to 0.02 for this run.
X_prepared, y_prepared = prepare_dataset("5-mer")
print_feature_selections(X_prepared, y_prepared, 0.02)

Segment      1     2     3     4     5     6     7     8
efgfe     :  0.008 0.005 0.002 0.008 0.000 0.000 0.000 0.000
fedcd     :  0.008 0.009 0.000 0.004 0.000 0.001 0.000 0.000
fgfed     :  0.012 0.010 0.001 0.005 0.000 0.002 0.000 0.001
fghgf     :  0.010 0.004 0.006 0.006 0.004 0.018 0.000 0.003
fghhg     :  0.004 0.002 0.000 0.002 0.000 0.002 0.002 0.011
gfghg     :  0.012 0.001 0.004 0.005 0.003 0.006 0.000 0.005
gfghh     :  0.007 0.001 0.001 0.001 0.001 0.001 0.002 0.008
gghgf     :  0.002 0.001 0.004 0.003 0.001 0.008 0.001 0.005
ghgfe     :  0.005 0.003 0.009 0.010 0.001 0.001 0.000 0.002
ghgff     :  0.004 0.001 0.001 0.001 0.009 0.010 0.000 0.003
ghgfg     :  0.007 0.004 0.005 0.003 0.003 0.004 0.002 0.010
ghkkk     :  0.000 0.000 0.018 0.000 0.001 0.000 0.002 0.003
hg        :  0.007 0.000 0.002 0.000 0.001 0.000 0.000 0.010
hgfgg     :  0.003 0.000 0.007 0.003 0.002 0.002 0.003 0.006
hgfgh     :  0.013 0.001 0.004 0.002 0.006 0.004 0.003 0.012
hghgf     :  0.005 0.003 0.0

  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 6-mer

In [288]:
# Fit on the "6-mer" representation; the threshold is lowered to 0.015 for this run.
X_prepared, y_prepared = prepare_dataset("6-mer")
print_feature_selections(X_prepared, y_prepared, 0.015)

Segment      1     2     3     4     5     6     7     8
efgfed    :  0.008 0.003 0.001 0.004 0.000 0.000 0.000 0.000
gfghgf    :  0.005 0.001 0.004 0.003 0.001 0.003 0.000 0.001
ghgfgh    :  0.003 0.001 0.001 0.001 0.003 0.002 0.002 0.005
hg        :  0.006 0.000 0.002 0.000 0.001 0.000 0.000 0.011
hgfghg    :  0.009 0.000 0.002 0.002 0.002 0.002 0.000 0.001


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 7-mer

In [285]:
# Fit on the "7-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("7-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

Segment      1     2     3     4     5     6     7     8
dd        :  0.003 0.008 0.000 0.000 0.000 0.000 0.000 0.000
fghgf     :  0.001 0.000 0.000 0.000 0.000 0.010 0.000 0.000
gf        :  0.001 0.000 0.000 0.000 0.001 0.011 0.000 0.000
ggllmlk   :  0.000 0.000 0.000 0.000 0.000 0.000 0.010 0.000
ghgf      :  0.001 0.000 0.000 0.000 0.000 0.010 0.000 0.000
hfghgf    :  0.000 0.000 0.000 0.000 0.000 0.011 0.000 0.000
hg        :  0.006 0.000 0.002 0.001 0.001 0.000 0.000 0.011
hgf       :  0.001 0.000 0.000 0.000 0.001 0.013 0.000 0.000
hhfghgf   :  0.000 0.000 0.000 0.000 0.000 0.010 0.000 0.000
hhgfghg   :  0.008 0.000 0.001 0.001 0.000 0.000 0.000 0.000
jh        :  0.000 0.000 0.004 0.000 0.000 0.000 0.006 0.000
jkhg      :  0.000 0.000 0.000 0.000 0.001 0.000 0.000 0.009
khg       :  0.000 0.000 0.002 0.000 0.000 0.000 0.000 0.008
kjh       :  0.000 0.000 0.005 0.000 0.000 0.000 0.007 0.000


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 8-mer

In [284]:
# Fit on the "8-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("8-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

Segment      1     2     3     4     5     6     7     8
fghgf     :  0.001 0.000 0.000 0.000 0.000 0.010 0.000 0.000
gf        :  0.001 0.000 0.000 0.000 0.001 0.011 0.000 0.000
hfghgf    :  0.000 0.000 0.000 0.000 0.000 0.012 0.000 0.000
hg        :  0.006 0.000 0.003 0.000 0.000 0.000 0.000 0.011
hgf       :  0.001 0.000 0.000 0.000 0.001 0.010 0.000 0.000
hhfghgf   :  0.000 0.000 0.000 0.000 0.000 0.014 0.000 0.000
jh        :  0.000 0.000 0.004 0.000 0.000 0.000 0.005 0.000
khg       :  0.000 0.000 0.002 0.000 0.001 0.000 0.000 0.009
kjh       :  0.000 0.000 0.004 0.000 0.000 0.000 0.006 0.000


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 9-mer

In [283]:
# Fit on the "9-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("9-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


Segment      1     2     3     4     5     6     7     8
fghgf     :  0.001 0.000 0.000 0.000 0.000 0.009 0.000 0.000
gf        :  0.001 0.000 0.000 0.000 0.002 0.009 0.000 0.000
ghgf      :  0.001 0.000 0.000 0.000 0.000 0.009 0.000 0.000
hfghgf    :  0.000 0.000 0.000 0.000 0.000 0.010 0.000 0.000
hg        :  0.006 0.000 0.001 0.000 0.000 0.000 0.000 0.010
hgf       :  0.001 0.000 0.000 0.000 0.000 0.015 0.000 0.000
hhfghgf   :  0.000 0.000 0.000 0.000 0.000 0.010 0.000 0.000
jh        :  0.000 0.000 0.005 0.000 0.000 0.000 0.005 0.000
khg       :  0.000 0.000 0.002 0.000 0.001 0.000 0.000 0.009
kjh       :  0.000 0.000 0.005 0.000 0.000 0.000 0.006 0.000


### 10-mer

In [282]:
# Fit on the "10-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("10-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

Segment      1     2     3     4     5     6     7     8
gf        :  0.001 0.000 0.000 0.000 0.002 0.013 0.000 0.000
ghgf      :  0.001 0.000 0.000 0.000 0.000 0.012 0.000 0.000
hg        :  0.007 0.000 0.002 0.000 0.001 0.000 0.000 0.009
hgf       :  0.001 0.000 0.000 0.000 0.001 0.010 0.000 0.000
hhfghgf   :  0.000 0.000 0.000 0.000 0.000 0.012 0.000 0.000
jh        :  0.000 0.000 0.004 0.000 0.000 0.000 0.006 0.000
kjh       :  0.000 0.000 0.004 0.000 0.000 0.000 0.007 0.000


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 11-mer

In [281]:
# Fit on the "11-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("11-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

Segment      1     2     3     4     5     6     7     8
dd        :  0.003 0.009 0.000 0.000 0.000 0.000 0.000 0.000
fghgf     :  0.001 0.000 0.000 0.000 0.000 0.010 0.000 0.000
fhhfghgf  :  0.000 0.000 0.000 0.000 0.000 0.010 0.000 0.000
gf        :  0.001 0.000 0.000 0.000 0.001 0.010 0.000 0.000
hg        :  0.006 0.000 0.001 0.000 0.001 0.000 0.001 0.011
kh        :  0.000 0.000 0.000 0.000 0.012 0.000 0.000 0.000
khg       :  0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.009
kjh       :  0.000 0.000 0.005 0.000 0.000 0.000 0.005 0.000


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 12-mer

In [280]:
# Fit on the "12-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("12-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

Segment      1     2     3     4     5     6     7     8
gf        :  0.001 0.000 0.000 0.000 0.002 0.009 0.000 0.000
hfghgf    :  0.000 0.000 0.000 0.000 0.000 0.012 0.000 0.000
hg        :  0.006 0.000 0.002 0.000 0.000 0.000 0.000 0.010
hgf       :  0.001 0.000 0.000 0.000 0.000 0.011 0.000 0.000
hhfghgf   :  0.000 0.000 0.000 0.000 0.000 0.013 0.000 0.000
jh        :  0.000 0.000 0.006 0.000 0.001 0.000 0.005 0.000
khg       :  0.000 0.000 0.002 0.000 0.000 0.000 0.000 0.008
kjh       :  0.000 0.000 0.004 0.000 0.000 0.001 0.006 0.000


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 13-mer

In [279]:
# Fit on the "13-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("13-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

Segment      1     2     3     4     5     6     7     8
fghgf     :  0.001 0.000 0.000 0.000 0.000 0.017 0.000 0.000
fhhfghgf  :  0.000 0.000 0.000 0.000 0.000 0.010 0.000 0.000
gf        :  0.001 0.000 0.000 0.000 0.002 0.009 0.000 0.000
ghgf      :  0.001 0.000 0.000 0.000 0.001 0.015 0.000 0.000
hg        :  0.007 0.000 0.003 0.000 0.001 0.000 0.000 0.009
jh        :  0.000 0.000 0.005 0.000 0.000 0.000 0.006 0.000
kh        :  0.000 0.000 0.000 0.000 0.010 0.000 0.000 0.000
khg       :  0.000 0.000 0.002 0.000 0.001 0.000 0.000 0.009
kjh       :  0.000 0.000 0.004 0.000 0.001 0.000 0.005 0.000
kkljkh    :  0.000 0.000 0.000 0.000 0.010 0.000 0.000 0.000


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 14-mer

In [278]:
# Fit on the "14-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("14-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

Segment      1     2     3     4     5     6     7     8
fghgf     :  0.001 0.000 0.000 0.000 0.000 0.011 0.000 0.000
gf        :  0.001 0.000 0.000 0.000 0.001 0.010 0.000 0.000
ghgf      :  0.001 0.000 0.000 0.000 0.000 0.012 0.000 0.000
hfghgf    :  0.000 0.000 0.000 0.000 0.000 0.013 0.000 0.000
hg        :  0.006 0.000 0.002 0.000 0.000 0.000 0.000 0.010
hgf       :  0.001 0.000 0.000 0.000 0.001 0.015 0.000 0.000
hhfghgf   :  0.000 0.000 0.000 0.000 0.000 0.011 0.000 0.000
kkljkh    :  0.000 0.000 0.000 0.000 0.010 0.000 0.000 0.000


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 15-mer

In [273]:
# Fit on the "15-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("15-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

Segment      1     2     3     4     5     6     7     8
gf        :  0.001 0.000 0.000 0.000 0.003 0.016 0.000 0.000
ghgf      :  0.001 0.000 0.000 0.000 0.000 0.011 0.000 0.000
hfghgf    :  0.000 0.000 0.000 0.000 0.000 0.011 0.000 0.000
hg        :  0.006 0.000 0.002 0.001 0.000 0.000 0.000 0.010
hgf       :  0.001 0.000 0.000 0.000 0.000 0.013 0.000 0.000
hhfghgf   :  0.000 0.000 0.000 0.000 0.000 0.011 0.000 0.000
jh        :  0.000 0.000 0.004 0.000 0.000 0.000 0.005 0.000
khg       :  0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.008
kjh       :  0.000 0.000 0.004 0.000 0.000 0.000 0.006 0.000


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### 16-mer

In [272]:
# Fit on the "16-mer" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("16-mer")
print_feature_selections(X_prepared, y_prepared, 0.01)

Segment      1     2     3     4     5     6     7     8
fghgf     :  0.001 0.000 0.000 0.000 0.000 0.010 0.000 0.000
gf        :  0.001 0.000 0.000 0.000 0.001 0.015 0.000 0.000
hg        :  0.005 0.000 0.004 0.000 0.000 0.000 0.000 0.010
hgf       :  0.001 0.000 0.000 0.000 0.001 0.011 0.000 0.000
jh        :  0.000 0.000 0.005 0.000 0.001 0.000 0.006 0.000
kh        :  0.000 0.000 0.000 0.000 0.010 0.000 0.000 0.000
kjh       :  0.000 0.000 0.005 0.000 0.000 0.000 0.005 0.000


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### poisson-3

In [268]:
# Fit on the "poisson-3" representation and print the segment table (default threshold 0.1).
X_prepared, y_prepared = prepare_dataset("poisson-3")
print_feature_selections(X_prepared, y_prepared)

Segment      1     2     3     4     5     6     7     8
ff        :  0.011 0.026 0.002 0.011 0.016 0.047 0.000 0.005
fg        :  0.027 0.012 0.010 0.017 0.008 0.028 0.003 0.016
gf        :  0.027 0.012 0.015 0.021 0.016 0.037 0.005 0.017
gg        :  0.010 0.004 0.023 0.015 0.005 0.012 0.020 0.037
gh        :  0.023 0.008 0.032 0.023 0.016 0.027 0.010 0.027
hg        :  0.029 0.007 0.026 0.024 0.024 0.025 0.016 0.040
kk        :  0.001 0.000 0.028 0.001 0.035 0.005 0.016 0.023


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### poisson-5

In [276]:
# Fit on the "poisson-5" representation; the threshold is lowered to 0.04 for this run.
X_prepared, y_prepared = prepare_dataset("poisson-5")
print_feature_selections(X_prepared, y_prepared, 0.04)

Segment      1     2     3     4     5     6     7     8
gf        :  0.007 0.003 0.003 0.005 0.004 0.018 0.001 0.004
gh        :  0.006 0.003 0.009 0.007 0.004 0.006 0.002 0.007
hg        :  0.011 0.002 0.007 0.006 0.007 0.006 0.004 0.018
hgf       :  0.006 0.002 0.003 0.005 0.005 0.017 0.001 0.004


  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


### poisson-7

In [277]:
# Fit on the "poisson-7" representation; the threshold is lowered to 0.01 for this run.
X_prepared, y_prepared = prepare_dataset("poisson-7")
print_feature_selections(X_prepared, y_prepared, 0.01)

  y = column_or_1d(y, warn=True)
  % _ALPHA_MIN


Segment      1     2     3     4     5     6     7     8
dd        :  0.004 0.008 0.000 0.001 0.000 0.000 0.000 0.000
fed       :  0.004 0.002 0.001 0.002 0.000 0.001 0.000 0.000
ff        :  0.001 0.002 0.000 0.001 0.002 0.006 0.000 0.000
fghg      :  0.006 0.001 0.001 0.001 0.000 0.002 0.000 0.001
fghgf     :  0.003 0.000 0.001 0.001 0.000 0.010 0.000 0.000
gf        :  0.002 0.001 0.001 0.001 0.003 0.014 0.000 0.001
gg        :  0.001 0.000 0.002 0.001 0.000 0.000 0.004 0.006
gh        :  0.002 0.001 0.003 0.001 0.001 0.002 0.001 0.002
ghg       :  0.008 0.001 0.003 0.002 0.001 0.002 0.000 0.002
ghgf      :  0.002 0.000 0.001 0.001 0.001 0.010 0.000 0.002
hg        :  0.007 0.001 0.004 0.002 0.002 0.002 0.001 0.011
hgf       :  0.003 0.001 0.001 0.001 0.003 0.011 0.000 0.001
jh        :  0.000 0.000 0.004 0.000 0.000 0.000 0.006 0.001
jkhg      :  0.000 0.000 0.000 0.000 0.001 0.000 0.000 0.009
khg       :  0.000 0.000 0.002 0.000 0.000 0.000 0.000 0.010
kjh       :  0.000 0.000 0.0