# AUDIOGRAM PREDICT

This is a project to predict audiogram values

### Part Two: Training Pipeline and Evaluation

## Settings

#### Path

In [158]:
from pathlib import Path
import os

# Sets base path
b_path = Path.home() / 'Development' / 'audiogram'
os.chdir(b_path)
!ls

d_path = b_path / 'dataset'
n_path = b_path / 'notebook'
s_path = b_path / 'scripts'
m_path = b_path / 'model'

dataset  notebook	 README.md		      scripts
model	 pyproject.toml  requirements_experiment.txt  src


#### Install

In [2]:
!pip install --upgrade scikit-learn



In [89]:
!pip install xgboost



In [3]:
!pip install fancyimpute

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting knnimpute>=0.1.0
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting cvxpy
  Downloading cvxpy-1.2.1-cp310-cp310-manylinux_2_24_x86_64.whl (2.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m[36m0:00:01[0m:01[0m
[?25hCollecting cvxopt
  Downloading cvxopt-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting pytest
  Downloading pytest-7.1.2-py3-none-any.whl (297 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.0/297.0 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00

#### Imports

In [6]:
%matplotlib inline

In [438]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import copy
import pickle

from fancyimpute import KNN, IterativeImputer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

#### Configuration

In [10]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
warnings.filterwarnings('ignore')

## Machine Learning Modeling

In [119]:
df_audio_full = pd.read_csv(d_path / 'df_audio_v1.csv')

In [120]:
df_audio = pd.read_csv(d_path / 'df_audio_check_3.csv')

In [101]:
df_audio = (df_audio + 5) / 5

In [114]:
base_cols = ['2k','4k','6k']

In [35]:
def get_closer_5_multi(n):
    r = n % 5
    return n+(5-r) if r>=2.5 else n-r

## 1 Experiment - Regression

In [117]:
x = df_audio[base_cols]
y = df_audio['3k']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=123, shuffle=True)

In [118]:
model = LinearRegression()

In [119]:
model.fit(x_train, y_train)

In [120]:
coefficients = pd.DataFrame(model.coef_, x.columns, columns=['Coeff'])

In [121]:
coefficients.head()

Unnamed: 0,Coeff
2k,0.434244
4k,0.523486
6k,0.07123


In [122]:
preds = model.predict(x_test)

In [131]:
get_5 = np.vectorize(get_closer_5_multi)
preds = get_5(preds)

In [132]:
preds

array([ 0.,  5.,  0., ..., 70.,  5., 10.])

In [135]:
print(f"The Mean Absolute Error is {round(mean_absolute_error(y_test,preds))}")
print(f"The Mean Squared Error is {round(mean_squared_error(y_test,preds))}")
print(f"The Root Mean Squared Error is {round(np.sqrt(mean_squared_error(y_test,preds)))}")
print(f"The Adjusted R-Squared is {round(1 - (1 - r2_score(y_test, preds)) * (len(y) - 1) / (len(y) - x.shape[1] - 1),2)}")

The Mean Absolute Error is 6
The Mean Squared Error is 65
The Root Mean Squared Error is 8
The Adjusted R-Squared is 0.75


In [139]:
count = 0
for pred, gt in zip(list(preds), y_test.to_list()):
    if pred==gt:
        count+=1

In [142]:
count / len(y_test.to_list())

0.31958231081015004

In [143]:
x = df_audio[base_cols]
y = df_audio['3k']

In [146]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=123, shuffle=True)

In [147]:
model = XGBRegressor()

In [149]:
model.fit(x_train, y_train)

In [150]:
preds = model.predict(x_test)

In [151]:
get_5 = np.vectorize(get_closer_5_multi)
preds = get_5(preds)

In [152]:
count = 0
for pred, gt in zip(list(preds), y_test.to_list()):
    if pred==gt:
        count+=1

In [154]:
count / len(y_test.to_list())

0.31828730881928596

In [155]:
base_cols = ['500k','1k','2k','4k','6k','8k']

In [156]:
x = df_audio[base_cols]
y = df_audio['3k']

In [157]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=123, shuffle=True)

In [158]:
model = LinearRegression()

In [159]:
model.fit(x_train, y_train)

In [161]:
coefficients = pd.DataFrame(model.coef_, x.columns, columns=['Coeff'])
coefficients

Unnamed: 0,Coeff
500k,-0.006179
1k,-0.048957
2k,0.45674
4k,0.521516
6k,0.066387
8k,0.012538


In [162]:
preds = model.predict(x_test)

In [163]:
get_5 = np.vectorize(get_closer_5_multi)
preds = get_5(preds)

In [164]:
print(f"The Mean Absolute Error is {round(mean_absolute_error(y_test,preds))}")
print(f"The Mean Squared Error is {round(mean_squared_error(y_test,preds))}")
print(f"The Root Mean Squared Error is {round(np.sqrt(mean_squared_error(y_test,preds)))}")
print(f"The Adjusted R-Squared is {round(1 - (1 - r2_score(y_test, preds)) * (len(y) - 1) / (len(y) - x.shape[1] - 1),2)}")

The Mean Absolute Error is 6
The Mean Squared Error is 65
The Root Mean Squared Error is 8
The Adjusted R-Squared is 0.75


## 2 Experiment - KNN

In [162]:
base_cols = ['2k','4k','6k']

In [163]:
x = df_audio[base_cols]
y = df_audio['3k']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=123, shuffle=True)

In [197]:
knn = KNeighborsRegressor(n_neighbors=7)

In [198]:
knn.fit(X_train, y_train)

In [199]:
knn.score(X_test, y_test)

0.7650631025068291

In [200]:
preds = knn.predict(X_test)

In [201]:
get_5 = np.vectorize(get_closer_5_multi)
preds = get_5(preds)

In [202]:
count=0
for pred, gt in zip(preds, y_test):
    if pred==gt:
        count+=1

In [203]:
count / len(preds)

0.33105050505050504

## 3 Experiment - MICE

In [13]:
df_audio_t = df_audio.iloc[:,2:6]

In [14]:
df_audio_t

Unnamed: 0,2k,3k,4k,6k
0,0,0,5,15
1,10,20,10,5
2,0,0,0,25
3,5,5,10,15
4,5,5,0,10
...,...,...,...,...
299995,10,20,15,20
299996,10,50,55,50
299997,35,55,50,65
299998,5,25,65,30


In [17]:
df_audio_t.loc[len(df_audio_t.index)] = [5,np.nan,10,5]

In [22]:
# Frequencies -128/127 int8
for c in df_audio_t.columns:
    df_audio_t[c] = df_audio_t[c].astype('int8')

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [30]:
mice_imputer = IterativeImputer()

In [31]:
df = mice_imputer.fit_transform(df_audio_t)

In [37]:
get_closer_5_multi(df[-1][1])

5.0

In [27]:
df_audio_t.iloc[-10000:,:]

Unnamed: 0,2k,3k,4k,6k
290001,10,20.0,25.0,25.0
290002,35,35.0,60.0,45.0
290003,45,50.0,55.0,70.0
290004,25,45.0,55.0,50.0
290005,35,65.0,65.0,70.0
...,...,...,...,...
299996,10,50.0,55.0,50.0
299997,35,55.0,50.0,65.0
299998,5,25.0,65.0,30.0
299999,5,5.0,40.0,45.0


In [75]:
df_audio_full_t = df_audio_full.iloc[:,2:6]

In [87]:
df_audio_full_1 = df_audio_full_t[int(len(df_audio_full_t.index)/2):]
df_audio_full_1.reset_index(inplace=True, drop=True)
df_audio_full_2 = df_audio_full_t[:int(len(df_audio_full_t.index)/2)]
df_audio_full_2.reset_index(inplace=True, drop=True)

In [77]:
df_audio_full_2

Unnamed: 0,2k,3k,4k,6k
0,5,15,10,5
1,5,15,5,10
2,20,20,15,20
3,15,20,20,15
4,15,20,35,25
...,...,...,...,...
2819693,5,10,0,20
2819694,5,0,5,15
2819695,0,10,10,15
2819696,5,10,30,25


In [62]:
df = mice_imputer.fit_transform(df_audio_full_1)
get_closer_5_multi(df[-1][1])

5.0

In [78]:
df_audio_full_2['3k'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_audio_full_2['3k'] = np.nan


In [80]:
df_audio_full_1 = pd.concat([df_audio_full_1, df_audio_full_2])

In [81]:
df_audio_full_1.reset_index(inplace=True, drop=True)

In [83]:
df_audio_full_1

Unnamed: 0,2k,3k,4k,6k
0,5,5.0,45,25
1,25,60.0,60,60
2,25,60.0,65,70
3,10,10.0,30,15
4,20,25.0,40,30
...,...,...,...,...
5639391,5,,0,20
5639392,5,,5,15
5639393,0,,10,15
5639394,5,,30,25


In [84]:
df = mice_imputer.fit_transform(df_audio_full_1)
get_closer_5_multi(df[-1][1])

25.0

In [93]:
df_pred = pd.DataFrame(df)
pred_df_audio = df_pred[1].to_list()

In [98]:
len(pred_df_audio[int(len(pred_df_audio)/2):])

2819698

In [107]:
pred_df_audio = list(map(get_closer_5_multi, pred_df_audio[int(len(pred_df_audio)/2):]))

In [102]:
gt_df_audio = df_audio_full_2['3k'].to_list()

In [113]:
count=0
for pred, gt in zip(pred_df_audio, gt_df_audio):
    if pred==gt:
        count+=1

In [115]:
count / len(pred_df_audio)

0.2989688966690759

## Machine Learning Pipeline

In [243]:
base_cols = ['2k','4k','6k']

In [398]:
def get_closer_5_multi(n):
    r = n % 5
    r = n+(5-r) if r>=2.5 else n-r
    return int(r)

In [223]:
# Function to get correct order of Models
def add_columns(df, cols):
    total=0
    count=len(cols)
    for c in cols:
        total+=df[c]
    
    return total / count

def get_pred_order(df, input_cols):
    df_corr = df.corr()
    col_len = len(input_cols)
    order_l = []
    i_cols = copy.deepcopy(input_cols)
    for i, _ in enumerate(range(col_len, 8)):
        df_corr[f'{i}pred'] = df_corr.apply(add_columns, cols=i_cols)
        col_order = df_corr[f'{i}pred'].sort_values(ascending=False).index
        for c in col_order:
            if c not in i_cols:
                order_l.append(c)
                i_cols.append(c)
                break
    
    return order_l

In [204]:
def train_KNN(df, x_cols, y_col):
    x = df[x_cols]
    y = df[y_col]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=123, shuffle=True)
    knn = KNeighborsRegressor(n_neighbors=7)
    knn.fit(X_train, y_train)
    return knn

In [291]:
train_comb = []
model_targets = [c for c in df_audio.columns if c not in base_cols]
y_t = get_pred_order(df_audio, base_cols)[0]
combs = (base_cols, y_t)
train_comb.append(combs)

for target in model_targets:
    new_cols = base_cols + [target]
    while True:
        y_t = get_pred_order(df_audio, new_cols)
        y_t = y_t[0] if y_t else None
        i_cols = copy.deepcopy(new_cols)
        combs = (i_cols, y_t) if y_t else []
        if combs:
            flag = True
            for _i in train_comb:
                if new_cols == _i[0]:
                    flag = False
                    break
            if flag:
                train_comb.append(combs)
                new_cols += [y_t]
            else:
                break
        else:
            break

In [355]:
pickle_models = []
comb = train_comb[0]
pickle_models.append({'x':comb[0], 'y':comb[1], 'model_name':'base_audio_model.joblib'})
for i, comb in enumerate(train_comb):
    if i==0:continue
    model_name_l = []
    x, y = comb
    for i in range(len(x)-1, 2, -1):
        model_name_l.append(x[i])
    model_name = "-".join(model_name_l)
    pickle_models.append({'x':x, 'y':y, 'model_name':f'{model_name}_audio_model.joblib'})

In [356]:
pickle_models

[{'x': ['2k', '4k', '6k'], 'y': '3k', 'model_name': 'base_audio_model.joblib'},
 {'x': ['2k', '4k', '6k', '500k'],
  'y': '3k',
  'model_name': '500k_audio_model.joblib'},
 {'x': ['2k', '4k', '6k', '500k', '3k'],
  'y': '1k',
  'model_name': '3k-500k_audio_model.joblib'},
 {'x': ['2k', '4k', '6k', '500k', '3k', '1k'],
  'y': '8k',
  'model_name': '1k-3k-500k_audio_model.joblib'},
 {'x': ['2k', '4k', '6k', '1k'],
  'y': '3k',
  'model_name': '1k_audio_model.joblib'},
 {'x': ['2k', '4k', '6k', '1k', '3k'],
  'y': '500k',
  'model_name': '3k-1k_audio_model.joblib'},
 {'x': ['2k', '4k', '6k', '3k'],
  'y': '1k',
  'model_name': '3k_audio_model.joblib'},
 {'x': ['2k', '4k', '6k', '8k'],
  'y': '3k',
  'model_name': '8k_audio_model.joblib'},
 {'x': ['2k', '4k', '6k', '8k', '3k'],
  'y': '1k',
  'model_name': '3k-8k_audio_model.joblib'},
 {'x': ['2k', '4k', '6k', '8k', '3k', '1k'],
  'y': '500k',
  'model_name': '1k-3k-8k_audio_model.joblib'}]

In [357]:
with open(m_path / 'models.pkl', 'wb') as f:
    pickle.dump(pickle_models, f)

In [363]:
with open(m_path / 'models.pkl', 'rb') as f:
    pickle_models = pickle.load(f)

def find_audio_model(comb):
    for model in pickle_models:
        x, y, model_name = [model['x'], model['y'], model['model_name']]
        if len(comb)!=len(x):continue
        if all((i in x) for i in comb):
            return x, y, model_name
    
    return None

In [360]:
for i in train_comb:
    comb = i[0]
    model_inf = find_audio_model(comb)
    model = train_KNN(df_audio, model_inf['x'], model_inf['y'])
    joblib.dump(model, m_path / model_inf['model_name'])

## Application

In [430]:
_input = (['2k','4k','6k'], [20,10,5])

In [432]:
orig_col = ['500k', '1k', '2k', '3k', '4k', '6k', '8k']

def get_predictions(_input):
    comb, freqs = _input
    comb_s = len(comb)
    preds = []
    for i in range(comb_s, 7):
        model_inf = find_audio_model(comb)
        if model_inf:
            x, y, model_name = find_audio_model(comb)
        else:
            print('Combination not accepted')
            return None
        model = joblib.load(m_path / model_name)
        pred = get_closer_5_multi(model.predict(np.array([freqs]))[0])
        comb += [y]
        freqs += [pred]
        
    orig_idx = [orig_col.index(i) for i in comb]
    res = [i for _,i in sorted(zip(orig_idx,freqs))]
    return res

In [433]:
get_predictions(_input)

[15, 15, 20, 15, 10, 5, 10]

In [436]:
_input = (['2k','4k','6k','8k'], [20,10,5,10])

In [439]:
get_predictions(_input)

[10, 10, 20, 15, 10, 5, 10]

In [440]:
_input = (['2k','4k','6k','8k','10k'], [20,10,5,10,8])

In [441]:
get_predictions(_input)

Combination not accepted
