# Speaker Classification through Intensity-based Rhythms 

#### Flowchart

In [None]:
+-----------------------+
|       Load Data        |
|           |           |
|           V           |
| +-------------------+ |
| |     intensity.csv  | |
| +-------------------+ |
+-----------------------+
              |
              V
+-----------------------+
|    Split Data into     |
|   Training and Test   |
|           |           |
|           V           |
| +-------------------+ |
| |     train_test     | |
| |      _split       | |
| +-------------------+ |
+-----------------------+
              |
              V
+-----------------------+
|  Feature Engineering  |
|           |           |
|           V           |
| +-------------------+ |
| |    Standardize    | |
| +-------------------+ |
|           |           |
|           V           |
| +-------------------+ |
| |   SelectKBest     | |
| |     f_classif     | |
| +-------------------+ |
+-----------------------+
              |
              V
+------------------------+
|  Train Logistic        |
|   Regression Model     |
|            |           |
|            V           |
| +-------------------+ |
| |    Logistic       | |
| |   Regression      | |
| +-------------------+ |
|            |           |
|            V           |
| +-------------------+ |
| |   Trained Model   | |
| +-------------------+ |
+-----------------------+
              |
              V
+------------------------+
|     Evaluate Model     |
|            |           |
|            V           |
| +-------------------+ |
| | classification    | |
| |     report        | |
| +-------------------+ |
|            |           |
|            V           |
| +-------------------+ |
| |  Precision, Recall| |
| |      F1-score    | |
| +-------------------+ |
+-----------------------+

#### Importing libraries

In [147]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
#import joblib

#### Reading the dataframe and selecting the features and target

In [148]:
# Load the intensity measurements from the CSV file.
df = pd.read_csv('intensity.csv')

# Columns used as model inputs, and the column holding the class label.
feature_columns = ['rPVIm', 'nPVIm', 'rPVIp', 'nPVIp']
target_column = 'speaker'

X = df[feature_columns]
y = df[target_column]

#### Splitting data into training sets and testing sets

In [149]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=49,
)

#### Creating the pipeline for the model

In [182]:
# Chain preprocessing and the classifier so that every step is fitted on the
# training data only and then re-applied unchanged at prediction time.
pipeline = Pipeline([
    # Standardize each feature before selection and model fitting.
    ('scaler', StandardScaler()),
    # Univariate selection scored with the ANOVA F-test. X has four columns,
    # so k=4 currently keeps all of them; lower k to actually drop features.
    ('selector', SelectKBest(f_classif, k=4)),
    # The 'saga' solver shuffles the data while optimizing, so it is seeded
    # (same seed as the train/test split) to make the fitted model reproducible.
    ('logreg', LogisticRegression(C=14.0, penalty='l2', solver='saga',
                                  max_iter=900, random_state=49))
])

#### Training the model and predicting on the test set

In [183]:
# Fit every pipeline step on the training split, then label the held-out rows.
# Pipeline.fit returns the pipeline itself, so the prediction can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

#### Generating a classification report

In [184]:
# Per-class precision / recall / F1 on the held-out test set.
# zero_division=0: when a metric is undefined (e.g. precision for a class the
# model never predicts) score it as 0. Scoring it as 1 would report a perfect
# value for that class and inflate the averaged metrics.
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

              precision    recall  f1-score   support

           1       0.27      0.28      0.27        25
           2       0.06      0.06      0.06        18
           3       0.15      0.38      0.22        16
           4       0.00      0.00      0.00        18
           5       0.17      0.05      0.07        22
           6       0.10      0.20      0.13        15
           7       0.00      0.00      0.00        25
           8       0.21      0.16      0.18        25
           9       0.48      0.67      0.56        21
          10       0.18      0.12      0.15        16
          11       0.17      0.10      0.12        21
          12       0.25      0.10      0.14        21
          13       0.45      0.20      0.28        25
          14       0.21      0.55      0.31        22
          15       0.00      0.00      0.00        22
          16       0.05      0.05      0.05        20
          17       0.76      0.93      0.84        28
          18       0.07    

#### Joblib draft for future maintenance

In [None]:
# Draft only, intentionally disabled: saving and reloading the fitted `pipeline`.
# Enabling it also requires the commented-out `import joblib` in the imports cell.
# NOTE(review): the load call below discards its result; assign it to a variable
# when this is enabled.
#joblib.dump(pipeline, 'intensity-pipeline.joblib')
#joblib.load('intensity-pipeline.joblib')

### Knowledge graph visualization

In [146]:
from graphviz import Digraph

# (node name, shape, fill colour), listed in declaration order.
node_specs = [
    ("Load Data", "rectangle", "#FFCCCC"),
    ("Split Data", "rectangle", "#CCCCFF"),
    ("Feature Engineering", "rectangle", "#CCFFCC"),
    ("Train Model", "rectangle", "#99FF99"),
    ("Evaluate Model", "rectangle", "#CC99FF"),
    ("X, y", "oval", "#FFFFCC"),
    ("Classification Report", "oval", "#FFFFCC"),
]

# Directed (tail, head) connections, listed in declaration order.
edge_specs = [
    ("Load Data", "Split Data"),
    ("Split Data", "Feature Engineering"),
    ("Feature Engineering", "Train Model"),
    ("Train Model", "Evaluate Model"),
    ("Split Data", "X, y"),
    ("Evaluate Model", "Classification Report"),
]

dot = Digraph()

# Every node is drawn filled; only the shape and colour vary.
for node_name, node_shape, node_fill in node_specs:
    dot.node(node_name, shape=node_shape, style="filled", fillcolor=node_fill)

for tail_name, head_name in edge_specs:
    dot.edge(tail_name, head_name)

# Write the graph source and rendered file to disk and open it in a viewer.
dot.render("knowledge_graph.gv", view=True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


'knowledge_graph.gv.pdf'

### Hyperparameter tuning with grid search

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Reload the data and rebuild the same seeded 80/20 split so this cell can be
# run on its own.
df = pd.read_csv('intensity.csv')

X = df[['rPVIm', 'nPVIm', 'rPVIp', 'nPVIp']]
y = df['speaker']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=49)

# The scaler lives inside the estimator being tuned, so during cross-validation
# it is re-fitted on the training folds only. Scaling the whole training set up
# front would let each validation fold influence the scaling statistics
# (data leakage) and bias the model selection.
tuning_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(random_state=49, max_iter=10000)),
])

# Grid over regularization strength and penalty type; 'saga' is used for both
# the L1 and the L2 penalty. The '<step>__<param>' names route each value to
# the 'logreg' step of the pipeline.
param_grid = {
    'logreg__C': np.linspace(0.001, 20.0, 40),
    'logreg__penalty': ['l1', 'l2'],
    'logreg__solver': ['saga'],
}

grid_search = GridSearchCV(tuning_pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# best_model is the whole fitted pipeline (scaler + classifier), so it must be
# given the raw, unscaled features.
best_model = grid_search.best_estimator_
# Drop the 'logreg__' routing prefix so the keys are plain LogisticRegression
# argument names ('C', 'penalty', 'solver').
best_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}

print("Best Parameters:", best_params)

y_pred = best_model.predict(X_test)

# zero_division=0: an undefined metric (e.g. precision for a class that is never
# predicted) counts as 0 instead of a misleading perfect score.
report = classification_report(y_test, y_pred, zero_division=0)
print(report)