In [1]:
import torch
import librosa
import pandas as pd
import numpy as np
import pylangacq
from pathlib import Path

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
class Paths:
  meta_data = 'datasets/addresso/train/meta_data.csv'
  transcripts = 'datasets/addresso/train/transcription/'
  concatenated = 'datasets/addresso/train/concatenated/'
  normalised = 'datasets/addresso/train/normalised/'

vocab = []

In [3]:
train_df = pd.read_csv(Paths.meta_data)

train_df.head()

Unnamed: 0,ID,age,gender,mmse,class
0,S001,74,male,,0
1,S002,62,female,30.0,0
2,S003,69,female,29.0,0
3,S004,71,female,30.0,0
4,S005,74,female,30.0,0


In [4]:
def get_transcript_features(row):
    chat = pylangacq.read_chat(f'{Paths.transcripts}{row["ID"]}.cha')
    utterances = [utterance for utterance in chat.utterances() if utterance.participant == 'PAR']
    tokens = [token for utterance in utterances for token in utterance.tokens]
    words = [token.word for token in tokens]
    
    # bag of words
    features = np.zeros(1000)
    for word in words:
        if word in vocab:
            features[vocab.index(word)] += 1
        else:
            vocab.append(word)
            features = np.append(features, 1)

    # number of words
    features = np.append(features, len(words))

    return features

In [5]:
def get_pitch(path):
  y, sr = librosa.load(path)
  pitch, _ = librosa.piptrack(y=y, sr=sr)

  return pitch

In [6]:
def get_acoustic_features(row):
  pitch = get_pitch(f'{Paths.concatenated}{row["ID"]}.wav')

  return pd.Series([np.mean(pitch), np.std(pitch), np.max(pitch), np.min(pitch)], index=['pitch_mean', 'pitch_std', 'pitch_max', 'pitch_min'])

In [7]:
def get_demographic_features(row):
  return [row['age'], 0 if row['gender'] == 'male' else 1]

In [8]:
train_df[['pitch_mean', 'pitch_std', 'pitch_max', 'pitch_min']] = train_df.apply(get_acoustic_features, axis=1)

In [9]:
train_df.head()

Unnamed: 0,ID,age,gender,mmse,class,pitch_mean,pitch_std,pitch_max,pitch_min
0,S001,74,male,,0,10.782606,149.675293,3998.401855,0.0
1,S002,62,female,30.0,0,18.939041,182.111618,3999.339844,0.0
2,S003,69,female,29.0,0,6.895372,107.814011,3999.598877,0.0
3,S004,71,female,30.0,0,13.534988,170.697357,3999.235596,0.0
4,S005,74,female,30.0,0,14.547139,175.085495,3999.458008,0.0


In [10]:
X = train_df.drop(columns=['class'])
y = train_df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
def row_to_features(row):
    transcript_features = get_transcript_features(row)
    acoustic_features = get_acoustic_features(row)
    demo_features = get_demographic_features(row)
    return np.concatenate([transcript_features, acoustic_features.values, demo_features])

In [13]:
X_train = np.stack(train_df.apply(row_to_features, axis=1), axis=1)
y_train = train_df['class']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
clf = SVC(kernel='linear')
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(np.mean(scores))

ValueError: Found input variables with inconsistent numbers of samples: [1007, 108]