In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import librosa
import matplotlib.pyplot as plt

In [3]:
import pandas as pd
import os
import scipy.signal

In [4]:
def area(y, sr):
    # Compute the F0 contour
    f0, voiced_flag, voiced_probs = librosa.pyin(y=y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr, frame_length=1024, hop_length=256)
    f0 = np.nan_to_num(f0)

    # Smooth the F0 contour
    f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))

    # Compute the duration of each segment in seconds
    durations = librosa.frames_to_time(range(len(f0)), sr=sr, hop_length=256)

    # Compute the AFC feature
    afc = sum(ti * fi for ti, fi in zip(durations, f0_smoothed))

    return afc

In [5]:
def area2(y, sr):
    # Compute the F0 contour
    f0, voiced_flag, voiced_probs = librosa.pyin(y=y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr, frame_length=1024, hop_length=256)
    f0 = np.nan_to_num(f0)

    # Smooth the F0 contour using a Hann window with a length of 41 frames
    f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))

    # Compute the root-mean-square (RMS) energy of each segment
    rms_energy = librosa.feature.rms(y=y, hop_length=128)[0]

    # Scale the RMS energy by a factor of 0.1
    rms_energy_scaled = 0.1 * (rms_energy)

    # Compute the duration of each segment in seconds
    durations = librosa.frames_to_time(range(len(f0)), sr=sr, hop_length=256)

    # Compute the EFI feature
    efi = sum(ti * fi * ei for ti, fi, ei in zip(durations, f0_smoothed, rms_energy_scaled))

    return efi

In [6]:

def calculate_vur(y, sr):
    # Compute the short-time Fourier transform (STFT) of the signal
    stft = librosa.stft(y)

    # Compute the power spectrogram
    power = librosa.power_to_db(np.abs(stft)**2)

    # Compute the voiced and unvoiced segments using the Mel-frequency cepstral coefficients (MFCCs)
    mfccs = librosa.feature.mfcc(S=power, sr=sr)
    voiced = np.where(mfccs[0] > np.median(mfccs[0]))[0]
    unvoiced = np.where(mfccs[0] <= np.median(mfccs[0]))[0]

    # Compute the duration of the voiced and unvoiced segments
    voiced_duration = librosa.frames_to_time(len(voiced), sr=sr)
    unvoiced_duration = librosa.frames_to_time(len(unvoiced), sr=sr)

    # Calculate the VUR
    vur = voiced_duration / (voiced_duration + unvoiced_duration)

    return vur


In [7]:
from sklearn.mixture import GaussianMixture
from scipy.signal import medfilt

def calculate_dlh(y, sr):

    spec = librosa.stft(y=y, n_fft=2048, hop_length=256, win_length=1024, window='hann', center=True, pad_mode='reflect')

    # Compute the power spectrogram
    power_spec = np.abs(spec) ** 2

    # Compute the mel spectrogram
    mel_spec = librosa.feature.melspectrogram(S=power_spec)

    # Compute the log mel spectrogram
    log_mel_spec = librosa.amplitude_to_db(mel_spec)

    # Compute the F0 contour
    f0, voiced_flag, voiced_probs = librosa.pyin(y=y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr, frame_length=1024, hop_length=256)
    f0 = np.nan_to_num(f0)

    # Smooth the F0 contour using a Hann window with a length of 41 frames
    f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))

    # Cluster the F0 values using a two-component Gaussian mixture model
    gmm = GaussianMixture(n_components=2, covariance_type='full')
    f0_reshaped = f0_smoothed.reshape(-1, 1)
    gmm.fit(f0_reshaped)
    labels = gmm.predict(f0_reshaped)

    # Compute the mean F0 values for the low and high frequency clusters
    mean_low = np.mean(f0_smoothed[labels == 0])
    mean_high = np.mean(f0_smoothed[labels == 1])

    # Compute the DLH feature as the difference between the mean F0 values of the high and low frequency clusters
    dlh = mean_high - mean_low

    return dlh


In [8]:
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture
from scipy.signal import medfilt

def get_f0_peak_valley(y, sr):
    f0_values, voiced_flag, _ = librosa.pyin(
        y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr)
    f0_values = np.nan_to_num(f0_values)
    # Smooth F0 curve using a median filter
    f0_smoothed = medfilt(f0_values, kernel_size=5)

    #count the no. of nan values in f0_values
    nan_count = np.count_nonzero(np.isnan(f0_values))

    # Compute location and amplitude of F0 peak or valley
    if f0_smoothed.size > 0:
        peak_loc, valley_loc = librosa.util.peak_pick(
        f0_smoothed, 3, 3, 3, 5, 0.5, 5)[:2]


    # else:
    # # handle the case where f0_smoothed is empty or has a size of 0
    #     print("Error: f0_smoothed array is empty or has a size of 0.")
    peak_loc, valley_loc = librosa.util.peak_pick(f0_smoothed, 3, 3, 3, 5, 0.5, 5)[:2]
    print(peak_loc, valley_loc)
    if peak_loc > 0:
        f_amp = f0_smoothed[peak_loc] - np.mean(f0_smoothed)

        f_loc = peak_loc / sr
    elif valley_loc > 0:
        f_amp = np.mean(f0_smoothed) - f0_smoothed[valley_loc]
        f_loc = valley_loc / sr
    else:
        f_amp = 0
        f_loc = 0

    return f_amp, f_loc

In [9]:
import numpy as np

def get_aggregate_stats(y, sr):
    f0, voiced_flag, voiced_probs = librosa.pyin(y=y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr, frame_length=1024, hop_length=256)
    f0 = np.nan_to_num(f0)
    rms_energy = librosa.feature.rms(y=y, hop_length=128)[0]
    rms_energy = np.nan_to_num(rms_energy)

    stats_list = [np.mean(f0), np.median(f0), np.max(f0), np.min(f0), np.var(f0),
                  np.mean(rms_energy), np.median(rms_energy), np.max(rms_energy), np.min(rms_energy), np.var(rms_energy)]

    return stats_list

In [10]:
def generate_features(audio, sr, start, end):
    y = audio[start:end]
    l1 = []
    # area under fundamental frequency curve
    ar = area(y, sr)
    l1.append(ar)
    # energy fundamental frequency integral
    ar2 = area2(y, sr)
    l1.append(ar2) 

    # vur
    vur = calculate_vur(y, sr)
    l1.append(vur)
    # dlh
    dlh = calculate_dlh(y, sr)
    l1.append(dlh)

    # f0 peak and valley
    # famp, floc = get_f0_peak_valley(y, sr)
    # print(famp, floc)
    # duration
    duration_frames = int(len(y) / 10)
    l1.append(duration_frames)

    # aggregate stats
    stats_l = get_aggregate_stats(y, sr)
    for i in range(len(stats_l)):
        l1.append(stats_l[i])
    
    return l1

In [11]:
def generate_features_for_all_words(audio, sr, word_durations):
    features = []
    for start, end in word_durations:
        features.append(generate_features(audio, sr, start, end))
    return features

In [None]:
import pandas as pd
# df = pd.DataFrame(columns=['features'])
feature_vectors = []
for x in range(278,1000):
#  load with 16000 sampling rate
  audio, sr = librosa.load('/content/drive/MyDrive/Movies/'+str(x)+'.wav', sr=16000)


  # read txt file
  start = []
  duration = []
  with open('/content/drive/MyDrive/Movies/'+str(x)+'_A.txt', 'r') as f:
      for line in f:
          line = line.split()
          # convert to float
          start.append(float(line[2]))
          duration.append(float(line[3]))

  # make start and end arrays with index
  start = np.array(start)
  duration = np.array(duration)
  end = start + duration

  #  convert start from seconds to indices of audio
  start = librosa.time_to_samples(start, sr=sr)
  end = librosa.time_to_samples(end, sr=sr)

  word_durations = []
  for i in range(len(start)):
      duration = (start[i], end[i])
      word_durations.append(duration)

  # listen every word
  # import sounddevice
  import time

  l1 = generate_features_for_all_words(audio, sr, word_durations)
    # append [[1,2,3,4],[2,23,34]] to the dataframe
  # df = df.append({'features': l1}, ignore_index=True)
  feature_vectors.append(l1)


df = pd.DataFrame(feature_vectors)
# svaing the dataframe
df.to_csv('teste.csv', index=False)
  # import csv
  # with open('result.csv', 'a') as f:
  #       writer = csv.writer(f)
  #       writer.writerow(l1)

  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  .fit(X)
  return _

In [17]:
#implementing the model for the features generated
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#load the dataset where data is stored as the file name and sentence corresponding to the file name and the sentence is marked as 0,1,0,0 where 0 is for the word which is not the prominent word and 1 is for the word which is the prominet word
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Movies/test2.csv')

x = df.values.tolist()
# convert each element from string to list
X = []
for i in range(len(x)):
    Z = []
    for j in range(len(x[i])):
        # if x[i][j] is string
        if isinstance(x[i][j], str):
            p = x[i][j].strip('][').split(', ')
            Z.append(p)
            # convert each element from string to int
            for k in range(len(p)):
                p[k] = float(p[k])
        else :
            break
    X.append(Z)


df2 = pd.read_csv('/content/drive/MyDrive/Movies/wordmarkings2.csv')
y = df2.values.tolist()
# convert each element from string to list
Y = []
for i in range(len(y)):
    Z = []
    for j in range(len(y[i])):
        # if x[i][j] is 0 or 1
        if y[i][j] == 0 or y[i][j] == 1:
            Z.append(int(y[i][j]))
        else: break
    Y.append(Z)

# print(Y)
# xv_numeric = [ast.literal_eval(x) for x in xv[1:100]]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train = [item for sublist in X_train for item in sublist]
Y_train = [item for sublist in Y_train for item in sublist]

X_trainar = np.array(X_train)
# np.nan_to_num(X_trainar)
nan_count = np.count_nonzero(np.isnan(X_trainar))
X_trainar[np.isnan(X_trainar)] = 0
nan_count = np.count_nonzero(np.isnan(X_trainar))
X_train = X_trainar.tolist()

#train the model

model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

# save the model
pickle.dump(model, open('modelcart2.pkl', 'wb'))




In [13]:
#load the model Random Forest
model = pickle.load(open('model2.pkl', 'rb'))

X_test2 = [item for sublist in X_test for item in sublist]
Y_test2 = [item for sublist in Y_test for item in sublist]

X_trainar = np.array(X_test2)
# np.nan_to_num(X_trainar)
nan_count = np.count_nonzero(np.isnan(X_trainar))
X_trainar[np.isnan(X_trainar)] = 0
nan_count = np.count_nonzero(np.isnan(X_trainar))
X_test2 = X_trainar.tolist()
#predict the model
y_pred = model.predict(X_test2)

print(accuracy_score(Y_test2, y_pred))
print(confusion_matrix(Y_test2, y_pred))
print(classification_report(Y_test2, y_pred))

importances = model.feature_importances_



0.8173076923076923
[[214  13]
 [ 44  41]]
              precision    recall  f1-score   support

           0       0.83      0.94      0.88       227
           1       0.76      0.48      0.59        85

    accuracy                           0.82       312
   macro avg       0.79      0.71      0.74       312
weighted avg       0.81      0.82      0.80       312



In [16]:
#load the model AdaBoost
model = pickle.load(open('modelada2.pkl', 'rb'))

X_test2 = [item for sublist in X_test for item in sublist]
Y_test2 = [item for sublist in Y_test for item in sublist]

X_trainar = np.array(X_test2)
# np.nan_to_num(X_trainar)
nan_count = np.count_nonzero(np.isnan(X_trainar))
X_trainar[np.isnan(X_trainar)] = 0
nan_count = np.count_nonzero(np.isnan(X_trainar))
X_test2 = X_trainar.tolist()
#predict the model
y_pred = model.predict(X_test2)

print(accuracy_score(Y_test2, y_pred))
print(confusion_matrix(Y_test2, y_pred))
print(classification_report(Y_test2, y_pred))

importances = model.feature_importances_




0.7884615384615384
[[207  20]
 [ 46  39]]
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       227
           1       0.66      0.46      0.54        85

    accuracy                           0.79       312
   macro avg       0.74      0.69      0.70       312
weighted avg       0.78      0.79      0.78       312



In [19]:
#load the model cart
model = pickle.load(open('modelcart2.pkl', 'rb'))

X_test2 = [item for sublist in X_test for item in sublist]
Y_test2 = [item for sublist in Y_test for item in sublist]

X_trainar = np.array(X_test2)
# np.nan_to_num(X_trainar)
nan_count = np.count_nonzero(np.isnan(X_trainar))
X_trainar[np.isnan(X_trainar)] = 0
nan_count = np.count_nonzero(np.isnan(X_trainar))
X_test2 = X_trainar.tolist()
#predict the model
y_pred = model.predict(X_test2)

print(accuracy_score(Y_test2, y_pred))
print(confusion_matrix(Y_test2, y_pred))
print(classification_report(Y_test2, y_pred))

importances = model.feature_importances_


0.7051282051282052
[[180  47]
 [ 45  40]]
              precision    recall  f1-score   support

           0       0.80      0.79      0.80       227
           1       0.46      0.47      0.47        85

    accuracy                           0.71       312
   macro avg       0.63      0.63      0.63       312
weighted avg       0.71      0.71      0.71       312

