In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip3 install pydub



In [None]:
!pip install praat-parselmouth



In [None]:
from parselmouth.praat import call

def pitch_bounds(sound):
  """Choose a pitch search range ``(floor, ceiling)`` for ``sound``.

  A first, deliberately wide autocorrelation pitch analysis (floor 50,
  ceiling 500) is run and its mean F0 selects the range that is returned:

  * mean above 170       -> ``(100, 500)``
  * mean below 170       -> ``(50, 300)``
  * anything else        -> ``(50, 500)`` (exactly 170, or a mean that is
    neither above nor below 170, such as NaN)

  Args:
    sound: Object exposing ``to_pitch_ac`` (a ``parselmouth.Sound`` in this
      notebook).

  Returns:
    Tuple ``(pitch_floor, pitch_ceiling)`` of ints.
  """
  split_hz = 170

  # Wide first pass: second argument is the floor, last one the ceiling.
  wide_pitch = sound.to_pitch_ac(
      None, 50, 15, True, 0.03, 0.45, 0.01, 0.35, 0.14, 500
  )
  wide_mean_f0 = call(wide_pitch, "Get mean", 0, 0, "hertz")

  if wide_mean_f0 > split_hz:
    return 100, 500
  if wide_mean_f0 < split_hz:
    return 50, 300
  # Reached for a mean of exactly 170 and for values that fail both
  # comparisons (e.g. NaN): keep the wide range.
  return 50, 500

In [None]:
def measure_pitch(
    voice, floor=50, ceiling=500, method="ac", time_step=0, max_number_of_candidates=15,
    silence_threshold=0.03, voicing_threshold=0.45, octave_cost=0.01,
    octave_jump_cost=0.35, voiced_unvoiced_cost=0.14, unit="Hertz", very_accurate="no",
):
  """Run a Praat pitch analysis on ``voice`` and summarise the F0 track.

  Args:
    voice: Sound object accepted by ``parselmouth.praat.call``.
    floor: Pitch floor passed to the pitch command.
    ceiling: Pitch ceiling passed to the pitch command.
    method: Pitch command to run. The shorthands ``"ac"`` and ``"cc"`` are
      expanded to ``"To Pitch (ac)"`` and ``"To Pitch (cc)"``; any other
      value is forwarded to Praat unchanged.
    time_step, max_number_of_candidates, silence_threshold,
    voicing_threshold, octave_cost, octave_jump_cost, voiced_unvoiced_cost,
    very_accurate: Forwarded positionally to the pitch command.
    unit: Unit used for all four statistics queries.

  Returns:
    Tuple ``(pitch, mean_f0, stdev_f0, min_f0, max_f0)`` where ``pitch`` is
    the object returned by the pitch command.
  """
  # The bare shorthand (which is also the default) is not a Praat command
  # name, so translate it before handing it to `call`.
  shorthand = {"ac": "To Pitch (ac)", "cc": "To Pitch (cc)"}
  command = shorthand.get(method, method)

  pitch = call(
      voice, command, time_step, floor, max_number_of_candidates, very_accurate,
      silence_threshold, voicing_threshold, octave_cost, octave_jump_cost,
      voiced_unvoiced_cost, ceiling,
  )

  # A (0, 0) time range is passed to every query, as in the original code.
  mean_f0 = call(pitch, "Get mean", 0, 0, unit)
  stdev_f0 = call(pitch, "Get standard deviation", 0, 0, unit)
  min_f0 = call(pitch, "Get minimum", 0, 0, unit, "Parabolic")
  max_f0 = call(pitch, "Get maximum", 0, 0, unit, "Parabolic")

  return pitch, mean_f0, stdev_f0, min_f0, max_f0
  

In [None]:
def PitchInfo(Sound):
  """Measure pitch statistics for ``Sound`` using the notebook-wide settings.

  The search range comes from ``pitch_bounds(Sound)``; every other analysis
  parameter is read from the module-level ``default`` dict. For the
  ``"Unit"``, ``"Algorithm"`` and ``"Very Accurate"`` entries the first tuple
  element is used.

  Args:
    Sound: Sound object accepted by ``pitch_bounds`` and ``measure_pitch``.

  Returns:
    Tuple ``(mean_f0, stdev_f0, min_f0, max_f0, pitch_floor, pitch_ceiling)``.
  """
  pitch_floor, pitch_ceiling = pitch_bounds(Sound)

  # Only the statistics are returned; the Pitch object itself is discarded.
  _pitch, mean_f0, stdev_f0, min_f0, max_f0 = measure_pitch(
    Sound,
    floor=pitch_floor,
    ceiling=pitch_ceiling,
    method=default["Algorithm"][0],
    time_step=default["Time Step"],
    max_number_of_candidates=default["Max Number of Candidates"],
    silence_threshold=default["Silence Threshold"],
    voicing_threshold=default["Voicing Threshold"],
    octave_cost=default["Octave Cost"],
    octave_jump_cost=default["Octave Jump Cost"],
    voiced_unvoiced_cost=default["Voiced Unvoiced Cost"],
    unit=default["Unit"][0],
    very_accurate=default["Very Accurate"][0],
  )

  return mean_f0, stdev_f0, min_f0, max_f0, pitch_floor, pitch_ceiling

In [None]:
def IntensityInfo(Sound):
  """Return the average of the intensity contour of ``Sound``.

  The ``minimum_pitch`` argument for ``Sound.to_intensity`` is taken from the
  module-level ``default`` dict. If the key is missing it is registered with
  the value 100; an existing value is respected rather than overwritten, so
  the setting can be tuned from the configuration.

  Args:
    Sound: Object exposing ``to_intensity`` (a ``parselmouth.Sound`` in this
      notebook).

  Returns:
    The value of ``get_average()`` on the resulting intensity object.
  """
  minimum_pitch = default.setdefault("minimum_pitch", 100)
  intensity = Sound.to_intensity(minimum_pitch)
  return intensity.get_average()

In [None]:
def JitterInfo(Sound):
  """Compute five Praat jitter measures for ``Sound``.

  A point process is built with ``"To PointProcess (periodic, cc)"`` using
  the range from ``pitch_bounds(Sound)``; each jitter query is then run on it
  with the same five arguments.

  The query arguments are read from the module-level ``default`` dict. Keys
  that are missing are registered with these values, while existing values
  are left untouched so they can be tuned from the configuration:
  ``start_time`` 0, ``end_time`` 0, ``shortest_period`` 0.0001,
  ``longest_period`` 0.02, ``maximum_period_factor`` 1.3.

  Args:
    Sound: Sound object accepted by ``pitch_bounds`` and
      ``parselmouth.praat.call``.

  Returns:
    Tuple ``(local_jitter, localabsolute_jitter, rap_jitter, ppq5_jitter,
    ddp_jitter)``.
  """
  query_args = (
    default.setdefault("start_time", 0),
    default.setdefault("end_time", 0),
    default.setdefault("shortest_period", 0.0001),
    default.setdefault("longest_period", 0.02),
    default.setdefault("maximum_period_factor", 1.3),
  )
  # Not read by this function; still registered so that code looking the key
  # up after a call keeps finding it.
  default.setdefault("Measure PCA", True)

  pitch_floor, pitch_ceiling = pitch_bounds(Sound)
  point_process = call(
    Sound, "To PointProcess (periodic, cc)", pitch_floor, pitch_ceiling
  )

  # Order matters: it defines the order of the returned tuple.
  jitter_commands = (
    "Get jitter (local)",
    "Get jitter (local, absolute)",
    "Get jitter (rap)",
    "Get jitter (ppq5)",
    "Get jitter (ddp)",
  )
  return tuple(
    call(point_process, command, *query_args) for command in jitter_commands
  )

In [None]:
import librosa
import numpy as np

def AmplitudeInfo(filename):
  """Return the maximum, minimum and mean sample value of an audio file.

  The waveform is whatever ``librosa.load(filename)`` returns with its
  default arguments.

  Args:
    filename: Path of the audio file to load.

  Returns:
    Tuple ``(max_sample, min_sample, mean_sample)``.
  """
  y, _sr = librosa.load(filename)  # the sample rate is not needed here

  # np.mean is vectorized; the builtin sum() would walk the array one element
  # at a time in the interpreter.
  return np.max(y), np.min(y), np.mean(y)

In [None]:
import parselmouth

# Choice lists for the settings that are stored as (selected, choices) pairs.
_UNIT_CHOICES = [
    "Hertz",
    "Hertz (Logarithmic)",
    "mel",
    "logHertz",
    "semitones re 1 Hz",
    "semitones re 100 Hz",
    "semitones re 200 Hz",
    "semitones re 440 Hz",
    "ERB",
]
_ALGORITHM_CHOICES = ["To Pitch (ac)", "To Pitch (cc)"]
_YES_NO_CHOICES = ["yes", "no"]

# Notebook-wide analysis settings, looked up by the feature functions.
# Plain entries hold the value directly; tuple entries hold
# (selected value, list of available choices).
default = {
    "Time Step": 0,
    "Max Number of Candidates": 15,
    "Silence Threshold": 0.03,
    "Voicing Threshold": 0.45,
    "Octave Cost": 0.01,
    "Octave Jump Cost": 0.35,
    "Voiced Unvoiced Cost": 0.14,
    "Unit": ("Hertz", _UNIT_CHOICES),
    "Algorithm": ("To Pitch (ac)", _ALGORITHM_CHOICES),
    "Very Accurate": ("yes", _YES_NO_CHOICES),
}

In [None]:
# MP3 file -> WAV file, then one row of acoustic features per recording.
from pydub import AudioSegment
import os
import pandas as pd

RECORDINGS_DIR = "/content/drive/MyDrive/졸업프로젝트/recordings"

# Scratch WAV file, overwritten for every recording. One absolute path is
# used for both writing and reading so the result does not depend on the
# current working directory.
dst = "/content/test.wav"
filename = dst

FEATURE_COLUMNS = ['name', 'Mean Pitch (F0)', 'Standard Deviation Pitch (F0)', 'Pitch Min (F0)', 'Pitch Max (F0)', 'Pitch Floor', 'Pitch Ceiling',
                   'Mean Intensity (dB)', 'Local Jitter', 'Local Absolute Jitter', 'RAP Jitter', 'ppq5 Jitter', 'ddp Jitter',
                   'Max Amplitude', 'Min Amplitude', 'Average Amplitude']

# Rows are gathered in a list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
feature_rows = []

for voicename in os.listdir(RECORDINGS_DIR):
  recording_name, extension = os.path.splitext(voicename)
  if extension.lower() != ".mp3":
    # Skip anything in the folder that is not an MP3 recording.
    continue

  # Convert the MP3 recording to WAV.
  audSeg = AudioSegment.from_mp3(os.path.join(RECORDINGS_DIR, voicename))
  audSeg.export(dst, format="wav")

  Sound = parselmouth.Sound(dst)
  mean_f0, stdev_f0, min_f0, max_f0, pitch_floor, pitch_ceiling = PitchInfo(Sound)
  mean_intensity = IntensityInfo(Sound)
  local_jitter, localabsolute_jitter, rap_jitter, ppq5_jitter, ddp_jitter = JitterInfo(Sound)
  # Named so that the builtins max() and min() are not shadowed.
  max_amplitude, min_amplitude, average_amplitude = AmplitudeInfo(filename)

  feature_rows.append({
    'name': recording_name,
    'Mean Pitch (F0)': mean_f0,
    'Standard Deviation Pitch (F0)': stdev_f0,
    'Pitch Min (F0)': min_f0,
    'Pitch Max (F0)': max_f0,
    'Pitch Floor': pitch_floor,
    'Pitch Ceiling': pitch_ceiling,
    'Mean Intensity (dB)': mean_intensity,
    'Local Jitter': local_jitter,
    'Local Absolute Jitter': localabsolute_jitter,
    'RAP Jitter': rap_jitter,
    'ppq5 Jitter': ppq5_jitter,
    'ddp Jitter': ddp_jitter,
    'Max Amplitude': max_amplitude,
    'Min Amplitude': min_amplitude,
    'Average Amplitude': average_amplitude,
  })

AudioFeature = pd.DataFrame(feature_rows, columns=FEATURE_COLUMNS)

In [None]:
# Display the per-recording feature table.
AudioFeature

Unnamed: 0,name,Mean Pitch (F0),Standard Deviation Pitch (F0),Pitch Min (F0),Pitch Max (F0),Pitch Floor,Pitch Ceiling,Mean Intensity (dB),Local Jitter,Local Absolute Jitter,RAP Jitter,ppq5 Jitter,ddp Jitter,Max Amplitude,Min Amplitude,Average Amplitude
0,hindi11,117.178817,19.973408,65.406469,256.788079,50,300,70.897742,0.022519,0.000193,0.009891,0.010897,0.029673,0.553026,-0.664893,2.374769e-06
1,hebrew6,270.007496,46.692003,88.248105,402.758425,100,500,71.463766,0.024865,9.2e-05,0.012316,0.013913,0.036948,0.690041,-0.545608,-6.511105e-06
2,hungarian1,205.906566,21.537017,100.655714,417.030397,100,500,72.919763,0.027717,0.000135,0.015121,0.013408,0.045364,0.615578,-0.659404,2.700696e-05
3,hungarian9,151.126001,33.272409,49.450146,282.134531,50,300,69.467123,0.020562,0.000136,0.009484,0.010346,0.028452,0.601537,-0.669348,-7.58457e-06
4,hindi10,220.087738,47.920474,89.354822,369.926904,100,500,72.402773,0.024929,0.000114,0.012362,0.012893,0.037085,0.606594,-0.673412,-2.68527e-06
5,hausa4,121.159696,24.105348,64.350389,247.166693,50,300,71.272996,0.027213,0.000225,0.01174,0.014092,0.035221,0.589712,-0.673285,-1.165253e-06
6,hungarian4,125.91952,23.664881,52.31049,290.171944,50,300,68.798918,0.023049,0.000184,0.009889,0.011263,0.029668,0.392793,-0.671142,-3.689918e-06
7,hungarian7,191.562695,78.603254,95.021973,501.364761,100,500,72.721002,0.024485,0.000132,0.01163,0.011571,0.034891,0.678276,-0.624668,5.55186e-07
8,hindi14,144.609652,19.980029,59.890396,218.173142,50,300,74.139143,0.023547,0.000163,0.011173,0.012762,0.03352,0.598579,-0.661042,7.962834e-06
9,hausa8,120.547651,28.817431,49.252218,244.329014,50,300,71.13594,0.022945,0.000189,0.010092,0.010753,0.030277,0.670295,-0.626929,-4.192201e-08


In [None]:
# Frequency analysis: dB-scaled power spectrogram of /content/test.wav.
import numpy as np

N_FFT = 1024       # also used as the window length
HOP_LENGTH = 512

y, sr = librosa.load("/content/test.wav")

# `freq` holds the STFT of the waveform; its squared magnitude is the power.
freq = librosa.core.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=N_FFT)
power_spectrogram = np.abs(freq) ** 2

# ref=np.max scales the result relative to the largest power value.
log_freq = librosa.power_to_db(power_spectrogram, ref=np.max)

In [None]:
# Display the dB-scaled spectrogram array.
log_freq

array([[-34.761406, -80.      , -65.14072 , ..., -44.52765 , -41.083878,
        -52.888332],
       [-39.414013, -47.95102 , -36.620808, ..., -37.430737, -37.9537  ,
        -46.34778 ],
       [-59.434113, -46.160675, -34.94571 , ..., -29.275846, -31.78643 ,
        -48.893967],
       ...,
       [-80.      , -80.      , -80.      , ..., -80.      , -80.      ,
        -80.      ],
       [-80.      , -80.      , -80.      , ..., -80.      , -80.      ,
        -80.      ],
       [-80.      , -80.      , -80.      , ..., -80.      , -80.      ,
        -80.      ]], dtype=float32)

In [None]:
# Display the waveform samples.
y

array([ 0.00209511,  0.00470292,  0.00439482, ..., -0.00968624,
       -0.01128522, -0.01457994], dtype=float32)