<a href="https://colab.research.google.com/github/k7sung/clap2choir/blob/master/sync.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Song to process; selects the inputs/<song>/, scaled/<song>/ and outputs/<date>/<song>/ folders.
SONG_NAME = "A Promise" #@param ["A Promise", "On Eagle's Wings"]

# Folder (relative to the Drive mount point) that holds inputs/, scaled/ and outputs/.
SHARED_FOLDER_PATH = 'Shared drives/CR ESSC Virtual Choir' #@param {type:"string"}
# Glob patterns used to pick up video and audio recordings. Matching is done
# with glob, so the patterns are compared against file names as written.
VID_FILE_GLOBS = ["*.mov","*.MOV", "*.mp4"]
# "*.aac" is the actual AAC extension; the original "*.acc" pattern is kept so
# any file that was (mis)named that way is still picked up.
AUD_FILE_GLOBS = ["*.m4a", "*.mp3", "*.aac", "*.acc", "*.wav"]

# Output frame size as "<width>x<height>".
VIDEO_RES = "1920x1080" #@param ["1920x1080", "960x540", "540x360"] 
# SYNC_MODE = "Auto" #@param["Auto", "Clap"]

In [0]:
## OPTIONAL PARAMS ##
# Time zone used to compute the MMDD date stamp in the output folder name.
TIME_ZONE = 'US/Central' #@param ["US/Central", "Europe/London", "Asia/Taipei"] {allow-input:true}
# When True, each cached clip's audio is also written to <output>/solo/.
OUTPUT_SOLO = True #@param {type:"boolean"}
# When True, a two-channel "clip vs. full mix" file is written per clip to <output>/compare/.
OUTPUT_COMPARE = True #@param {type:"boolean"}
# Cached videos are resized to a height of (frame height / this ratio).
VID_DOWNSCALE_RATIO =  6#@param {type:"integer"}
# When True, the fine-tuned start estimate is also moved back by four beat
# intervals (see estimate_start), which keeps the claps in the trimmed clip.
INCLUDE_CLAPS = False #@param {type:"boolean"}
#PREPEND_METRONOME = False #@param {type:"boolean"}

# When True, the tiled (grid) video is rendered.
OUTPUT_TILED_VID = True #@param {type:"boolean"}
# Subtitle file name inside the raw input folder; empty string means no subtitle.
SUBTITLE_FILENAME = "" #@param {type:"string"}
# Derived flag: a subtitle strip is reserved in the tiled video only when a file name was given.
RENDER_SUBTITLE = SUBTITLE_FILENAME != ""


In [0]:
import pytz
from datetime import datetime, timezone
# Date stamp (MMDD) in the configured time zone; used in the output folder name.
tz = pytz.timezone(TIME_ZONE)
now = datetime.now(tz)
TODAY_DATE = now.strftime("%m%d")

# Anchor the shared folder under the Colab Drive mount point. The prefix is
# only added when it is not already there, so re-running this cell on a live
# kernel does not produce "/content/drive//content/drive/...".
DRIVE_MOUNT_ROOT = "/content/drive/"
if not SHARED_FOLDER_PATH.startswith(DRIVE_MOUNT_ROOT):
    SHARED_FOLDER_PATH = DRIVE_MOUNT_ROOT + SHARED_FOLDER_PATH
# Raw uploads, the trimmed/downscaled cache, and today's output folder.
RAW_PATH = SHARED_FOLDER_PATH + f"/inputs/{SONG_NAME}/"
INPUT_PATH = SHARED_FOLDER_PATH + f"/scaled/{SONG_NAME}/"
# INPUT_PATH = f"/content/scaled/{SONG_NAME}/"
OUTPUT_PATH = SHARED_FOLDER_PATH + f"/outputs/{TODAY_DATE}/{SONG_NAME}/"
# OUTPUT_PATH = f"/content/outputs/{TODAY_DATE}/{SONG_NAME}/"
# Output frame width and height in pixels, parsed from "<width>x<height>".
W,H = [int(t) for t in VIDEO_RES.split("x")]




### would be nice:
* If output_video option is not included, don't generate scaled video
* prepend metronome option
* auto sync option
* Loudness normalization (currently only peak normalization is applied); ffmpeg already supports this, e.g. via its `loudnorm` filter
* Easier mounting mechanism of Google Drive?
* Better clap detection that also uses amplitude and time since last peak
* Noise removal









In [0]:
from google.colab import drive
# Prerequisite: the shared folder must first be added to your own Drive
# ("Add to my drive" in Google Drive).
# NOTE(review): the mount call below is commented out, so the mkdir commands
# presumably rely on Drive already being mounted at /content/drive — confirm.
# drive.mount("/content/drive")
#%cd "$SHARED_FOLDER_PATH"
# Create the cache folder and the output folder tree (no error if they exist).
%mkdir -p "$INPUT_PATH"
%mkdir -p "$OUTPUT_PATH"
%mkdir -p "$OUTPUT_PATH"/compare
%mkdir -p "$OUTPUT_PATH"/solo


### Find videos to resize and trim

In [0]:
import os
from os.path import getmtime
from pathlib import Path

# Snapshot the raw uploads and the already-processed cache.
raw_paths = list(Path(RAW_PATH).iterdir())
cached_paths = list(Path(INPUT_PATH).iterdir())
# Cached file name -> modification time of the cached copy.
cached_files = {cached.name: getmtime(cached) for cached in cached_paths}

# A raw file needs (re)processing when it has no cached copy yet, or when it
# is newer than its cached copy. Names starting with '_' and ".sub" files are
# never processed.
to_process = [
    raw.name
    for raw in raw_paths
    if not raw.name.startswith('_')
    and not raw.name.endswith(".sub")
    and (raw.name not in cached_files or getmtime(raw) > cached_files[raw.name])
]
# Cached files whose raw source no longer exists.
to_delete = cached_files.keys() - {raw.name for raw in raw_paths}

print(to_process)
print(to_delete)

In [0]:
# Drop cached files whose raw source has disappeared.
for stale_name in to_delete:
  print("removing ", stale_name)
  os.remove(f"{INPUT_PATH}{stale_name}")


In [0]:
import numpy as np
from numpy import mean, median, std, var, diff, correlate as corr, zeros, transpose
from moviepy.editor import *
from moviepy.audio.AudioClip import AudioArrayClip

import glob
import os
from os.path import basename
import matplotlib.pyplot as plt
from itertools import combinations as comb
from random import shuffle, randint
import shutil



### function to find the clapping sounds ###

In [0]:
def get_regular_patterns(ts):
    """Pick the four entries of ``ts`` whose spacing is the most even.

    Every 4-element combination of ``ts`` (taken in index order) is scored by
    the variance of its successive differences; the combination with the
    lowest variance wins, ties going to the smallest index tuple.

    Raises ValueError (from ``min``) when ``ts`` has fewer than four entries.
    """
    def spacing_score(indices):
        picked = [ts[i] for i in indices]
        # Same [variance, indices] ordering key as comparing the pairs directly.
        return [var(diff(picked)), indices]

    best_indices = min(comb(range(len(ts)), 4), key=spacing_score)
    return [ts[i] for i in best_indices]

def get_tops(peak_ts, peak_vals, d_peak_vals, top_n, val_weight=0.0, delta_weight=1.0):
    """Return the ``top_n`` highest-scoring peaks as ``(time, score)`` pairs.

    The score of a peak is ``val_weight*value + delta_weight*delta``. With the
    defaults (0.0 and 1.0) only the change relative to the previous peak
    counts, which is the original behavior; pass other weights to let the
    absolute amplitude contribute as well.

    Parameters:
        peak_ts: time of each peak.
        peak_vals: value of each peak.
        d_peak_vals: change of each peak value relative to the previous one.
        top_n: maximum number of pairs to return.
        val_weight: weight of the peak value in the score.
        delta_weight: weight of the peak-to-peak change in the score.

    Returns:
        A list of at most ``top_n`` ``(time, score)`` tuples, highest score
        first; equal scores keep their input order.
    """
    scored = [(t, val_weight*v + delta_weight*dv)
              for t, v, dv in zip(peak_ts, peak_vals, d_peak_vals)]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]

def _scan_chunk_peaks(audio, search_secs, c_dur):
    """Locate the loudest sample of each ``c_dur``-second chunk of ``audio``.

    Reads at most ``search_secs`` seconds from the start of ``audio`` and
    stops early if the audio runs out. Only the first channel is inspected.

    Returns ``(peak_ts, peak_vals)``: per chunk, the time in seconds (from the
    start of ``audio``) and the value of its largest sample.
    """
    peak_ts, peak_vals = [], []
    chunk_iter = audio.iter_chunks(chunk_duration=c_dur)
    samples_before = 0  # samples consumed by the chunks read so far
    for _ in range(int(search_secs/c_dur)):
        chunk = next(chunk_iter, None)
        if chunk is None or len(chunk) == 0:
            break  # audio is shorter than the search window
        channel = chunk[:,0]
        peak_idx = channel.argmax()
        # Absolute sample position divided by the sample rate gives seconds.
        peak_ts.append((samples_before + peak_idx)/audio.fps)
        peak_vals.append(channel[peak_idx])
        samples_before += len(channel)
    return peak_ts, peak_vals


def estimate_start(audio, search_secs=20, is_finetune=False):
    """Estimate where a recording should start, based on its four claps.

    Parameters:
        audio: clip exposing ``iter_chunks``, ``fps`` and ``filename``.
        search_secs: how many seconds from the start of ``audio`` to scan.
        is_finetune: True for the second pass, which runs on audio already
            trimmed by a first estimate.

    Returns:
        ``(start_est, est_intv, to_plot)`` where ``start_est`` is the
        estimated start in seconds relative to the start of ``audio``,
        ``est_intv`` is the mean interval between the detected claps, and
        ``to_plot`` is a zero-argument callable that plots the detection.

    Raises:
        ValueError: if fewer than four chunks could be read from ``audio``.
    """
    c_dur = 0.1 #every chunk is 0.1 seconds
    peak_ts, peak_vals = _scan_chunk_peaks(audio, search_secs, c_dur)

    # Match on the bare file name: audio.filename carries the directory, so a
    # prefix test or cheat_list lookup on the full path would never match.
    fname = basename(audio.filename)
    if len(peak_ts) < 4:
        raise ValueError(
            f"{fname}: need at least 4 audio chunks to locate the claps, got {len(peak_ts)}")

    # Rise of each chunk peak over the previous chunk's peak (0 for the first).
    d_peaks = [0]+[t1-t0 for t0, t1 in zip(peak_vals[:-1], peak_vals[1:])]

    # Files whose name starts with 'm' are treated as the master track, but
    # only on the first pass.
    is_master = fname.lower().startswith('m') and not is_finetune

    if is_master:
        # Master: of the 8 strongest rises, take the 4 latest in time.
        tops = get_tops(peak_ts, peak_vals, d_peaks, 8)
        clap_times = sorted([t for t, v in tops])[-4:]
    elif is_finetune:
        # Second pass: the 4 strongest rises are taken as the claps.
        tops = get_tops(peak_ts, peak_vals, d_peaks, 8)
        clap_times = get_regular_patterns(sorted([t for t, v in tops[:4]]))
    else:
        # First pass: the 4 most evenly spaced of the 10 strongest rises.
        tops = get_tops(peak_ts, peak_vals, d_peaks, 10)
        clap_times = get_regular_patterns(sorted([t for t, v in tops]))
        # Manual overrides: file name -> indices into the time-sorted tops.
        cheat_list = {#"A_Lina_1.MOV":[4,5,6,7],
                      #"T_Preston.mp4":[1,2,3,4] #,
                      #"T_Enoch_1.mp4":[2,3,4,5]
                      }
        if fname in cheat_list:
            sorted_t = sorted([t for t, v in tops])
            clap_times = [sorted_t[i] for i in cheat_list[fname]]

    #print ("detected claps at ", np.array(clap_times))

    diffs = [t1-t0 for t0, t1 in zip(clap_times[0:3], clap_times[1:4])]
    #print("intervals: ", diffs)
    est_intv = mean(diffs)
    # Each clap independently predicts the instant half an interval after the
    # fourth clap; the predictions are averaged.
    est = [clap_times[k] + est_intv*(3.5-k) for k in range(4)]

    start_est = mean(est)
    if INCLUDE_CLAPS or is_finetune==False:
        # Move back four intervals, i.e. to half an interval before the first clap.
        start_est = start_est - est_intv*4
    #print("start est:", start_est)

    to_plot = lambda: plot_claps(fname, tops, peak_ts, peak_vals, d_peaks, clap_times)
    return start_est, est_intv, to_plot


def plot_claps(fname, tops, peak_ts, peak_vals, d_peaks, clap_times):
    """Print ``fname`` and show a two-panel diagnostic plot of the clap detection.

    Top panel: per-chunk peak values (red line) with the ``tops`` candidates
    marked. Bottom panel: peak-to-peak changes (blue line) with the chosen
    ``clap_times`` marked on the zero line.
    """
    print(f"\n{fname}\n")
    top_points = np.array(tops)
    top_times, top_scores = top_points[:,0], top_points[:,1]

    plt.subplot(2, 1, 1)
    plt.plot(peak_ts, peak_vals, '-', color="red")
    plt.plot(top_times, top_scores, "x")

    plt.subplot(2, 1, 2)
    plt.plot(peak_ts, d_peaks, '-', color='blue')
    plt.plot(clap_times, [0 for _ in clap_times], "x")
    plt.show()

def find_irregulars(ts):
    """Report and plot clips whose estimated beat interval is an outlier.

    ``ts[1]`` holds the estimated beat interval per clip and ``ts[2]`` the
    matching zero-argument plotting callbacks; ``ts[0]`` is not used here.
    A clip is reported when its squared deviation from the mean interval
    exceeds six times the variance of all intervals.
    """
    est_beats, to_plot_funcs = ts[1], ts[2]
    beats_avg = mean(est_beats)
    outlier_limit = 6*var(est_beats)

    for beat, show_plot in zip(est_beats, to_plot_funcs):
        deviation = beat-beats_avg
        if deviation**2 > outlier_limit: #get outliers
            print("irregular beats - expected:", (beats_avg, std(est_beats)), "actual:", (beat, deviation))
            show_plot()

def estimate_start_x2(input_audios):
    """Estimate the start time of every clip with a coarse and a fine pass.

    The first pass runs ``estimate_start`` on each full clip. The second pass
    re-runs it in fine-tune mode on each clip trimmed at its first estimate,
    scanning only 5 seconds. Outliers of both passes are reported through
    ``find_irregulars``.

    Returns one non-negative start time per clip (first estimate clamped at 0
    plus the fine-tune offset), or ``[]`` for empty input.
    """
    if not input_audios:
        return []
    print([basename(a.filename) for a in input_audios])

    # Transposed results: index 0 holds the start times of all clips.
    coarse = list(zip(*map(estimate_start, input_audios)))
    print("first round")
    find_irregulars(coarse)
    coarse_starts = coarse[0]

    fine_results = []
    for audio, coarse_start in zip(input_audios, coarse_starts):
        trimmed = audio.subclip(t_start=max(coarse_start,0))
        fine_results.append(estimate_start(trimmed, search_secs=5, is_finetune=True))
    fine = list(zip(*fine_results))
    print("second round")
    find_irregulars(fine)

    combined = []
    for coarse_start, fine_offset in zip(coarse_starts, fine[0]):
        combined.append(max(max(0,coarse_start)+fine_offset,0))
    return combined

    

## Sync 

In [0]:
def get_files(dir, glob_pattern):
    """List the files in ``dir`` that match any of the given glob patterns.

    ``dir`` is concatenated directly with each pattern, so it must end with a
    path separator. Files whose name starts with '_' are left out. Results are
    grouped in the order of ``glob_pattern``.
    """
    matches = []
    for pattern in glob_pattern:
        for path in glob.glob(dir+pattern):
            if basename(path).startswith('_'):
                continue
            matches.append(path)
    return matches

# Videos: load every raw recording, estimate its start from its audio track,
# then trim the clip so that it begins at the estimated start.
raw_vid_clips = [VideoFileClip(path) for path in get_files(RAW_PATH, VID_FILE_GLOBS)]
vid_start_times = estimate_start_x2([clip.audio for clip in raw_vid_clips])
input_vids = [clip.subclip(t_start=start)
              for clip, start in zip(raw_vid_clips, vid_start_times)]

# Audio-only recordings: same procedure.
raw_aud_clips = [AudioFileClip(path) for path in get_files(RAW_PATH, AUD_FILE_GLOBS)]
aud_start_times = estimate_start_x2(raw_aud_clips)
input_auds = [clip.subclip(t_start=start)
              for clip, start in zip(raw_aud_clips, aud_start_times)]


### Write synced files


In [0]:

def vFx(v):
    """Downscale a video clip to the cache height and normalize its audio volume."""
    shrunk = v.resize(height=H/VID_DOWNSCALE_RATIO)
    return shrunk.fx(afx.audio_normalize)


def aFx(a):
    """Normalize the volume of an audio clip."""
    return a.fx(afx.audio_normalize)


# Only clips flagged in to_process (new or updated raw files) are rewritten.
for vid_clip in input_vids:
    vid_name = basename(vid_clip.filename)
    if vid_name not in to_process:
        continue
    vFx(vid_clip).write_videofile(INPUT_PATH+vid_name, fps=30, codec='libx264', ffmpeg_params=['-crf','18'])

for aud_clip in input_auds:
    aud_name = basename(aud_clip.filename)
    if aud_name not in to_process:
        continue
    aFx(aud_clip).write_audiofile(INPUT_PATH+aud_name, fps=44100, codec='mp3')



### Just syncing audio for now ###

In [0]:
from math import log
from collections import defaultdict

# Mix volume per voice part, in percent.
VOL_S = 20 #@param {type:"slider", min:0, max:100}
VOL_A = 40 #@param {type:"slider", min:0, max:100}
VOL_T = 60 #@param {type:"slider", min:0, max:100}
VOL_B = 80 #@param {type:"slider", min:0, max:100}
VOL_M = 100 #@param {type:"slider", min:0, max:100}

# Volume lookup keyed by part letter (the upper-cased first character of the
# file name, see get_part). 'M' is wired to its slider so that moving VOL_M
# actually changes the mix; any other letter falls back to 100.
VOL = defaultdict(lambda:100, [
    ('S', VOL_S), ('A', VOL_A), ('T', VOL_T), ('B', VOL_B), ('M', VOL_M)])

# Audio of every file in the cache folder (videos included).
cached_media_paths = glob.glob(INPUT_PATH+"*")
out_auds = [AudioFileClip(cached_path) for cached_path in cached_media_paths]

def get_part(audio):
    """Return the part letter of a clip: the upper-cased first character of its file name."""
    first_char = basename(audio.filename)[0]
    return first_char.upper()

def get_part_vol(part):
    """Return the gain factor applied to the mix of one voice part.

    The part's percentage from ``VOL`` is converted to a fraction and divided
    by the natural log of the number of cached clips. With fewer than two
    clips that log is zero (or undefined), so no such scaling is applied
    instead of raising ZeroDivisionError.
    """
    n_clips = len(out_auds)
    crowd_scale = log(n_clips) if n_clips > 1 else 1.0
    return VOL[part]/100/crowd_scale

# Bucket the cached clips by part letter.
aud_groups = defaultdict(list)
for clip in out_auds:
    aud_groups[get_part(clip)].append(clip)

# One volume-adjusted composite per part.
aud_parts = []
for part, members in aud_groups.items():
    part_mix = CompositeAudioClip(members).volumex(get_part_vol(part))
    aud_parts.append((part, part_mix))

for part, part_mix in aud_parts:
    part_mix.write_audiofile(OUTPUT_PATH+f"output_{part}.mp3", fps=44100, codec='mp3')

# Full mix of all parts.
cc = CompositeAudioClip([part_mix for _, part_mix in aud_parts])
cc.write_audiofile(OUTPUT_PATH+"output.mp3", fps=44100, codec='mp3')
# ipython_display(cc, fps=44100, maxduration=360)
# ipython_display(cc, fps=44100, maxduration=360)


### Create solo-group stereo track for comparison and study


In [0]:
if OUTPUT_COMPARE:
    # First channel of the full mix, sampled at 44.1 kHz.
    choir_channel = cc.to_soundarray(fps=44100)[:,0]
    for clip in out_auds:
        clip_name = basename(clip.filename)
        solo_channel = clip.to_soundarray()[:,0]
        # Cut both signals to the shorter length so they can be paired up.
        n_samples = min(len(solo_channel), len(choir_channel))
        # Two-channel signal: the single clip on one side, the full mix on the other.
        stereo = np.transpose([solo_channel[:n_samples], choir_channel[:n_samples]])
        compare_clip = AudioArrayClip(stereo, fps=44100)
        compare_clip.write_audiofile(OUTPUT_PATH+"compare/"+clip_name+"_compare.mp3", codec='mp3')

if OUTPUT_SOLO:
    for clip in out_auds:
        solo_target = OUTPUT_PATH+"solo/"+basename(clip.filename)+".mp3"
        clip.write_audiofile(solo_target, codec='mp3')

In [0]:
def SATB_order(v):
    """Sort key placing clips in soprano, alto, tenor, bass order.

    The part is read from the first character of the clip's file name
    (case-insensitive). Clips without a ``filename`` attribute, or with any
    other first character, get a random rank between 0 and 3.
    """
    part_rank = {'s':0,'a':1,'t':2,'b':3}
    initial = ''
    if 'filename' in v.__dict__:
        initial = basename(v.filename)[0].lower()
    if initial in part_rank:
        return part_rank[initial]
    return randint(0,3)


def get_vid_array_dim(n):
    """Return the two grid dimensions used to lay out ``n`` tiles.

    ``side`` is the smallest positive integer whose square is at least ``n``.
    The result is ``(side, side)``, except that ``(side-1, side)`` is returned
    when ``n`` is not a perfect square and that smaller grid still holds ``n``.
    """
    side = 1
    while side*side < n:
        side += 1

    if side*side != n and (side-1)*side >= n:
        return (side-1, side)
    return (side, side)

if OUTPUT_TILED_VID:
    # Height of the strip reserved for subtitles: a tenth of the frame height.
    sub_height = H//10
    # Height left for the tile grid. This is a local value on purpose: the
    # global H is not modified, so re-running this cell (or any cell that
    # reads H) gives the same layout every time.
    grid_height = H - sub_height if RENDER_SUBTITLE else H

    out_vids = [VideoFileClip(filename).set_audio(None) for filename in get_files(INPUT_PATH, VID_FILE_GLOBS)]
    if not out_vids:
        raise ValueError("no cached videos found in " + INPUT_PATH + "; nothing to tile")
    ROWS,COLS = get_vid_array_dim(len(out_vids))
    clip_duration = max([i.duration for i in out_vids])
    # Black filler for grid cells that have no video.
    empty_clip = ColorClip((W//COLS,grid_height//ROWS), (0,0,0), duration=clip_duration)
    out_vids = out_vids+[empty_clip]*(ROWS*COLS-len(out_vids))
    # Shuffle, then sort by part: the sort is stable, so clips of the same
    # part keep the shuffled (random) order among themselves.
    shuffle(out_vids)
    out_vids.sort(key=SATB_order)

    tiles = np.reshape(out_vids, (COLS, ROWS)) #dimension is inversed because need to transpose in next line
    tiles = np.transpose(tiles)
    if RENDER_SUBTITLE:
        # Extra bottom row of black tiles where the subtitle gets rendered.
        subspace_clip = ColorClip((W//COLS,sub_height), (0,0,0), duration=clip_duration)
        tiles = np.vstack([tiles,[subspace_clip]*COLS])

    cc = clips_array(tiles, cols_widths=[W//COLS]*COLS)
    cc = cc.set_audio(AudioFileClip(OUTPUT_PATH+"output.mp3"))
    # ipython_display(cc, t=10)
    # cc=cc.subclip(40,60)
    # Pick the first output<N>.mp4 name that does not exist yet.
    output_i=0
    while os.path.exists(OUTPUT_PATH+f"output{output_i}.mp4"):
        output_i += 1

    cc.write_videofile(OUTPUT_PATH+f"output{output_i}.mp4", fps=30, codec='libx264', ffmpeg_params=['-crf','18'], threads=4)

In [0]:
!ffmpeg -i "$RAW_PATH""$SUBTITLE_FILENAME" subtitle.ass 
!ffmpeg -i "$OUTPUT_PATH"output"$output_i".mp4 -vf ass=subtitle.ass "$OUTPUT_PATH"output"$output_i"_sub_rendered.mp4
