# Using Our Trained ML Model to Predict Laughter

In an earlier notebook, we trained our model to identify laughter using a training set of ~20 episodes (each about 22 minutes long). That is roughly 7 hours of training data, split into 10-second increments.

In this notebook, we load all remaining 214 episodes of Friends and predict the exact time ranges of laughter for each episode.

In [None]:
from pydub import AudioSegment
import os
from scipy.io import wavfile
import matplotlib.pyplot as plt
import operator
import numpy as np
import IPython

### Deciding clip length and overlap between clips

We use 10 second clips just like we used in training. 

In [None]:
# Clip geometry shared by the rest of the notebook.
cliplen = 10000      # length of one clip in ms
predtimesteps = 847  # prediction timesteps per clip; dictated by the trained model
lag = 0              # this will dictate overlap necessary
# old overlap formula = round(((cliplen / predtimesteps) * lag) + 1), i.e. rounded up to the nearest ms
overlap = 0          # ms shared by two consecutive clips
timesteplen = cliplen / predtimesteps  # ms covered by a single prediction timestep

for label, value in (("overlap in ms is ", overlap),
                     ("length of each timestep in ms is ", timesteplen)):
    print(label + str(value))

### Creating 10-second clips

We take in the preprocessed audio files from each episode (~22 mins each) and slice them into 10-second audio clips.

In [None]:
# Slice every episode of the selected season into wav clips of cliplen ms.
audiofolder = '/Users/Jack/Developer/friends/allaudio/'
seasonfilter = 5

for filename in os.listdir(audiofolder):
    # names starting with '.' are skipped
    if filename.startswith('.'):
        continue
    # season and episode numbers are read from fixed character offsets of the name
    season = filename[9:11]
    if int(season) != seasonfilter:
        continue
    episode = filename[12:14]
    filepath = audiofolder + filename
    file = AudioSegment.from_file(filepath)

    count = 1
    startcut = 0
    while startcut < len(file):
        # the window never reaches past the end of the episode
        endcut = min(startcut + cliplen, len(file))
        clip = file[startcut:endcut]
        # a clip shorter than cliplen means the episode is used up; the remainder is dropped
        if len(clip) < cliplen:
            break
        clippath = ("/Users/Jack/Developer/friends/clips/s" + str(season)
                    + "e" + str(episode)
                    + "n" + str(count)
                    + "beg" + str(startcut)
                    + "end" + str(endcut)
                    + ".wav")
        clip.export(clippath, format="wav")
        # move on to the next window
        startcut += (cliplen - overlap)
        count += 1


### Sorting the clips

In [None]:
# Gather [season, episode, begtime, endtime, filename] for every clip of the
# selected season, then order the records by season, episode and start time.
clipsfolder = '/Users/Jack/Developer/friends/clips/'
listtosort = []

for filename in os.listdir(clipsfolder):
    if filename.startswith('.'):
        continue
    season = int(filename[1:3])
    if season != seasonfilter:
        continue
    episode = int(filename[4:6])
    # the clip bounds are embedded in the name as ...beg<start>end<end>.<extension>
    begmarker = filename.find("beg")
    endmarker = filename.find("end")
    firstdot = filename.find(".")
    begtime = int(filename[begmarker + 3:endmarker])
    endtime = int(filename[endmarker + 3:firstdot])
    listtosort.append([season, episode, begtime, endtime, filename])

sortedclips = list(listtosort)
sortedclips.sort(key=lambda record: (record[0], record[1], record[2]))


### Shaping clips into the correct form for the model

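We take the 10-second clips, break each one into 861 spectrogram timesteps, and measure 257 frequency levels at each timestep. Then we package the timesteps and frequencies for each clip into a numpy array that we can feed to our model. The model's convolution layer (kernel size 15, stride 1) turns those 861 input timesteps into 861 - 15 + 1 = 847 prediction timesteps, which is the `predtimesteps` value set above.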

In [None]:
# Creating preX, a numpy array of shape
# (number of clips, number of spectrogram timesteps, number of frequencies).
# Each spectrogram is transposed before stacking, so time is axis 1 and frequency is axis 2.

clipsfolder = '/Users/Jack/Developer/friends/clips/'
spectdata = []

for count, info in enumerate(sortedclips, start=1):
    filename = info[4]
    filepath = clipsfolder + filename
    FS, data = wavfile.read(filepath)  # sample rate and samples of the wav clip
    # plt.specgram also draws an image onto the current axes. Clearing the axes first
    # keeps only one image alive instead of piling up one image per clip.
    plt.cla()
    pxx = plt.specgram(data, Fs=FS, NFFT=512, noverlap=0)[0]  # only the spectrum is needed
    Tx = pxx.shape[1]  # number of time steps in the spectrogram
    n_freq = pxx.shape[0]  # number of frequencies in the spectrogram
    pxxtransposed = pxx.T  # (timesteps, frequencies)
    spectdata.append(pxxtransposed)
    # print diagnostics for one clip only (the 40th) as a spot check
    if count == 40:
        print("file duration is " + str(len(data) / float(FS)))
        print("filename is " + str(filename))
        print("number of channels is " + str(data.ndim)) # 1 channel now because we are merging in stereo
        print("num of time steps in spectrogram is " + str(Tx))
        print("num of frequencies in spectrogram is " + str(n_freq))

preX = np.stack(spectdata, axis=0)
print("shape of preX is " + str(preX.shape))


### We save the numpy array of clip info locally

In [None]:
# Persist the stacked spectrograms of the selected season.
modeldatafolder = '/Users/Jack/Developer/friends/modeldata/'
prexpath = modeldatafolder + 'preX' + str(seasonfilter) + '.txt'
# NOTE(review): np.save appends '.npy' to names that lack that suffix, so the file
# on disk is expected to end in '.txt.npy' — confirm before loading it elsewhere.
np.save(prexpath, preX)

### Removed some preprocessing for preX so now we just assign as X

In [None]:
# X is the array handed to the model. This is a plain rebinding, so X and preX
# refer to the same array object (no copy is made).
X = preX

In [None]:
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from keras.optimizers import Adam
from keras.metrics import Precision, Recall

### Redefining our CNN+GRU neural network architecture

In [None]:
def model(input_shape):
    """Build the Conv1D + two-layer GRU network that scores every timestep.

    Arguments:
    input_shape -- shape of one input example, passed straight to keras `Input`

    Returns:
    model -- an uncompiled keras `Model` whose output is one sigmoid value per
             timestep left after the convolution
    """

    X_input = Input(shape = input_shape)

    # Convolution layer
    X = Conv1D(filters=256,kernel_size=15,strides=1)(X_input)
    X = BatchNormalization()(X)
    X = Activation("relu")(X)
    X = Dropout(rate=0.8)(X)

    # GRU Layer 1 (return_sequences keeps one output per timestep)
    X = GRU(units=256, return_sequences = True)(X)
    X = Dropout(rate=0.8)(X)
    X = BatchNormalization()(X)

    # GRU Layer 2
    X = GRU(units=256, return_sequences = True)(X)
    X = Dropout(rate=0.8)(X)
    X = BatchNormalization()(X)
    X = Dropout(rate=0.8)(X)

    # Time-Distributed Dense Layer with Sigmoid: one value per timestep
    X = TimeDistributed(Dense(1, activation = "sigmoid"))(X)

    return Model(inputs = X_input, outputs = X)
    

### Defining model input shape

In [None]:
# One input example is a single clip, i.e. X without its leading clip axis.
# Note that this rebinds `model` from the builder function to the built network,
# so the cell defining the function must be re-run before this cell can run again.
clipshape = (X.shape[1], X.shape[2])
model = model(input_shape=clipshape)

### Loading weights from our previously trained model

In [None]:
# Restore the weights saved by the training notebook.
modelfolder = '/Users/Jack/Developer/friends/modelweights/'
weightspath = modelfolder + 'modelweights.h5'
model.load_weights(weightspath)

### Using Adam gradient descent optimization algorithm

In [None]:
# Optimizer settings, collected in one place and handed to Adam.
# NOTE(review): newer Keras releases may expect `learning_rate` instead of `lr` —
# confirm against the installed version.
adamsettings = dict(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
opt = Adam(**adamsettings)

# Metrics reported next to the binary cross-entropy loss.
trackedmetrics = ["accuracy", Precision(), Recall()]
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=trackedmetrics)

### Predicting for all clips

In [None]:
# Run the loaded network over every clip in X in a single call.
# The network ends in a per-timestep sigmoid, so rawprobs holds one probability
# for each timestep of each clip.
rawprobs = model.predict(X)

In [None]:
# Sanity check on the dimensions of the prediction array
print(rawprobs.shape)

### Chose an 80% confidence threshold for laughter inclusion

Our precision score was lower than our recall score for a long time, so I decided to be more strict on what counted as laughter. We were getting almost all "true" laughter instances correct, but we were including some instances that were not laughter more often, so raising the threshold would correct for this. However, I think the precision and recall scores switched slightly in the last few versions of the model training, so I probably could have lowered this back down to ~50%. 

In [None]:
# Drop the trailing single-output axis, then mark every timestep whose
# probability is strictly above 80% as laughter (1) and everything else as 0.
laughthreshold = 0.8
probs = rawprobs[:, :, 0]
islaughter = probs > laughthreshold
preds = np.where(islaughter, 1, 0)
print(preds.shape)

### Testing performance on some individual clips

In [None]:
# Clip test: listen to one clip and plot the model's per-timestep probabilities for it.
# S5E15 starts at clipnum 2100
clipnum = 1866
# model.predict expects a leading batch axis, so the clip is wrapped as a batch of one
clipdata = np.expand_dims(X[clipnum], axis=0)
# Clip-specific names are used on purpose: the season-wide `preds` and `probs` arrays
# are read again by the per-episode cells further down and must not be overwritten
# by this spot check.
clippreds = model.predict(clipdata)
clipprobs = clippreds[0, :, 0]

# audio output
clipsfolder = '/Users/Jack/Developer/friends/clips/'
clipinfo = sortedclips[clipnum]
filename = clipinfo[4]
filepath = clipsfolder + filename
print(filename)
IPython.display.display(IPython.display.Audio(filepath))

# probabilities graph
plt.subplot(1, 1, 1)
plt.plot(clipprobs)
plt.ylabel('probability')
plt.show()


### Creating dict with episode clip counts for later use

In [None]:
# Tally how many clips each episode of the selected season contributed.
# Keys are episode numbers, values are clip counts.
epclipcounts = {}

for info in sortedclips:
    season, episode = info[0], info[1]
    if season != seasonfilter:
        continue
    epclipcounts[episode] = epclipcounts.get(episode, 0) + 1

print(epclipcounts)

### Checking our clip count

In [None]:
# The per-episode counts should add up to the number of rows in preds.
totalclips = sum(epclipcounts.values())
print("Total clips is " + str(totalclips) + " and should be equal to " + str(preds.shape[0]))
    

### Preprocessing laughter predictions for each episode

In [None]:
# Cut preds into one block of rows per episode. The blocks are taken in the
# iteration order of epclipcounts, each as long as that episode's clip count.
predsbyep = {}
startpos = 0

for ep, epcount in epclipcounts.items():
    endpos = startpos + epcount
    key = "s" + str(seasonfilter).zfill(2) + "e" + str(ep).zfill(2)
    print(startpos, endpos)
    episodepreds = preds[startpos:endpos]
    predsbyep[key] = episodepreds
    print(key, episodepreds.shape)
    # the next episode starts where this one ended
    startpos = endpos

### Saving laughter predictions

In [None]:
# Write one prediction array per episode into the predsbyep folder.
predsbyepfolder = '/Users/Jack/Developer/friends/predsbyep/'

for ep in predsbyep:
    savepath = predsbyepfolder + ep + ".txt"
    # NOTE(review): np.save appends '.npy' to names that lack that suffix — confirm the on-disk names.
    np.save(savepath, predsbyep[ep])

### Creating final laughter ranges

There are effectively two processing steps I take here in order to make the laughter ranges even more accurate. Anything under 400 milliseconds is too short to be a standalone laughter instance, so it either needs to be joined with a nearby laughter instance or be removed. Any gap of 100ms or less between two laughter instances is much too short to be meaningful, so we combine those two laughter instances into one longer laughter instance.

We output a dictionary where the keys are specific episodes and the values are lists that contain pairs of start/end timestamps for laughter instances. 

In [None]:
# Turn the saved per-episode predictions into lists of [start_ms, end_ms] laughter ranges.
# Every file found in predsbyepfolder is processed (no season filter is applied here).
laughrangesdict = {}
minlaughlen = 400 # in ms - this decides the minimum length to be considered a laugh
concatlaugh = 100 # in ms - this decides the maximum length in between two laughs in order to join them together
consecsteps = int(minlaughlen / timesteplen) # min number of timesteps in a row to register as laugh
consecnolaugh = int(concatlaugh / timesteplen) # max number of no-laugh timesteps in a row to combine two laughs
print("min # of timesteps in a row to register a laugh is " + str(consecsteps))
print("max # of timesteps in a row to combine two laughs is " + str(consecnolaugh))

for filename in os.listdir(predsbyepfolder):
    if not filename.startswith('.'):
        # season and episode are kept as the two-character strings found in the file name
        season = filename[1:3]
        episode = filename[4:6]
        ep = np.load(predsbyepfolder + filename)
        # flatten() returns a copy, so the in-place edits below do not touch `ep`;
        # the result is one long sequence of per-timestep values for the episode
        flatep = ep.flatten()
        switchingindices = []
        # Step 1: record every index where the value differs from the previous index
        # (0 to 1 or 1 to 0). Even positions in the list are laugh starts, odd positions laugh ends.
        for i, step in enumerate(flatep):
            # the first value is forced to 0 so the sequence always starts outside a laugh
            if i == 0:
                flatep[i] = 0
            # the last value is forced to 0 so every laugh that was opened also gets closed
            elif i == len(flatep) - 1:
                flatep[i] = 0
                # if the value before was 1, the forced 0 is itself a switch (keeps the count even)
                if flatep[i - 1] == 1:
                    switchingindices.append(i)
            # any other index is a switch when it differs from its predecessor
            elif flatep[i] != flatep[i - 1]:
                switchingindices.append(i)
        # Step 2: merge laughs separated by a short gap. A laugh end and the following laugh start
        # that are at most consecnolaugh steps apart are both left out, which joins the two laughs.
        smoothindices = []
        for i, value in enumerate(switchingindices):
            # first switch is always kept; it has no predecessor to compare with
            if i == 0:
                smoothindices.append(value)
            # last switch is always kept; it has no successor to compare with
            elif i == len(switchingindices) - 1:
                smoothindices.append(value)
            # even position = start of a laugh: keep it only if the previous laugh ended far enough back
            elif i % 2 == 0:
                if value - switchingindices[i - 1] > consecnolaugh:
                    smoothindices.append(value)
            # odd position = end of a laugh: keep it only if the next laugh starts far enough ahead
            elif i % 2 != 0:
                if abs(value - switchingindices[i + 1]) > consecnolaugh:
                    smoothindices.append(value)
        # starts and ends must still pair up; an odd count stops the whole loop,
        # so no further files are processed after the message is printed
        if len(smoothindices) % 2 != 0:
            print("WE HAVE AN ERROR")
            break
        # Step 3: group the flat list into [start, end] pairs
        coupledlist = []
        templist = []
        for i, timestep in enumerate(smoothindices):
            if i % 2 == 0:
                templist.append(timestep)
            if i % 2 != 0:
                templist.append(timestep)
                coupledlist.append(templist)
                templist = []
        # Step 4: drop every laugh spanning fewer than consecsteps timesteps
        minlaughlist = [pair for pair in coupledlist if pair[1] - pair[0] >= consecsteps]
        # Step 5: convert timestep indices to ms (int() truncates the fractional part)
        laughranges = []
        for pair in minlaughlist:
            templist = [int(step * timesteplen) for step in pair]
            laughranges.append(templist)
        # key is the season string followed by the episode string, e.g. "0501"
        laughrangesdict[season + episode] = laughranges
print(len(laughrangesdict.keys()))
print(laughrangesdict)


### Creating new SQLite table to store laughter instances 

In [None]:
import sqlite3

# Connecting to friendsdb SQLite database and creating laughs table
conn = sqlite3.connect('/Users/Jack/Developer/friends/friendsdb.sqlite')
cur = conn.cursor()

# `beg` and `end` are both millisecond offsets, so both are declared INTEGER.
# A TEXT column would store the inserted integers as strings.
# IF NOT EXISTS leaves an already existing laughs table untouched, including its column types.
# UNIQUE(season, episode, beg) allows at most one row per laugh start within an episode.
cur.executescript('''
CREATE TABLE IF NOT EXISTS laughs (
    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
    season INTEGER,
    episode INTEGER,
    beg INTEGER,
    end INTEGER,
    UNIQUE(season, episode, beg)
);
''')

### Inputting laughter instances

We take our nicely formatted laughter ranges, organized by episode, and we store them in a SQLite database.

In [None]:
# Write every laughter range to the laughs table.
# Keys of laughrangesdict are 4-character strings: 2-digit season followed by 2-digit episode.
for seasep, eplaughs in laughrangesdict.items():
    season = int(seasep[0:2])
    episode = int(seasep[2:4])
    for laugh in eplaughs:
        beg = laugh[0]
        end = laugh[1]
        # OR REPLACE overwrites a row that already has the same (season, episode, beg)
        cur.execute('''INSERT OR REPLACE INTO laughs (season, episode, beg, end)
            VALUES ( ?, ?, ?, ? )''', ( season, episode, beg, end ) )

# Commit once after all rows are queued instead of once per row.
conn.commit()
    



### Adding column to laughs table in SQLite so we can attribute the laugh to a character

In [None]:
addColumn = "ALTER TABLE laughs ADD COLUMN char TEXT"

# ADD COLUMN raises an error when the column is already there, so it is only run
# when `char` is missing. That keeps this cell safe to run more than once.
# PRAGMA table_info yields one row per column; the column name is the second field.
existingcolumns = [row[1] for row in cur.execute("PRAGMA table_info(laughs)")]
if "char" not in existingcolumns:
    cur.execute(addColumn)