In [1]:
import pandas as pd
import numpy as np
import boto3
import os
import yaml
import openpyxl
from openpyxl.styles import Font
from pydub import AudioSegment
from pydub.utils import which
from etl.elastic_search import *

# Set the path to ffmpeg
AudioSegment.converter = which("ffmpeg")

with open('./config.yaml', 'r') as file:
    config_data = yaml.safe_load(file)

In [2]:
item_mapping = pd.read_excel(
    './data/FLAS_item_mapping.xlsx',
    index_col=None
)

recording_list = pd.read_csv(
    './data/FLA.audio.code.flags.csv'
)
recording_list['StdID'] = recording_list['StdID'].astype('str')

In [3]:
session = boto3.Session()
mfa_serial = session._session.full_config['profiles']['default']['mfa_serial']
mfa_token = input('Please enter your 6 digit MFA code:')

sts = session.client('sts')
MFA_validated_token = sts.get_session_token(SerialNumber=mfa_serial, TokenCode=mfa_token)

In [4]:
s3_client = session.client('s3', 
    aws_session_token=MFA_validated_token['Credentials']['SessionToken'],
    aws_secret_access_key=MFA_validated_token['Credentials']['SecretAccessKey'],
    aws_access_key_id=MFA_validated_token['Credentials']['AccessKeyId']
)

In [5]:
bucket_name = 'pisa-fla-audio-recording'
prefix = "delivery-execution-uploads/"
paginator = s3_client.get_paginator('list_objects_v2')

file_list = list()

for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    # Each page is up to 1000 objects.
    # The 'Contents' key holds the list of objects in this page.
    for obj in page.get('Contents', []):
        file_list.append(obj['Key'])

In [6]:
dat = pd.DataFrame(
    {
        'Key': file_list
    }
)
dat[["parent", "deliveryExecutionId", "clusterid", "response", "filename"]] = dat["Key"].str.split(r'/', expand=True)
dat.drop(columns = ['parent'],inplace=True)
dat['StdID'] = dat['deliveryExecutionId'].apply(lambda x: x.split('#', 1)[0][::-1])
dat = dat.merge(
    item_mapping,
    how = 'left',
    on = 'clusterid'
).drop_duplicates(keep = 'first')
dat.head()

Unnamed: 0,Key,deliveryExecutionId,clusterid,response,filename,StdID,item_id
0,delivery-execution-uploads/00131004361#17e47be...,00131004361#17e47bec76d6#585421c1d6b7bd83da983...,cluster1-FLAS01-item-1,RESPONSE,f2a9aa829509e8c40991435a03bbf2a1d5f14238,16340013100,FLAS101Q01
1,delivery-execution-uploads/00131004361#17e47be...,00131004361#17e47bec76d6#585421c1d6b7bd83da983...,cluster1-FLAS01-item-11,RESPONSE_2,c16f7518900680dd086f54371e1b283c9a5e1a23,16340013100,FLAS201Q01
2,delivery-execution-uploads/00131004361#17e47be...,00131004361#17e47bec76d6#585421c1d6b7bd83da983...,cluster1-FLAS01-item-12,RESPONSE,0a19a8a21722db1b7ef1f7264a8b59bef86ebaf3,16340013100,FLAS201Q02
3,delivery-execution-uploads/00131004361#17e47be...,00131004361#17e47bec76d6#585421c1d6b7bd83da983...,cluster1-FLAS01-item-14,RESPONSE,2d941c1840aa510843fd874796b30dd5452de7d8,16340013100,FLAS301Q01
4,delivery-execution-uploads/00131004361#17e47be...,00131004361#17e47bec76d6#585421c1d6b7bd83da983...,cluster1-FLAS01-item-15,RESPONSE,f4adb2178d25f783a855b9d38be3506cad8489c7,16340013100,FLAS301Q02


In [7]:
dat_final = recording_list.loc[:,['StdID','isoname','item_id','item_code','audio_filename','flag']].merge(
    dat,
    how = 'left',
    on = ['StdID','item_id']
).loc[:,['StdID','deliveryExecutionId','isoname','item_id','item_code','clusterid','audio_filename','filename','Key','flag']]

In [17]:
dat_final.shape

(150446, 10)

In [9]:
dat_final.head()

Unnamed: 0,StdID,deliveryExecutionId,isoname,item_id,item_code,clusterid,audio_filename,filename,Key,flag
0,0,,,FLAS105Q01,,,,,,
1,0,,,FLAS105Q02,,,,,,
2,0,,,FLAS105Q03,,,,,,
3,0,,,FLAS105Q04,,,,,,
4,0,,,FLAS105Q05,,,,,,


In [8]:
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
import wave
import subprocess
from botocore.exceptions import ClientError

def audio_length_file(key_path):
    if pd.isnull(key_path):
        duration = None
    else:
        try:
            # Attempt to load the audio file
            audio = AudioSegment.from_file(key_path)
            duration = audio.duration_seconds
        except CouldntDecodeError:
            print(f"Could not read the audio file {key_path}, it might be in use.")
            try:
                result = subprocess.run(
                    ["ffmpeg", "-v", "error", "-i", key_path, "-f", "null", "-"],
                    stderr=subprocess.PIPE,
                    stdout=subprocess.PIPE
                )
                # If the return code is 0, the file is valid
                duration = 0
                return result.returncode == 0
            except Exception as e:
                print(f"Error decoding compressed WAV file: {e}")
                duration = 0
        except Exception as e:
            duration = 0
            print(f"An unexpected error occurred: {e}")
    return duration

def download_audio(base_path,key_string, temp = False, duration = True):
    error = False
    if pd.isnull(key_string):
        key_path = None
        dur = None
    else:
        filename = f"{key_string.split(r'/')[-1:][0]}.mp3"
        key_path = f'{base_path}/{filename}'
        file_exists = os.path.exists(key_path)
        if not file_exists:
            try:
                s3_client.download_file(
                    Bucket = bucket_name,
                    Key = key_string,
                    Filename = key_path
                )
            except ClientError as e:
                try:
                    s3_client.download_file(
                        Bucket = bucket_name,
                        Key = key_string,
                        Filename = key_path
                    )
                except ClientError as er:
                    print(f"Error creating file: {key_string}")
                    error = True
            

        if duration and not error:
            dur = audio_length_file(key_path)
        else:
            dur = None
        if temp and not error:
            os.remove(key_path)
            key_path = None

    return key_path, dur

def duration_es(df):
    eligible = any(df['_source.domEventType'].isin(['record'])) & any(df['_source.domEventType'].isin(['play']))
    if not eligible:
        duration1 = 0
        duration2 = 0
    else:
        play_ser = df.loc[(df['_source.domEventType'] == 'play'),'_source.timestamp'].iloc[0]
        play_time = float(play_ser.item()) / 1000
        record_ser = df.loc[(df['_source.domEventType'] == 'record'),'_source.timestamp'].iloc[0]
        record_time = float(record_ser.item()) / 1000
        move_ser = df.loc[(df['_source.metadata.type'] == 'move'),['_source.timestamp']].iloc[0]
        move_time = float(move_ser.item()) / 1000

        duration1 = record_time - play_time
        duration2 = move_time - record_time

    return pd.DataFrame({
        'clusterid': [df.name],
        'dur_ui_1': [duration1],
        'dur_ui_2': [duration2]
    })

def audio_length_es(del_id):
    args = parse_arguments(
        index_key = 'all',
        # del_exec_list=del_exec_list,
        api_key = config_data['elastic_search']['ELASTIC_API_KEY'],
        req_size = '5000'
    )

    pit_id = create_pit(args.index_key)

    row_list = []

    hits, request_num, search_after = True, 1, None
    while hits:
        # hits = search(args, pit_id, search_after, del_id=list(del_id.split(" ")))
        hits = search(args, pit_id, search_after, del_id=del_id)
        # say(f'Request {request_num} {search_after} for {del_id}; Rows {len(hits)}')
        if hits:
            df = pd.json_normalize(hits)
            df = df.loc[~pd.isnull(df['_source.itemId']),:]
            row_list.append(df)
            request_num += 1
            search_after = hits[-1]['sort']

    dat = pd.concat(row_list,axis = 0)

    item_ids = [x for x in dat['_source.itemId'].unique() if re.search('^cluster',x)]

    dat = df.loc[df['_source.itemId'].isin(item_ids),:]
    results = dat.groupby(['_source.itemId']).apply(duration_es).reset_index(drop=True)
    results['deliveryExecutionId'] = del_id

    return results

def download_audio(row, base_path,duration):
    key_string = row['Key']
    if row['flag'] == 'black':
        temp = False
    else:
        temp = True

    error = False
    if pd.isnull(key_string):
        key_path = None
        dur = None
    else:
        filename = f"{key_string.split(r'/')[-1:][0]}.mp3"
        key_path = f'{base_path}/{filename}'
        file_exists = os.path.exists(key_path)
        if not file_exists:
            try:
                s3_client.download_file(
                    Bucket = bucket_name,
                    Key = key_string,
                    Filename = key_path
                )
            except ClientError as e:
                try:
                    s3_client.download_file(
                        Bucket = bucket_name,
                        Key = key_string,
                        Filename = key_path
                    )
                    error = False
                except ClientError as er:
                    error = True

        if duration and not error:
            dur = audio_length_file(key_path)
        else:
            dur = 0
        if temp and not error:
            os.remove(key_path)
            key_path = ''

    return (key_path, dur)

In [11]:
import openpyxl
import time

base_path = r'P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records'
countries = dat_final.loc[~pd.isnull(dat_final['isoname']),'isoname'].unique()
args = None

excel_timings_filename = f'{base_path}/FLA Speaking duration of recordings.xlsx'

wbt = openpyxl.Workbook(write_only=True)

for country in countries[16:]:
    start_time = time.time()
    country_path = f'{base_path}/{country}'
    recording_path = f'{country_path}/Recordings'
    data_path = f'{country_path}/Data'
    
    # Create folders if not already exists
    if not os.path.exists(country_path):
        os.makedirs(country_path)
    if not os.path.exists(recording_path):
        os.makedirs(recording_path)
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    dat_cnt = dat_final.loc[(dat_final['isoname'] == country),:].reset_index(drop = True)
    dat_cnt = dat_cnt.drop_duplicates(subset = ['StdID','item_code','audio_filename'],keep='first').fillna('missing').astype(str)
    dat_cnt = dat_cnt.loc[(~pd.isnull(dat_cnt['deliveryExecutionId'])) & (~pd.isnull(dat_cnt['clusterid'])) & (dat_cnt['deliveryExecutionId'] != 'missing'),:]
    dat_cnt['key_path'] = None
    dat_cnt['dur_file'] = None

    print(f"{country}: Downloading recordings and calculating duration")
    del_ids = dat_cnt['deliveryExecutionId'].unique()
    for del_id in del_ids:

        parquet_file = f"{data_path}/{del_id}.parquet"
        if not os.path.exists(parquet_file):
            try:
                dat_del_id = dat_cnt.loc[dat_cnt['deliveryExecutionId'] == del_id,:]
                dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
                dur_dat = audio_length_es(del_id)
                dat_del_id = dat_del_id.merge(
                    dur_dat,
                    how = 'left',
                    on = ['deliveryExecutionId','clusterid']
                )
                dat_del_id.to_parquet(parquet_file,compression='snappy')
            except Exception as e:
                # Log the error ID to a text file
                with open(f"{base_path}/error_delids.txt", "a") as file:
                    file.write(f"Error with ID {del_id}: {e}\n")

    elapsed_time = time.time() - start_time
    print("Total time taken: " + str(elapsed_time))

    # wbt.create_sheet(country)
    # ws = wbt[country]
    # for r in dataframe_to_rows(dat_cnt.drop(columns = ['Key']),index=False,header=True):
    #     ws.append(r)

    # print('Creating recordings for ' + country)
    # # Create a new workbook and select the active worksheet
    # excel_filename = f'{country_path}/{country} Recordings.xlsx'
    # wb = openpyxl.Workbook()
    # ws = wb.active
    # ws.title = "Recordings"

    # colnames = ['StdID','isoname','item_id','item_code','clusterid','file']
    # for col_idx,col in enumerate(colnames):
    #     col = ws.cell(row = 1,column = col_idx + 1, value = col)
    #     col.font = Font(bold = True)

    # for index, row in dat_cnt.iterrows():
    #     r_idx = index + 2
    #     short_row = row[colnames[:-1]]
    #     for c_idx,cell in enumerate(short_row,1):
    #         ws.cell(row = r_idx, column = c_idx, value = cell)

    #     key_path = row['key_path']
    #     if isinstance(key_path,pd.Series):
    #         image = pd.isnull(key_path).any()
    #     else:
    #         image = pd.isnull(key_path)
    #     if not image:
    #         media_file_path = os.path.abspath(key_path)
    #         file_url = f'file:///{media_file_path.replace("\\", "/")}'

    #         # Assign the display text and hyperlink
    #         cell = ws.cell(row=r_idx, column=6, value='Play Recording')
    #         cell.hyperlink = file_url
    #         cell.style = "Hyperlink"

    # wb.save(excel_filename)

# wbt.save(excel_timings_filename)
    

Qatar: Downloading recordings and calculating duration


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Total time taken: 5182.991564512253
Romania: Downloading recordings and calculating duration


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/19743b786fab59a9b8eb44164a090622176a5a3d.mp3, it might be in use.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)


Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/ce38b4fb8fb7acef687f62de7275530c8982df24.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/f1889f9f61627ec3fb3c4ac86566041a278758c2.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/03e41d071d602c5d57b70100e6d37ca2f32c9c1d.mp3, it might be in use.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)


Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/309df387ab9923292a2bb4cbb3d311a3f18a7e75.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/6046a5afe44cd93221b5730b0f7b88b863e78767.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/4acfad413aa4686185512815f445856be89f3d6d.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/8daf6c89935eb1a92bb4b724222be074307a3dcf.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/8c5204a0c1c0d0d59399567dec35ae209135164a.mp3, it might be in use.
Could not 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)


Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/cff8256ae55776a603a6ce231f6489061b8dbfde.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/9f3633dcdc4d89a961951ae29d82bc3ea956d374.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/d353dfb0efa4b0df7669ba3accccb46cd867dcf3.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/ccd6dbfab2af35e0be992c05afb40f7b8a4a940b.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/c07c55b39bb62150fd660bac863ec54179b02e53.mp3, it might be in use.
Could not 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)


Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/ea3d70c38262e14f3012515f000c95bae664e71a.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/15a29d19aecf276efc4d6312f5f017c19ddd8208.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/cbc270a7b9ee692a90257e3969ad36b2cac479fa.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Romania/Recordings/f04ee13228b2f5faaad0541eae104697c2653e65.mp3, it might be in use.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)


Total time taken: 5746.894581079483
Spain: Downloading recordings and calculating duration


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Spain/Recordings/5a0e46ec5d9fbfc72aa44736052f56f86e1ef0ce.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Spain/Recordings/d2b72340139ed40c3fbc86b949bd08cef5bfab5b.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Spain/Recordings/b370441419c8b0f1f5888cd5a9afbf65da8a50f0.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Spain/Recordings/4ec4152457563ddf95e2ed607bf77986803ef21f.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Spain/Recordings/77693e04b8a1d4eef12638c56ec281250063ad01.mp3, it might be in use.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Spain/Recordings/98d864e33b1811623537a0441773dda156b13a0b.mp3, it might be in use.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Total time taken: 14306.824205160141
Sweden: Downloading recordings and calculating duration


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Total time taken: 6621.706236839294
Ukraine: Downloading recordings and calculating duration


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Ukraine/Recordings/a56ecb49cdcec40da6bb48f9d87a05d51a0c72ec.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Ukraine/Recordings/0bda79d98a53dcfc50021d44ef463e24f4856ab8.mp3, it might be in use.
Could not read the audio file P:/VM Backup/251003 PISA FT T6a/Output/FLA Speaking/Audit of black flagged records/Ukraine/Recordings/c25ea2191abd93106ef96234de73b97cf62b19b4.mp3, it might be in use.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_del_id[['key_path','dur_file']] = dat_del_id.apply(lambda row: pd.Series(download_audio(row, base_path=recording_path, duration=True)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Total time taken: 5340.177215099335


In [131]:
dat_other = recording_list.loc[(recording_list['flag'] != 'black') & (recording_list['criteria_1'] == 9),['StdID','isoname','item_id','item_code','flag','criteria_1','audio_filename']].merge(
    dat,
    how = 'left',
    on = ['StdID','item_id']
).loc[:,['StdID','isoname','item_id','item_code','clusterid','flag','criteria_1','audio_filename','filename','Key']]
dat_other = dat_other.loc[~pd.isnull(dat_other['filename']),:].reset_index()
dat_other = dat_other.head(100)

excel_filename = f'{base_path}/Other/Other Recordings.xlsx'
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Recordings"

colnames = ['StdID','isoname','item_id','item_code','clusterid','flag','criteria_1','audio_filename','file']
for col_idx,col in enumerate(colnames):
    col = ws.cell(row = 1,column = col_idx + 1, value = col)
    # col.style = Font(b = True)

for index, row in dat_other.iterrows():
    r_idx = index + 2
    short_row = row[1:9]
    for c_idx,cell in enumerate(short_row,1):
        ws.cell(row = r_idx, column = c_idx, value = cell)
        
    filename = str(row['filename'])
    if not pd.isnull(row['filename']):
        key_string = str(row['Key'])
        key_path = f'{base_path}/Other/Recordings/{filename}.mp3'
        if not os.path.exists(key_path):
            s3_client.download_file(
                Bucket = bucket_name,
                Key = key_string,
                Filename = key_path
            )

        media_file_path = os.path.abspath(key_path)
        file_url = f'file:///{media_file_path.replace("\\", "/")}'

        # Assign the display text and hyperlink
        cell = ws.cell(row=r_idx, column=9, value=filename)
        cell.hyperlink = file_url
        cell.style = "Hyperlink"

wb.save(excel_filename)