In [137]:
# import libraries
from transformers import pipeline, Pipeline
import numpy as np
import pandas as pd
from typing import Callable
import os


# function to import classifier from HF
def get_classifier(
    model: str = "facebook/bart-large-mnli",
    revision: str = "d7645e1",
    device=None,
) -> Pipeline:
    """Build the Hugging Face zero-shot-classification pipeline.

    The checkpoint is pinned explicitly instead of relying on the library
    default, so an upgrade of ``transformers`` cannot silently swap the model.
    The default values are the model and revision the library reported when
    this pipeline was created without arguments.

    Args:
        model: Hub id of the checkpoint to load.
        revision: Hub revision (commit, tag or branch) of that checkpoint.
        device: Forwarded to ``pipeline``; ``None`` keeps the library default
            placement. Pass e.g. ``0`` to run on the first GPU.

    Returns:
        The ready-to-call classification pipeline.
    """
    return pipeline(
        "zero-shot-classification",
        model=model,
        revision=revision,
        device=device,
    )

def concat_sentences(df: pd.DataFrame) -> pd.DataFrame:
    """Merge consecutive transcript rows into one row per sentence.

    A row opens a new sentence when its speaker differs from the previous
    row's speaker AND its ``stop_time`` minus the previous row's
    ``start_time`` is at least 0.5. Every other row is appended, separated by
    a single space, to the sentence currently being built.

    Args:
        df: Transcript with ``speaker``, ``start_time``, ``stop_time`` and
            ``value`` columns. The frame is not modified and may carry any
            index; rows are processed in their positional order.

    Returns:
        A new frame with a fresh 0..n-1 index holding one row per sentence.
        Each row is the last row of its run (so ``speaker``, ``start_time``
        and ``stop_time`` are those of that last row), with ``value`` replaced
        by a ``sentence`` column containing the joined text. An empty input
        yields an empty frame with the same columns.
    """
    rows = df.reset_index(drop=True)
    speakers = rows["speaker"].tolist()
    starts = rows["start_time"].tolist()
    stops = rows["stop_time"].tolist()
    values = rows["value"].tolist()

    last_positions = []  # position of the closing row of every sentence
    sentences = []       # joined text of every sentence, same order
    parts = []           # pieces of the sentence currently being built
    for i, value in enumerate(values):
        # NOTE(review): this compares the current stop with the previous
        # *start*; a pause between turns would be start[i] - stop[i-1].
        # Kept as written - confirm which one is intended.
        starts_new_sentence = (
            i > 0
            and speakers[i] != speakers[i - 1]
            and float(stops[i]) - float(starts[i - 1]) >= 0.5
        )
        if starts_new_sentence:
            last_positions.append(i - 1)
            sentences.append(" ".join(parts))
            parts = []
        parts.append(str(value))
    if parts:
        last_positions.append(len(values) - 1)
        sentences.append(" ".join(parts))

    merged = rows.iloc[last_positions].drop(columns=["value"]).reset_index(drop=True)
    merged["sentence"] = pd.Series(sentences, index=merged.index, dtype=object)
    return merged



# read out data, can be modified to specify which data to read in
def get_data(dirname) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load every transcript file in ``dirname`` and split the rows by speaker.

    Each regular file in the directory is read as a tab-separated table,
    collapsed to one row per sentence with ``concat_sentences`` and appended
    to a single frame.

    Args:
        dirname: Directory containing the transcript files.

    Returns:
        ``(df, df_pat, df_ther)``: all sentences, the rows whose ``speaker``
        is ``"Participant"``, and all remaining rows. The two subsets are
        independent copies, so adding columns to them does not touch ``df``.

    Raises:
        ValueError: If the directory contains no files to read.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the combined frame is reproducible between runs.
    for file in sorted(os.listdir(dirname)):
        path = os.path.join(dirname, file)
        if not os.path.isfile(path):
            # Skip sub-directories (e.g. .ipynb_checkpoints) instead of
            # handing them to read_csv.
            continue
        frames.append(concat_sentences(pd.read_csv(path, delimiter="\t")))

    if not frames:
        raise ValueError(f"no transcript files found in {dirname!r}")

    # Concatenate once; growing a frame inside the loop is quadratic.
    df = pd.concat(frames, axis=0, ignore_index=True)
    is_participant = df["speaker"] == "Participant"
    df_pat = df.loc[is_participant].copy()
    df_ther = df.loc[~is_participant].copy()

    return df, df_pat, df_ther

# function to create the get_scores function, and automatically pass the classifier to it when it gets used.
def get_scores_func_creator(
    clfr: Pipeline,
    candidate_labels: tuple[str, ...] = ("happy", "unhappy", "neutral"),
) -> Callable[[str], pd.Series]:
    """Bind a classifier into a one-argument scoring function.

    Args:
        clfr: Callable classifier. It is invoked as
            ``clfr(text, candidate_labels=[...])`` and must return a mapping
            with ``"labels"`` and ``"scores"`` sequences.
        candidate_labels: Labels offered to the classifier. The default keeps
            the original happy / unhappy / neutral set.

    Returns:
        ``get_scores(text)``, which classifies a single text and returns a
        Series with ``sent`` (first returned label) and ``score`` (first
        returned score), suitable for ``Series.apply``.
    """
    labels = tuple(candidate_labels)

    def get_scores(col: str) -> pd.Series:
        # A fresh list per call, exactly as the original literal provided.
        result = clfr(col, candidate_labels=list(labels))
        # Only the first label/score pair is kept.
        # NOTE(review): assumes the classifier orders them best-first - confirm.
        return pd.Series({'sent': result["labels"][0], 'score': result["scores"][0]}) #type:ignore
    return get_scores

# function to use the get_scores function to classify the texts
def classify_texts(df: pd.DataFrame, get_scores: Callable[[str], pd.Series]) -> pd.DataFrame:
    """Classify every non-empty sentence and attach the label and score.

    Args:
        df: Frame with a ``sentence`` column. It is not modified.
        get_scores: Function applied to each sentence; it must return a
            Series with two entries (label, score).

    Returns:
        A copy of the rows whose ``sentence`` is not the empty string, with
        added ``sent`` and ``score`` columns. When no such row exists the
        result is an empty frame that still carries both columns.
    """
    # Copy the filtered rows: adding columns to a slice of ``df`` is chained
    # assignment and triggers pandas' SettingWithCopy warning.
    non_empty = df.loc[df["sentence"] != ''].copy()

    if non_empty.empty:
        # apply() on an empty Series yields a Series, which cannot be
        # unpacked into two columns; create the columns directly instead.
        non_empty["sent"] = pd.Series(index=non_empty.index, dtype=object)
        non_empty["score"] = pd.Series(index=non_empty.index, dtype=float)
        return non_empty

    non_empty[['sent', 'score']] = non_empty['sentence'].apply(get_scores)

    return non_empty

# function to alter the sentiments if the confidence is at or below the threshold
def adjust_classifications(df: pd.DataFrame, treshold: float) -> pd.DataFrame:
    """Add a ``modified_sent`` column that softens low-confidence labels.

    Rows whose ``score`` is less than or equal to ``treshold`` have a
    ``happy`` or ``unhappy`` label turned into ``neutral``; every other row
    keeps its ``sent`` value. The column is written into ``df`` itself and
    the same object is returned.
    """
    is_uncertain = df["score"].le(treshold)
    downgraded = df.loc[is_uncertain, "sent"].replace(['happy', 'unhappy'], 'neutral')
    # Assigning the partial Series leaves NaN on the confident rows ...
    df["modified_sent"] = downgraded
    # ... which are then filled with their original label.
    df["modified_sent"] = df["modified_sent"].fillna(df["sent"])
    return df


In [32]:
# Instantiate the zero-shot classification pipeline. No model or device is
# passed, so the defaults reported in the output below apply.
# NOTE(review): the variable is spelled `classfier`; other cells must use the
# same spelling.
classfier = get_classifier()

No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [138]:
# Load and merge all transcripts from the relative transcripts folder:
# df = every sentence, df_pat = rows spoken by "Participant", df_ther = all other rows.
df, df_pat, df_ther  = get_data('../THERAVATARS_NN/transcripts')
# Display the combined frame.
df

Unnamed: 0,start_time,stop_time,speaker,sentence
0,12.170,13.010,Participant,<synch>
1,32.564,54.104,Ellie,IntroV4Confirmation (hi i'm ellie thanks for c...
2,54.180,54.720,Participant,yes
3,58.401,60.671,Ellie,okay_confirm (okay) how_doingV (so how are you...
4,63.015,64.795,Participant,spectacular this is an interesting experiment
...,...,...,...,...
21353,1245.181,1245.721,Participant,thank you
21354,1249.941,1251.381,Ellie,asked_everything (okay i think i have asked ev...
21355,1251.622,1252.122,Participant,thank you
21356,1252.602,1253.212,Ellie,bye (goodbye)


In [95]:
# Scratch check: try to replace digit runs in stop_time with a marker string.
# NOTE(review): the output below is unchanged, which suggests the pattern is
# being matched literally - presumably `regex=True` is required by the
# installed pandas; confirm.
# NOTE(review): the output shows 47400 string-typed rows, unlike the frame
# displayed by the loading cell - this cell looks stale.
df['stop_time'].str.replace(r'\d*', r'hallo')

0         39668
1         43378
2         48498
3         52388
4         58958
          ...  
47395    888814
47396    890953
47397    892368
47398    892940
47399    894680
Name: stop_time, Length: 47400, dtype: object

In [94]:
# Scratch check: same experiment with `\d+` and the replacement '1'.
# NOTE(review): the output is again unchanged - presumably the pattern is
# treated as a literal string unless `regex=True` is passed; confirm.
df['stop_time'].str.replace(r'\d+', r'1')

0         39668
1         43378
2         48498
3         52388
4         58958
          ...  
47395    888814
47396    890953
47397    892368
47398    892940
47399    894680
Name: stop_time, Length: 47400, dtype: object

In [141]:
# Inspect the stop/start times of the first 1000 participant rows.
df_pat[['stop_time','start_time']][:1000]

Unnamed: 0,stop_time,start_time
0,13.010,12.170
2,54.720,54.180
4,64.795,63.015
6,68.070,67.680
8,73.680,70.990
...,...,...
1972,158.310,150.830
1974,160.120,159.800
1976,171.030,168.010
1978,180.530,178.870


In [142]:
# Preview the first 500 rows whose sentence is not the empty string
# (the same filter classify_texts applies before scoring).
df.loc[df["sentence"] != ''][:500]

Unnamed: 0,start_time,stop_time,speaker,sentence
0,12.170,13.010,Participant,<synch>
1,32.564,54.104,Ellie,IntroV4Confirmation (hi i'm ellie thanks for c...
2,54.180,54.720,Participant,yes
3,58.401,60.671,Ellie,okay_confirm (okay) how_doingV (so how are you...
4,63.015,64.795,Participant,spectacular this is an interesting experiment
...,...,...,...,...
495,610.350,611.030,Participant,wow where do i start i have plenty of uh bad d...
496,616.477,617.807,Ellie,give_example (can you give me an example of that)
497,620.920,622.680,Participant,um yeah well i have a d_u_i on my record
498,626.188,627.128,Ellie,difficult (how hard is that)


In [128]:
# Read one raw, tab-separated transcript to see the rows before they are merged
# (it still has the `value` column).
# NOTE(review): absolute, machine-specific path - the loading cell uses a
# relative path for the same folder name; consider aligning them.
pd.read_csv('/root/workspace/THERAVATARS_NN/transcripts/479_TRANSCRIPT.csv', delimiter = '\t')

Unnamed: 0,start_time,stop_time,speaker,value
0,11.429,12.289,Participant,<synch>
1,31.877,53.417,Ellie,IntroV4Confirmation (hi i'm ellie thanks for c...
2,54.272,54.682,Participant,yes
3,55.828,56.288,Ellie,okay_confirm (okay)
4,57.815,60.085,Ellie,how_doingV (so how are you doing today)
...,...,...,...,...
243,866.859,867.499,Ellie,thats_good (that's good)
244,867.461,867.731,Participant,yeah
245,870.581,873.021,Ellie,asked_everything (okay i think i have asked ev...
246,873.757,875.197,Ellie,appreciate_open (thanks for sharing your thoug...
