In [1]:
# Mount Google Drive at /content/drive; every data path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [3]:
import os
import pandas as pd
import warnings
from fuzzywuzzy import fuzz  # Make sure to import the 'fuzz' module from the 'fuzzywuzzy' library



In [4]:
# Load the reference sentences; find_target_sentence matches transcripts
# against the 'Text' column of this frame.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# suggests the CSV was saved together with its index -- confirm, and consider
# reading it with index_col=0 if that column is not needed.
target_sentences_df = pd.read_csv('/content/drive/MyDrive/PhD/Transcript - correct/target_sentences.csv')
target_sentences_df

Unnamed: 0,Text
0,još malo pa će izbori
1,naoblačilo se kao da će padati kiša
2,naša kola su stara 10 godina
3,otišao je u inostranstvo prošle godine i još s...
4,telefoni zvone po ceo dan i tebe traže
...,...
62,pojavio se u poslednjem trenutku
63,sanjao sam te pre neku noć
64,premijer je obećao viši standard
65,sutra je doček Nove Godine


In [5]:
# Build speaker_gender_df, the speaker -> gender lookup table used by get_gender.
file_path = '/content/drive/MyDrive/PhD/Transcript - correct/speaker_gender.txt'

try:
    # Read the text file into a single column named 'combined'
    # (presumably the file contains no tabs, so each line stays whole -- TODO confirm)
    speaker_gender_df = pd.read_csv(file_path, delimiter='\t', header=None, names=['combined'])

    # Split each line into a run of digits (speaker) and a following 'm' or 'f' (gender);
    # lines that do not match the pattern yield NaN in both new columns
    speaker_gender_df[['speaker', 'gender']] = speaker_gender_df['combined'].str.extract(r'(\d+)\s*([mf])')

    # Drop the original combined column now that it has been split
    speaker_gender_df = speaker_gender_df.drop('combined', axis=1)

    print(speaker_gender_df)
# NOTE(review): every handler below only prints a message, so after a failure
# speaker_gender_df may be left undefined and later cells that call get_gender
# will fail with a NameError instead of the real cause.
except FileNotFoundError:
    print(f"The file at {file_path} could not be found.")
except pd.errors.EmptyDataError:
    print(f"The file at {file_path} is empty.")
except pd.errors.ParserError as pe:
    print(f"Error parsing the file at {file_path}: {pe}")
except Exception as e:
    print(f"An error occurred: {e}")

   speaker gender
0     0001      f
1     0002      f
2     0003      m
3     0005      f
4     0006      m
5     1003      m
6     1004      m
7     1005      m
8     1006      f
9     1007      f
10    1008      f
11    1009      f
12    1010      m
13    1011      f
14    1012      f
15    1013      m
16    1014      f
17    1015      f
18    1016      m
19    1017      f
20    1019      f
21    1020      m
22    1021      f
23    1023      m
24    1024      f
25    1025      f
26    1026      f
27    1027      m
28    1028      m
29    1030      f
30    1031      m
31    1032      m
32    1033      f
33    1034      f
34    1036      f
35    1037      f
36    1038      m
37    1039      f
38    1040      f
39    1043      m
40    1044      m
41    1045      f
42    1046      m
43    1047      f
44    1049      f
45    1050      m
46    1051      m
47    1052      m
48    1053      m
49    1054      m
50    1057      f
51    1058      m
52    1060      m
53    1061      m
54    1062

In [6]:
def find_target_sentence(sentence, df = target_sentences_df):
    """Return the index label of the row in ``df`` whose 'Text' best matches ``sentence``.

    Each 'Text' value is scored against ``sentence`` with ``fuzz.ratio``.
    Only a strictly higher score replaces the current best, so the first row
    wins ties. If no row scores above 0, -1 is returned.
    """
    best_index = -1  # sentinel: no row has scored above 0 yet
    best_score = 0

    for row_index, text in df['Text'].items():
        score = fuzz.ratio(sentence, text)
        if score > best_score:
            best_index, best_score = row_index, score

    return best_index

In [7]:
def get_word_position(word_count):
    """Translate a word count into a sentence-position code.

    Returns 'b' (beginning) when ``word_count`` is 1, 'm' (middle) when it
    is 2, and 'e' (end) for every other value.
    """
    if word_count == 1:
        return 'b'
    if word_count == 2:
        return 'm'
    return 'e'

In [8]:
def process_txt_file(file_path):
    """Parse one transcript TXT file into a list of per-word records.

    The text after the first 12 characters of the first line is stripped and
    matched against the reference sentences with ``find_target_sentence``.
    Every line that starts with 'Word' is split on ', ' into three
    'label: value' parts holding the word, its start time and its end time.

    Args:
        file_path: Path of the transcript file to read.

    Returns:
        A list of dicts, one per 'Word' line in file order, with the keys
        'word' (str), 'start' (float), 'end' (float), 'position' (code from
        ``get_word_position`` for the 1-based position of the line among the
        'Word' lines) and 'target sentence' (index returned by
        ``find_target_sentence``).

    Raises:
        IndexError, ValueError: If a 'Word' line does not have the expected
            'label: value, label: value, label: value' layout.
    """
    # The transcripts contain non-ASCII characters, so decode explicitly as
    # UTF-8 instead of relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().split('\n')

    # NOTE(review): the 12-character offset presumably skips a fixed label in
    # front of the sentence on the first line -- confirm against the file format.
    index = find_target_sentence(lines[0][12:].strip())

    word_info_list = []
    for line in lines:
        if not line.startswith('Word'):
            continue

        parts = line.split(', ')
        word_info_list.append({
            'word': parts[0].split(': ')[1],
            'start': float(parts[1].split(': ')[1]),
            'end': float(parts[2].split(': ')[1]),
            # 1-based count of 'Word' lines seen so far, including this one
            'position': get_word_position(len(word_info_list) + 1),
            'target sentence': index,
        })

    return word_info_list

In [9]:
def calculate_word_length(word):
  """Return the total length of the pieces of ``word`` separated by ' '.

  This equals the number of characters in ``word`` that are not the space
  character.
  """
  return sum(len(piece) for piece in word.split(' '))

In [27]:
def get_gender(speaker):
  """Look up the 'gender' value stored for ``speaker`` in ``speaker_gender_df``.

  Returns the value from the first row whose 'speaker' equals ``speaker``;
  raises IndexError when no row matches.
  """
  is_speaker = speaker_gender_df['speaker'] == speaker
  matching_genders = speaker_gender_df.loc[is_speaker, 'gender']
  return matching_genders.values[0]

In [32]:
def process_directory(directory_path, output_path):
    """Collect per-word features for one speaker/emotion folder and save them as CSV.

    The emotion label is the last component of ``directory_path`` and the
    speaker id is the first four characters of the parent folder's name.
    Every ``.txt`` file in the folder is parsed with ``process_txt_file`` and
    each returned word becomes one row of the output table.

    The table is written to
    ``<output_path>/<speaker>/<emotion>/<speaker>_<emotion>.csv``; missing
    folders are created. A folder without any words produces a header-only CSV.

    Args:
        directory_path: Folder holding the transcript ``.txt`` files.
        output_path: Root folder under which the CSV is stored.

    Returns:
        None.

    Raises:
        IndexError: Propagated from ``get_gender`` when words were found but
            the speaker has no row in the speaker/gender table.
    """
    # Column order of the CSV. NOTE(review): 'taget sentence' (sic) is kept
    # as-is because it is the column name written to the output files;
    # renaming it would change the CSV schema.
    columns = ['word', 'speaker', 'emotion', 'time', 'position', 'taget sentence',
               'speaker gender', 'length']

    # Both labels depend only on the folder path, so derive them once up front;
    # this also keeps them defined when the folder is empty.
    parent_path, emotion = os.path.split(directory_path)
    speaker = os.path.basename(parent_path)[:4]

    # Silence warnings only for the duration of this call and restore the
    # caller's warning filters afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Gather the word records of every transcript in the folder
        word_infos = []
        for filename in os.listdir(directory_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(directory_path, filename)
                word_infos.extend(process_txt_file(file_path))

        # One lookup per folder; skipped when there is nothing to annotate
        gender = get_gender(speaker) if word_infos else None

        # Build all rows first and create the DataFrame once, instead of
        # growing it row by row
        records = [
            {
                'word': word_info['word'],
                'speaker': str(speaker),
                'speaker gender': gender,
                'emotion': emotion,
                'length': calculate_word_length(word_info['word']),
                'time': float(word_info['end']) - float(word_info['start']),
                'position': word_info['position'],
                'taget sentence': word_info['target sentence'],
            }
            for word_info in word_infos
        ]
        df = pd.DataFrame(records, columns=columns)

        # Create speaker and emotion folders if they don't exist
        speaker_folder = os.path.join(output_path, speaker)
        emotion_folder = os.path.join(speaker_folder, emotion)
        os.makedirs(emotion_folder, exist_ok=True)

        # Save the DataFrame as a CSV file in the emotion folder
        print('Output folder: ')
        print(emotion_folder)
        output_file_path = os.path.join(emotion_folder, speaker + '_' + emotion + '.csv')
        df.to_csv(output_file_path, index=False)

    return


In [33]:
directory_path = '/content/drive/MyDrive/PhD/Transcript - correct/transcript_corrected'
output_directory = '/content/drive/MyDrive/PhD/Features extraction/data'

# Walk two levels below the main directory and hand every second-level
# folder to process_directory
for folder_name in os.listdir(directory_path):
    folder_path = os.path.join(directory_path, folder_name)
    print('Processing: ')
    print(folder_path)

    # Plain files at the first level are reported above but not descended into
    if not os.path.isdir(folder_path):
        continue

    for subfolder_name in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder_name)

        # Only directories are processed; files at this level are ignored
        if os.path.isdir(subfolder_path):
            process_directory(subfolder_path, output_directory)

Processing: 
/content/drive/MyDrive/PhD/Transcript - correct/transcript_corrected/1052_Resampled/0
Processing: 
/content/drive/MyDrive/PhD/Transcript - correct/transcript_corrected/1052_Resampled/1
Processing: 
/content/drive/MyDrive/PhD/Transcript - correct/transcript_corrected/1052_Resampled/2
Processing: 
/content/drive/MyDrive/PhD/Transcript - correct/transcript_corrected/1052_Resampled/3
Processing: 
/content/drive/MyDrive/PhD/Transcript - correct/transcript_corrected/1052_Resampled/4
