In [1]:
# Mount Google Drive at /content/drive; every data path used below lives under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd

# Define the base directory
base_path = '/content/drive/MyDrive/PhD/Features extraction/data'

# Collect every CSV located exactly two folder levels below base_path, i.e.
# base_path/<top folder>/<sub folder>/<file>.csv.
# The frames are gathered in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies all rows accumulated so far for each file.
# Each directory listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the row order differ between runs.
frames = []
for top_folder_name in sorted(os.listdir(base_path)):
    top_folder_path = os.path.join(base_path, top_folder_name)

    # Skip anything at the top level that is not a folder
    if not os.path.isdir(top_folder_path):
        continue

    for sub_folder_name in sorted(os.listdir(top_folder_path)):
        sub_folder_path = os.path.join(top_folder_path, sub_folder_name)

        # Skip anything at the second level that is not a folder
        if not os.path.isdir(sub_folder_path):
            continue

        for file_name in sorted(os.listdir(sub_folder_path)):
            # Only CSV files are loaded; everything else is ignored
            if not file_name.endswith('.csv'):
                continue
            file_path = os.path.join(sub_folder_path, file_name)
            df = pd.read_csv(file_path)
            frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# no CSV file was found (the same result the incremental version produced).
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Now, 'data' contains the concatenated data from all CSV files in the specified directory structure
data

In [3]:
import pandas as pd

# Load the word-level table (one row per word or word group, with speaker,
# emotion, time, position and target-sentence id columns).
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column — presumably
# an index written by an earlier save; confirm whether it should be dropped.
data = pd.read_csv('/content/drive/MyDrive/PhD/Linear regression/concatenated_data.csv')
data

Unnamed: 0,word,speaker,emotion,time,position,taget sentence,speaker gender,length
0,imamo,1,0,0.6,b,25,f,5.0
1,još,1,0,0.1,m,25,f,3.0
2,četiri,1,0,0.2,e,25,f,6.0
3,minuta,1,0,0.8,e,25,f,6.0
4,vremena do,1,0,0.3,e,25,f,9.0
...,...,...,...,...,...,...,...,...
75781,dugo,1052,4,1.1,b,40,m,4.0
75782,vremena,1052,4,0.6,m,40,m,7.0
75783,nismo,1052,4,0.3,e,40,m,5.0
75784,ovde,1052,4,0.5,e,40,m,4.0


In [4]:
# Load the per-word reference tables that lookup_features reads from.
# Each table is looked up by its 'Sentence' and 'Word' columns; the value
# columns used later are 'Log Probability' (freq_df), 'Surprisal GPT-2'
# (surprisal_gpt) and 'Surprisal BERT' (surprisal_bert).
freq_df = pd.read_csv('/content/drive/MyDrive/PhD/Surprisal estimation/wordlist_frequencies.csv')
surprisal_gpt = pd.read_csv('/content/drive/MyDrive/PhD/Surprisal estimation/word_surprisals_gpt2.csv')
surprisal_bert = pd.read_csv('/content/drive/MyDrive/PhD/Surprisal estimation/word_surprisals_bert.csv')

In [40]:
# Display the frequency table to check its columns and sentence numbering.
freq_df

Unnamed: 0,Sentence,Word,Lemma,Word Frequency,Log Probability
0,0,još,još,79420,-7.325636
1,0,malo,malo,46780,-7.854931
2,0,pa,pa,54554,-7.701195
3,0,će,hteti,80522,-7.311856
4,0,izbori,izbor,25014,-8.480951
...,...,...,...,...,...
452,65,doček,doček,347,-12.758817
453,65,nove,nov,162134,-6.611963
454,65,godine,godina,1038137,-4.755203
455,66,nadam,nadati,1992,-11.011247


In [30]:
def lookup_features(data, freq_df, column_name):
    """Sum a per-word feature over the words of every row of ``data``.

    Each row of ``data`` holds one or more space-separated words in its
    'word' column and a sentence id in its 'taget sentence' column (the
    column name is spelled that way in the data). For every word, the value
    in ``freq_df[column_name]`` is looked up on the reference row whose
    'Sentence' equals the row's sentence id and whose 'Word' equals the word.

    When a word occurs several times in a reference sentence, the k-th time
    it is seen picks the k-th matching reference row. The "seen" bookkeeping
    is reset when the sentence id changes between consecutive rows, and also
    once as many words have been seen as the reference sentence has rows, so
    that the same sentence repeated back-to-back starts counting afresh.

    If a word is seen more often than it occurs in the reference sentence
    (e.g. an incomplete utterance followed by the same sentence again), the
    occurrence index wraps around instead of raising ``IndexError``.

    Words with no matching reference row contribute 0 and are reported on
    stdout ('error' followed by the word).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'word' (strings) and 'taget sentence'.
        It is not modified.
    freq_df : pandas.DataFrame
        Reference table with the columns 'Sentence', 'Word' and
        ``column_name``.
    column_name : str
        Name of the column of ``freq_df`` whose values are summed.

    Returns
    -------
    list
        One summed value per row of ``data``, in row order.
    """
    # Index the reference table once instead of filtering the whole frame
    # for every single word: number of rows per sentence, and the ordered
    # list of values for each (sentence, word) pair.
    sentence_sizes = {}
    values_by_key = {}
    for ref_sentence, ref_word, ref_value in zip(
        freq_df['Sentence'], freq_df['Word'], freq_df[column_name].to_numpy()
    ):
        sentence_sizes[ref_sentence] = sentence_sizes.get(ref_sentence, 0) + 1
        values_by_key.setdefault((ref_sentence, ref_word), []).append(ref_value)

    feature_values = []
    current_sentence = None
    seen_counts = {}  # word -> times seen in the current pass over the sentence
    seen_total = 0    # words seen in the current pass over the sentence

    for cell, sentence in zip(data['word'], data['taget sentence']):
        # A new sentence id starts a fresh pass
        if sentence != current_sentence:
            current_sentence = sentence
            seen_counts = {}
            seen_total = 0

        row_value = 0
        for word in cell.split(' '):
            occurrences = values_by_key.get((sentence, word))
            if occurrences:
                # Pick the occurrence matching how often the word was already
                # seen; wrap around if it was seen more often than it occurs.
                row_value += occurrences[seen_counts.get(word, 0) % len(occurrences)]
            else:
                print('error')
                print(word)

            seen_counts[word] = seen_counts.get(word, 0) + 1
            seen_total += 1
            # avoid situation when two same sentences are one after another
            if seen_total == sentence_sizes.get(sentence, 0):
                seen_counts = {}
                seen_total = 0

        feature_values.append(row_value)

    return feature_values

In [31]:
# Sum the 'Surprisal GPT-2' values of surprisal_gpt over the words of each row
# and store the result as the 'surprisal GPT' column.
surprisal_gpt_list = lookup_features(data, surprisal_gpt, 'Surprisal GPT-2')
data['surprisal GPT'] = surprisal_gpt_list

In [36]:
# Sum the 'Surprisal BERT' values of surprisal_bert over the words of each row
# and store the result as the 'surprisal BERT' column.
surprisal_bert_list = lookup_features(data, surprisal_bert, 'Surprisal BERT')
data['surprisal BERT'] = surprisal_bert_list

In [37]:
# NOTE(review): this cell is identical to the preceding BERT cell; running it
# repeats the full lookup and overwrites 'surprisal BERT' with the same inputs.
# It looks like a leftover re-run — confirm and remove.
surprisal_bert_list = lookup_features(data, surprisal_bert, 'Surprisal BERT')
data['surprisal BERT'] = surprisal_bert_list

In [41]:
# Sum the 'Log Probability' values of freq_df over the words of each row and
# store the result as the 'log probability' column.
log_probab_list = lookup_features(data, freq_df, 'Log Probability')
data['log probability'] = log_probab_list

In [42]:
# Display the table with the three added columns
# ('surprisal GPT', 'surprisal BERT', 'log probability').
data

Unnamed: 0,word,speaker,emotion,time,position,taget sentence,speaker gender,length,surprisal GPT,surprisal BERT,log probability
0,imamo,1,0,0.6,b,25,f,5.0,19.857792,3.380102,-6.051349
1,još,1,0,0.1,m,25,f,3.0,31.651057,0.000163,-7.325636
2,četiri,1,0,0.2,e,25,f,6.0,50.446848,0.000199,-8.268886
3,minuta,1,0,0.8,e,25,f,6.0,22.568617,0.000554,-9.569658
4,vremena do,1,0,0.3,e,25,f,9.0,32.872915,0.000472,-13.097782
...,...,...,...,...,...,...,...,...,...,...,...
75781,dugo,1052,4,1.1,b,40,m,4.0,19.776289,2.703350,-9.523365
75782,vremena,1052,4,0.6,m,40,m,7.0,40.858231,0.000097,-7.027754
75783,nismo,1052,4,0.3,e,40,m,5.0,22.771593,0.067076,-3.028636
75784,ovde,1052,4,0.5,e,40,m,4.0,18.642887,0.592845,-9.364560


In [44]:
# Save the data, including the added surprisal and log-probability columns,
# to a CSV file without writing the DataFrame index.
# NOTE(review): this is the same path the notebook loads `data` from, so the
# input file is overwritten in place — confirm this is intended.
output_csv_path = '/content/drive/MyDrive/PhD/Linear regression/concatenated_data.csv'
data.to_csv(output_csv_path, index=False)