In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import ast

# --- Load inputs -----------------------------------------------------------
# Presumably the Turkish translations of the quality-checked English rows
# (one translated row per kept English row, in the same order) -- TODO confirm.
tr_df = pd.read_csv("translated_output_turkish2 (3).csv")
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it was produced locally or comes from a trusted source.
raw_en_df = pd.read_pickle("test_data_generated.pkl")
test_set_quality_results = pd.read_json("test_data_control_results.json", lines=True)

# The CSV stores 'reference_contexts' as the text repr of a Python literal;
# turn it back into the actual Python object.
tr_df['reference_contexts'] = tr_df['reference_contexts'].apply(ast.literal_eval)

# --- Keep only the English rows the quality check accepted -----------------
# 'Answer' is expected to look like "(<word>, <details>)"; the first capture
# group becomes 'Correctness', the second 'Details'. Rows that do not match
# the pattern get NaN in both columns and are therefore filtered out below.
test_set_quality_results[['Correctness', 'Details']] = test_set_quality_results['Answer'].str.extract(r'\((\w+),\s*(.*)\)')

# Index labels (into raw_en_df) of the rows judged "Correct".
correct_indices = test_set_quality_results.loc[test_set_quality_results['Correctness'] == "Correct", 'Index_of_df']

# Select those rows and renumber them 0..n-1 so they line up with tr_df.
filtered_en_df = raw_en_df.loc[correct_indices.to_list()].reset_index(drop=True)

# --- Attach metadata and merge ---------------------------------------------
# The column copies below pair rows purely by index label. If the two frames
# differ in length the assignment would silently produce NaN or mis-paired
# metadata, so fail fast with a clear message instead.
if len(tr_df) != len(filtered_en_df):
    raise ValueError(
        "Row count mismatch: the Turkish translations have "
        f"{len(tr_df)} rows but the filtered English set has "
        f"{len(filtered_en_df)}; cannot copy per-row metadata."
    )

tr_df["filename"] = filtered_en_df["filename"]
tr_df["synthesizer_name"] = filtered_en_df["synthesizer_name"]
tr_df["language"] = "Turkish"
filtered_en_df["language"] = "English"

# Stack the English rows followed by the Turkish rows under a fresh 0..n-1 index.
final_test_set = pd.concat([filtered_en_df, tr_df], ignore_index=True)



In [2]:
# Down-sample to a fixed number of rows while stratifying on the
# (language, source file) combination.
rows_to_keep = 134
grouped_set = final_test_set.assign(
    group=final_test_set['language'] + "_" + final_test_set['filename']
)

# train_test_split serves purely as a stratified sampler here: the first
# ("train") part is the sample we keep, the remainder is discarded.
stratified_sample, _ = train_test_split(
    grouped_set,
    test_size=len(grouped_set) - rows_to_keep,
    stratify=grouped_set['group'],
    random_state=42,
)

# The helper column has done its job; remove it and renumber the rows.
final_test_set = stratified_sample.drop(columns='group').reset_index(drop=True)


In [3]:
# Derive a topic label from each filename by keeping the text before the
# first ".xlsx", then expose the column under the name "topics".
topic_labels = final_test_set["filename"].map(lambda name: name.split(".xlsx")[0])
final_test_set = (
    final_test_set
    .assign(filename=topic_labels)
    .rename(columns={"filename": "topics"})
)

In [4]:
# Show the sampled test set as the cell output (pandas elides the middle rows).
final_test_set

Unnamed: 0,topics,user_input,reference_contexts,reference,synthesizer_name,language
0,sports_club,ODTÜ HAT nedir ve ne zaman kuruldu?,[Sports Club Name: ODTÜ HAVACILIK TOPLULUĞU YA...,"ODTÜ HAT nedir? \nODTÜ HAT, ODTÜ Havacılık To...",single_hop_specifc_query_synthesizer,Turkish
1,sports_club,ODTÜ kulübü ve faaliyetleri hakkında daha fazl...,[Sports Club Name: ODTÜ DENİZCİLİK VE YELKEN T...,Spor Kulübü Adı: ODTÜ DENİZCİLİK VE YELKEN TOP...,single_hop_specifc_query_synthesizer,Turkish
2,accomadation,YKS Kayıt Programı kullanarak yurt başvurusu n...,[Application for Dormitories: Students whose f...,Ankara dışında ailesi yaşayan öğrenciler yurtl...,single_hop_specifc_query_synthesizer,Turkish
3,courses,What are the key architectural developments an...,[abea63f4-85ac-4646-ab2c-b2cbc87fdff2\n\nCours...,The ARCH112 course provides a concise survey o...,single_hop_specifc_query_synthesizer,English
4,sports_club,"As the Underwater Research Coordinator, I am i...",[303063e1-3f56-4b2a-aa0d-41b194cbde90\n\nSport...,"The ODTÜ Aikido Topluluğu, through its Childre...",single_hop_specifc_query_synthesizer,English
...,...,...,...,...,...,...
129,student_club,KONTEMPORARY DANS GRUBU için kulüp odası nerede?,[Student Club Name: CONTEMPORARY DANCE GROUP\n...,"GÜNCEL DANS GRUBU için kulüp odası 9. Yurt, Ze...",single_hop_specifc_query_synthesizer,Turkish
130,faq,Yurt Bilgi Teknolojileri Destek Koordinatörü o...,[FAQ Question: How can I reach to department c...,"Departman koordinatörünüze ulaşmak için, aşağı...",multi_hop_specific_query_synthesizer,Turkish
131,courses,MİMARLIK BÖLÜMÜ'nün tasarım kavramlarının tanı...,[Course: ARCH201\nDepartment: This course belo...,"MİMARLIK BÖLÜMÜ, tasarım kavramları ile binala...",single_hop_specifc_query_synthesizer,Turkish
132,faq,"As the Dormitory IT Support Coordinator, I oft...",[04024953-33e8-45d8-8493-af6f27671cbb\n\nFAQ Q...,The METU VPN Service is essential for students...,multi_hop_specific_query_synthesizer,English


In [6]:
# Remove the row at index 120: its text was found to be in Turkish although
# it is supposed to be English.
# NOTE(review): the label refers to the frame as it is right now; because the
# index is renumbered afterwards, re-running this cell on its own removes a
# different row. Run it exactly once after the sampling cell.
mislabelled_row = 120
final_test_set = final_test_set.drop(index=[mislabelled_row]).reset_index(drop=True)

In [7]:
# Persist the finished test set as a pickle file in the working directory.
output_path = "final_test_data_set.pkl"
final_test_set.to_pickle(output_path)