In [37]:
import pandas as pd

# Load the CSV file and show its row count as the baseline for the steps below
raw_csv_path = 'src/overgeneration_data/train_cohere_rplus_jv.csv'
df = pd.read_csv(raw_csv_path)
len(df)

1077

In [38]:
# 1. Remove duplicates (rows that repeat an earlier row in every column)
df = df.drop_duplicates()
len(df)

1076

In [39]:
# Function to split premise into kalimat-1, kalimat-2, kalimat-3, kalimat-4
def split_premise(premise):
    """Split a premise into exactly four sentence strings.

    The text is cut at every period; each fragment is stripped of surrounding
    whitespace and fragments that end up empty are discarded. The periods
    themselves are not kept.

    Args:
        premise: The premise text. Any non-string value (e.g. NaN or None)
            is treated as having no sentences.

    Returns:
        A list of four strings: the first four sentences in order, with ''
        filling any slot for which the premise had no sentence. Sentences
        beyond the fourth are dropped.
    """
    n_slots = 4

    # Guard clause: non-string input yields four empty slots
    if not isinstance(premise, str):
        return [''] * n_slots

    fragments = []
    for piece in premise.split('.'):
        piece = piece.strip()
        if piece:
            fragments.append(piece)

    # Pad up to four entries (a no-op when there are already four or more),
    # then cut off anything beyond the fourth sentence
    fragments.extend([''] * (n_slots - len(fragments)))
    return fragments[:n_slots]

# 2. Apply split function to premise and create new columns for kalimat-1 to kalimat-4.
# The per-row lists are expanded with one DataFrame constructor call instead of
# `.apply(pd.Series)`, which creates a separate Series object for every row.
# Passing df.index keeps each row aligned with its source premise; naming the
# columns keeps the four-column assignment valid even when df has no rows.
kalimat_cols = ['kalimat_1', 'kalimat_2', 'kalimat_3', 'kalimat_4']
df[kalimat_cols] = pd.DataFrame(
    df['premise'].apply(split_premise).tolist(),
    index=df.index,
    columns=kalimat_cols,
)

df.head(2)

Unnamed: 0,topic,premise,correct_ending,incorrect_ending,kalimat_1,kalimat_2,kalimat_3,kalimat_4
0,Food,Joko lan Yanto arep tuku oleh-oleh. Dheweke ar...,Dheweke arep tuku oleh-oleh lan mangan ing war...,Dheweke ora entuk oleh-oleh lan mangan ing war...,Joko lan Yanto arep tuku oleh-oleh,Dheweke arep mangan ing warung,Warunge wis tutup,Dheweke ora entuk oleh-oleh
1,Food,Sinta seneng mangan tempe. Dheweke arep tuku t...,Sinta tuku tempe ing pasar.,Sinta ora mangan tempe.,Sinta seneng mangan tempe,Dheweke arep tuku tempe,Tempe sing diwarunge wis habis,Sinta ora entuk tempe


In [40]:
# Row count after the kalimat_* columns have been added
len(df)

1076

In [42]:
# 3. Inspect rows where any of the sentence columns is an empty string,
#    i.e. premises that yielded fewer than four sentences
kalimat_cols = ['kalimat_1', 'kalimat_2', 'kalimat_3', 'kalimat_4']
has_empty_slot = df[kalimat_cols].eq('').any(axis=1)
empty_sentences_df = df[has_empty_slot]

# Show a small sample, then how many rows are affected
display(empty_sentences_df[:3])
len(empty_sentences_df)

Unnamed: 0,topic,premise,correct_ending,incorrect_ending,kalimat_1,kalimat_2,kalimat_3,kalimat_4
15,Food,"Ninggal nggone, Ninggal njupuk gawean anyar in...","Ninggal seneng, dheweke entuk jamu sing enak l...","Ninggal ora gelem tuku jamu, dheweke mung tuku...","Ninggal nggone, Ninggal njupuk gawean anyar in...",Dheweke pengin njajal kuliner sing aneh-aneh,Ninggal nemokake warung sing jual jamu,
113,Wedding,"Ning dalem pendhapa, dheweke nggagas tangan la...",Dheweke ngucapke matur nuwun lan nggagas tanga...,Dheweke mung mesem lan ora ngucapke matur nuwun.,"Ning dalem pendhapa, dheweke nggagas tangan la...","Ning ngarep, dheweke ketemu karo wong tuwane",Dheweke ora ngerti yen wong tuwane wis nunggu,
286,Pregnancy and Kids,Bayu lan Ayu duwe anak lanang. Bayu lan Ayu pe...,Bayu lan Ayu lan keluarga nglakoni upacara lan...,Bayu lan Ayu ora ngucapke suwun marang Gusti.,Bayu lan Ayu duwe anak lanang,Bayu lan Ayu pengin nglakoni tradisi Jawa pask...,"Bayu lan Ayu nindakake upacara ""Tumpengan"" kan...",


12

In [32]:
# 4. Remove rows where any of the kalimat columns are empty strings,
#    keeping only rows whose four sentence slots are all filled
kalimat_cols = ['kalimat_1', 'kalimat_2', 'kalimat_3', 'kalimat_4']
all_slots_filled = df[kalimat_cols].ne('').all(axis=1)
df = df[all_slots_filled]

len(df)

1064

In [33]:
# 5. Drop the original 'premise' column as it's no longer needed.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError once 'premise' has already been removed.
df = df.drop(columns=['premise'], errors='ignore')

In [34]:
# Preview the first three rows of the frame after the steps above
df.head(3)

Unnamed: 0,topic,correct_ending,incorrect_ending,kalimat_1,kalimat_2,kalimat_3,kalimat_4
0,Food,Dheweke arep tuku oleh-oleh lan mangan ing war...,Dheweke ora entuk oleh-oleh lan mangan ing war...,Joko lan Yanto arep tuku oleh-oleh,Dheweke arep mangan ing warung,Warunge wis tutup,Dheweke ora entuk oleh-oleh
1,Food,Sinta tuku tempe ing pasar.,Sinta ora mangan tempe.,Sinta seneng mangan tempe,Dheweke arep tuku tempe,Tempe sing diwarunge wis habis,Sinta ora entuk tempe
2,Food,Dheweke tuku wedang ronde ing pasar.,Dheweke ora mangan wedang ronde.,Bayu lan Bagas arep tuku wedang ronde,Dheweke arep mangan ing warung,Warunge wis tutup,Dheweke ora entuk wedang ronde


In [35]:
# Write the cleaned frame to CSV without the index, with the topic first,
# then the four sentences, then the two endings
export_columns = [
    'topic',
    'kalimat_1', 'kalimat_2', 'kalimat_3', 'kalimat_4',
    'correct_ending', 'incorrect_ending',
]
df.to_csv('cleaned_train_cohere_rplus_jv.csv', columns=export_columns, index=False)