In [87]:
import pandas as pd
import string
import re

# Verisetinin Okunması

In [88]:
halk_dataset = pd.read_csv("../raw_data/halk.csv")
sanat_dataset = pd.read_csv("../raw_data/sanat.csv")
ilahi_dataset = pd.read_csv("../raw_data/ilahi.csv")
pop_dataset = pd.read_csv("../raw_data/pop.csv")
rock_dataset = pd.read_csv("../raw_data/rock.csv")
rap_dataset = pd.read_csv("../raw_data/rap.csv")

# Veri Etiketleme
halk_dataset["sarki_turu"] = ["halk" for _ in range(200)]
sanat_dataset["sarki_turu"] = ["sanat" for _ in range(200)]
ilahi_dataset["sarki_turu"] = ["ilahi" for _ in range(200)]
pop_dataset["sarki_turu"] = ["pop" for _ in range(200)]
rock_dataset["sarki_turu"] = ["rock" for _ in range(200)]
rap_dataset["sarki_turu"] = ["rap" for _ in range(200)]

df_dataset = pd.concat([halk_dataset, sanat_dataset, ilahi_dataset, pop_dataset, rock_dataset, rap_dataset], axis=0, ignore_index=True)

df_dataset = df_dataset.reset_index().rename(columns={'index': 'id'})

# Verilerin Temizlenmesi

In [89]:
def clean_text(text):
    # Satır boşluklarının kaldırılması
    text = text.replace("\n\n", "\n")

    # Satır Sonlarındaki Boşlukların Kaldırılması
    lines = text.split("\n")
    temp_text = ""
    for line in lines:
        temp_text += line.rstrip() + "\n"
    text = temp_text

    # Noktalama İşaretlerinin Kaldırılması
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Küçük Harf Dönüşümü
    text = text.lower()

    return text

df_dataset["sarki_sozu"] = [clean_text(lyrics) for lyrics in df_dataset.loc[:,"sarki_sozu"]]

##### Eski Türkçe kelime oranlarının bulunması

In [90]:
my_file = open("../raw_data/eski_turkce_kelimeler.txt", "r")
old_turkish_words = my_file.read()
old_turkish_words = old_turkish_words.replace('\n', ' ').split(" ")
my_file.close()

def calculate_ratio(row):
    lyrics = re.split('\n| ', row['sarki_sozu'])
    matched_words = [word for word in lyrics if word in old_turkish_words]
    ratio = len(matched_words) / len(lyrics)
    return ratio

df_dataset['eski_turkce_kelime_orani'] = df_dataset.apply(calculate_ratio, axis=1)

##### Pos Tag oranlarının eklenmesi

In [91]:
import pos_tagger

verb_rate_list = []
adj_rate_list = []
adv_rate_list = []
conj_rate_list = []
interj_rate_list = []
ques_rate_list = []
unk_rate_list = []
for lyrics in df_dataset.loc[:, "sarki_sozu"]:
    pos_tags = pos_tagger.get_pos_tags(lyrics)
    verb_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Verb.name) / len(pos_tags), 2))
    adj_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Adj.name) / len(pos_tags), 2))
    adv_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Adv.name) / len(pos_tags), 2))
    conj_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Conj.name) / len(pos_tags), 2))
    interj_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Interj.name) / len(pos_tags), 2))
    ques_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Ques.name) / len(pos_tags), 2))
    unk_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Unk.name) / len(pos_tags), 2))

df_dataset["fiil_orani"] = verb_rate_list
df_dataset["sifat_orani"] = adj_rate_list
df_dataset["zarf_orani"] = adv_rate_list
df_dataset["baglac_orani"] = conj_rate_list
df_dataset["unlem_orani"] = interj_rate_list
df_dataset["soru_orani"] = ques_rate_list
df_dataset["bilinmeyen_orani"] = unk_rate_list

In [92]:
df_dataset.to_csv("../dataset/dataset.csv", index=False)

### Stop word'lerin kaldırılması

In [93]:
# Getting stop_words
my_file = open("../raw_data/stop_words.txt", "r")
stop_words = my_file.read()
stop_words = stop_words.replace('\n', ' ').split(" ")
my_file.close()    

def remove_stopwords(text):
    # Split text into lines
    lines = text.split('\n')
    
    # Remove stopwords from each line
    for i, line in enumerate(lines):
        words = line.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        lines[i] = ' '.join(filtered_words)
    
    # Join lines back together with line breaks
    return '\n'.join(lines)

df_dataset["sarki_sozu"] = [remove_stopwords(lyrics) for lyrics in df_dataset.loc[:,"sarki_sozu"]]

##### Eski Türkçe kelime oranlarının bulunması

In [94]:
my_file = open("../raw_data/eski_turkce_kelimeler.txt", "r")
old_turkish_words = my_file.read()
old_turkish_words = old_turkish_words.replace('\n', ' ').split(" ")
my_file.close()

def calculate_ratio(row):
    lyrics = re.split('\n| ', row['sarki_sozu'])
    matched_words = [word for word in lyrics if word in old_turkish_words]
    ratio = len(matched_words) / len(lyrics)
    return ratio

df_dataset['eski_turkce_kelime_orani'] = df_dataset.apply(calculate_ratio, axis=1)

##### Pos Tag oranlarının eklenmesi

In [95]:
import pos_tagger

verb_rate_list = []
adj_rate_list = []
adv_rate_list = []
conj_rate_list = []
interj_rate_list = []
ques_rate_list = []
unk_rate_list = []
for lyrics in df_dataset.loc[:, "sarki_sozu"]:
    pos_tags = pos_tagger.get_pos_tags(lyrics)
    verb_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Verb.name) / len(pos_tags), 2))
    adj_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Adj.name) / len(pos_tags), 2))
    adv_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Adv.name) / len(pos_tags), 2))
    conj_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Conj.name) / len(pos_tags), 2))
    interj_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Interj.name) / len(pos_tags), 2))
    ques_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Ques.name) / len(pos_tags), 2))
    unk_rate_list.append(round(pos_tags.count(pos_tagger.POSTag.Unk.name) / len(pos_tags), 2))

df_dataset["fiil_orani"] = verb_rate_list
df_dataset["sifat_orani"] = adj_rate_list
df_dataset["zarf_orani"] = adv_rate_list
df_dataset["baglac_orani"] = conj_rate_list
df_dataset["unlem_orani"] = interj_rate_list
df_dataset["soru_orani"] = ques_rate_list
df_dataset["bilinmeyen_orani"] = unk_rate_list

In [96]:
df_dataset.to_csv("../dataset/dataset_without_stopwords.csv", index=False)

In [97]:
df_dataset = pd.read_csv("../dataset/dataset.csv")
print(df_dataset.loc[df_dataset["sarki_turu"] == "halk"]['eski_turkce_kelime_orani'].mean())
print(df_dataset.loc[df_dataset["sarki_turu"] == "sanat"]['eski_turkce_kelime_orani'].mean())
print(df_dataset.loc[df_dataset["sarki_turu"] == "ilahi"]['eski_turkce_kelime_orani'].mean())
print(df_dataset.loc[df_dataset["sarki_turu"] == "pop"]['eski_turkce_kelime_orani'].mean())
print(df_dataset.loc[df_dataset["sarki_turu"] == "rock"]['eski_turkce_kelime_orani'].mean())
print(df_dataset.loc[df_dataset["sarki_turu"] == "rap"]['eski_turkce_kelime_orani'].mean())

0.07773120104743954
0.11969667441473383
0.11892698814545277
0.08179447797082677
0.08356235132997344
0.062009132794482634
