In [None]:
import numpy as np
import pandas as pd
import string


In [None]:
# Load the lyrics dataset; later cells read its `singer` and `lyrics` columns.
df = pd.read_csv("turkish_song_lyrics.csv")


In [None]:
df.head()


In [None]:
df.singer.value_counts()[:10]


In [None]:
# The two singers whose lyrics are compared. Each name is matched against the
# `singer` column of `df`; singer_0 becomes class label 0 and singer_1 label 1.
singer_0 = "Zeki Müren"
singer_1 = "Müslüm Gürses"


In [None]:
# Rows of `df` belonging to each singer, selected with a boolean mask on the
# `singer` column (same column access style for both singers).
is_singer_0 = df["singer"] == singer_0
is_singer_1 = df["singer"] == singer_1

sarkilar_0 = df[is_singer_0]
sarkilar_1 = df[is_singer_1]


In [None]:
print(len(sarkilar_0), len(sarkilar_1))


In [None]:
sarkilar_0.singer.unique()


In [None]:
sarkilar_1.singer.unique()


In [None]:
# Extract each singer's lyrics as a NumPy array with one entry per song.
#
# The arrays are selected straight from `df` instead of rebinding the filtered
# frames, which keeps this cell idempotent: converting `sarkilar_0` in place
# raises AttributeError on a second run, because by then the name already
# holds an array that has no `.lyrics` attribute.
#
# Songs with missing lyrics are dropped, because the line-splitting loop calls
# `.split("\n")` on every entry and a NaN entry is a float.
sarkilar_0 = df.loc[df["singer"] == singer_0, "lyrics"].dropna().to_numpy()
sarkilar_1 = df.loc[df["singer"] == singer_1, "lyrics"].dropna().to_numpy()


In [None]:
print(singer_0, "ilk sarkisi:\n\n")
print(sarkilar_0[0])


In [None]:
print(singer_1, "ilk sarkisi:\n\n")
print(sarkilar_1[0])


In [None]:
sarkilar = [sarkilar_0, sarkilar_1]
sarkicilar = [singer_0, singer_1]


In [None]:
satirlar = []
labels = []


In [None]:
# Flatten every song into cleaned lines and record which singer each line
# belongs to. `sarki_seti` holds all songs of one singer; its position in
# `sarkilar` is used as the class label.

# Empty the lists first so that re-running this cell does not append the whole
# corpus a second time (the lists themselves are created in a separate cell).
satirlar.clear()
labels.clear()

# Built once instead of once per line. Besides deleting ASCII punctuation, the
# table maps the two Turkish capitals that `str.lower()` gets wrong:
#   U+0130 (dotted capital I) -> "i"    (str.lower() yields "i" + U+0307)
#   "I"                       -> U+0131 (dotless i; str.lower() yields "i")
# Without this, a capitalised word at the start of a line becomes a different
# token from the same word in the middle of a line.
# NOTE: this assumes Turkish text; an English "I" also becomes dotless i.
cleanup_table = str.maketrans("\u0130I", "i\u0131", string.punctuation)

for i, sarki_seti in enumerate(sarkilar):
    for sarki in sarki_seti:  # sarki: one song of the current singer
        for satir in sarki.split("\n"):
            # turn "Hey! Naber?" into "hey naber"
            satir = satir.translate(cleanup_table).lower()
            satirlar.append(satir)
            labels.append(i)


In [None]:
satirlar[:5]


In [None]:
labels[:5]


In [None]:
satirlar[-5:]


In [None]:
labels[-5:]


In [None]:
print(len(satirlar), len(labels))


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20% of the lines as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    satirlar, labels, test_size=0.2, random_state=42)


In [None]:
print(len(X_train), len(y_train), len(X_test), len(y_test))


In [None]:
for i in range(15):
    print(X_train[i], "::", y_train[i], "::", sarkicilar[y_train[i]])


In [None]:
for i in range(15):
    print(X_test[i], "::", y_test[i], "::", sarkicilar[y_test[i]])


In [None]:
w2i = {}  # kelime dağarcığı (vocab.) oluşturuyoruz
w2i["<UNK>"] = 0


In [None]:
# Give every previously unseen training token the next free integer id.
# Ids start at 1 because 0 is already taken by "<UNK>".
index = 1
for satir in X_train:
    for token in satir.split():
        if token in w2i:
            continue  # already has an id
        w2i[token] = index
        index += 1


In [None]:
list(w2i.items())[:5]


In [None]:
len(w2i)


In [None]:
print(w2i["hey"], w2i["cânım"], w2i["ömrüm"])


In [None]:
# "hey cânım hey ömrüm hey" satirini [14, 15, 14, 16, 14] listesine dönüştüreceğiz
X_train_int = []
X_test_int = []


In [None]:
X_train[:3]


In [None]:
# Convert every training line from text into a list of vocabulary ids.
#
# The list is rebuilt in a single assignment rather than appended to, which
# keeps this cell idempotent: with an append loop, a second run adds every
# line again, leaving X_train_int twice as long as y_train and misaligned
# with it.
#
# Plain `w2i[token]` lookups are safe here because the vocabulary was built
# from X_train itself.
X_train_int = [[w2i[token] for token in satir.split()] for satir in X_train]


In [None]:
X_train_int[:3]


In [None]:
# Convert every test line from text into a list of vocabulary ids.
#
# A test token that is missing from the training vocabulary `w2i` is replaced
# by the id of the "<UNK>" entry (looked up instead of hard-coding 0).
#
# The list is rebuilt in a single assignment rather than appended to, so
# re-running this cell cannot duplicate the test lines and misalign them
# with y_test.
unk_index = w2i["<UNK>"]
X_test_int = [
    [w2i.get(token, unk_index) for token in satir.split()]
    for satir in X_test
]


In [None]:
X_test_int[:3]


In [None]:
# Encoded training lines whose label is 0, i.e. only Zeki Müren's lines.
X_train_int_ZM = [
    satir_int
    for satir_int, label in zip(X_train_int, y_train)
    if label == 0
]


In [None]:
len(X_train_int_ZM)


In [None]:
y_train_0s = [l for l in y_train if l == 0]


In [None]:
len(y_train_0s)


In [None]:
X_train_int_ZM[:3]


In [None]:
# Encoded training lines whose label is 1, i.e. only Müslüm Gürses's lines.
X_train_int_MG = [
    satir_int
    for satir_int, label in zip(X_train_int, y_train)
    if label == 1
]

print(len(X_train_int_MG))


In [None]:
M = len(w2i)  # vocabulary size
print(M)

# Every count starts at 1 rather than 0 (add-one smoothing), so a first word
# or transition that never occurs in training still ends up with a non-zero
# probability and therefore a finite logarithm.
# NOTE(review): each A_* is a dense M x M array; confirm that two of them fit
# in memory for the vocabulary size printed above.

# Zeki Müren model: pi_0 is indexed by a line's first word id, A_0 by
# (previous word id, current word id).
pi_0 = np.ones(M)
A_0 = np.ones((M, M))

# Müslüm Gürses model: same layout as above.
pi_1 = np.ones(M)
A_1 = np.ones((M, M))


In [None]:
def train_markov_model(satirlar_int, pi, A):
    """Accumulate first-word and transition counts from encoded lines.

    Both count arrays are updated in place; nothing is returned.

    Args:
        satirlar_int: lines to count, each a sequence of integer word ids,
            e.g. ``[5928, 4336, 1535, 4397, 802]``.
        pi: 1-D count array; ``pi[w]`` is incremented once for every line
            whose first word is ``w``.
        A: 2-D count array; ``A[u, w]`` is incremented every time word ``w``
            directly follows word ``u`` inside a line.
    """
    for satir_int in satirlar_int:
        previous = None  # no predecessor yet: the next token starts the line
        for current in satir_int:
            if previous is None:
                # first token of the line counts towards the initial state
                pi[current] += 1
            else:
                # otherwise count the previous -> current transition
                A[previous, current] += 1
            previous = current


In [None]:
train_markov_model(X_train_int_ZM, pi_0, A_0)


In [None]:
train_markov_model(X_train_int_MG, pi_1, A_1)


In [None]:
pi_0[:5]


In [None]:
A_1[:5, :15]


In [None]:
# Normalisation: divide the first-word counts by their total so that each
# pi vector sums to 1.
pi_0 = pi_0 / pi_0.sum()
pi_1 = pi_1 / pi_1.sum()


In [None]:
pi_1[:5]


In [None]:
test = np.array([
    [1, 2, 3],
    [4, 5, 6]
])

test2 = test.sum(axis=0)

test3 = test.sum(axis=1)

test4 = test.sum(axis=1, keepdims=True)

print("original")
print(test)

print("-" * 20)
print("sum(axis=0)")
print(test2)

print("-" * 20)
print("sum(axis=1)")
print(test3)

print("-" * 20)
print("sum(axis=1, keepdims=True)")
print(test4)


In [None]:
# Row-normalise the transition counts. keepdims=True keeps the row sums as a
# column, so the division broadcasts across each row and every row of
# A_0 / A_1 sums to 1.
A_0 = A_0 / A_0.sum(axis=1, keepdims=True)
A_1 = A_1 / A_1.sum(axis=1, keepdims=True)


In [None]:
log_pi_0 = np.log(pi_0)
log_A_0 = np.log(A_0)

log_pi_1 = np.log(pi_1)
log_A_1 = np.log(A_1)


In [None]:
log_pi_1[:5]


In [None]:
log_A_1[:5, :4]


In [None]:
# Class priors: the share of training lines that belongs to each singer,
# together with the logarithm of that share.
total = len(y_train)  # number of lines in the training set

count_0 = len([y for y in y_train if y == 0])  # lines labelled 0 (ZM)
count_1 = len([y for y in y_train if y == 1])  # lines labelled 1 (MG)

p_0, p_1 = count_0 / total, count_1 / total
log_p_0, log_p_1 = np.log(p_0), np.log(p_1)


In [None]:
print(p_0, p_1, log_p_0, log_p_1)


In [None]:
def compute_log_prob(input, clas):
    """Return the unnormalised log-score of an encoded line under one class.

    The score is the log-probability of the first word, plus the
    log-probability of every following word-to-word transition, plus the
    class log-prior. All values are read from the module-level log tables of
    the selected class.

    Args:
        input: sequence of integer word ids, e.g. ``[1, 3, 2, 7]``.
        clas: ``1`` selects the ``*_1`` tables; any other value selects the
            ``*_0`` tables.

    Returns:
        The summed log-score. For an empty ``input`` this is just the class
        log-prior.
    """
    if clas == 1:
        log_pi, log_A, log_prior = log_pi_1, log_A_1, log_p_1
    else:
        log_pi, log_A, log_prior = log_pi_0, log_A_0, log_p_0

    total = 0
    previous = None  # no predecessor yet: the next word starts the line
    for current in input:
        if previous is None:
            total += log_pi[current]
        else:
            total += log_A[previous, current]
        previous = current

    # the prior is added last, after all word terms
    return total + log_prior


In [None]:
print(X_train_int[0], y_train[0])


In [None]:
compute_log_prob(X_train_int[0], 0)


In [None]:
compute_log_prob(X_train_int[0], 1)


In [None]:
def predict(inputs):
    """Predict a class index for every encoded line.

    Args:
        inputs: iterable of lines, each a sequence of integer word ids.

    Returns:
        A list with one entry per line: the index (0 or 1) of the larger of
        ``compute_log_prob(line, 0)`` and ``compute_log_prob(line, 1)``.
        When both scores are equal, ``np.argmax`` picks index 0.
    """
    return [
        np.argmax([compute_log_prob(satir_int, 0),
                   compute_log_prob(satir_int, 1)])
        for satir_int in inputs
    ]


In [None]:
predictions_train = predict(X_train_int)
predictions_test = predict(X_test_int)


In [None]:
predictions_train[:5]


In [None]:
y_train[:5]


In [None]:
predictions_train = np.array(predictions_train)
predictions_test = np.array(predictions_test)


In [None]:
predictions_train == y_train


In [None]:
train_accu = np.mean(predictions_train == y_train)
test_accu = np.mean(predictions_test == y_test)


In [None]:
print("Train accuracy:", train_accu)
print("Test accuracy:", test_accu)
