In [None]:
import numpy as np
import pandas as pd
import string
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


In [None]:
# Load the lyrics dataset from the CSV next to the notebook and preview it.
df = pd.read_csv(filepath_or_buffer="turkish_song_lyrics.csv")
df.head(n=5)


In [None]:
# Number of rows per singer in the full dataset.
df["singer"].value_counts()


In [None]:
# The two singers to tell apart. Order matters: the labelling step maps
# singers[0] to class 0 and every other row to class 1.
singers = [
    "Müslüm Gürses",
    "Ceza",
]


In [None]:
# Keep only the rows of the selected singers. The explicit copy lets later
# cells add columns to df_subset without writing into a slice of df.
df_subset = df.loc[df["singer"].isin(singers)].copy()
df_subset["singer"].value_counts()


In [None]:
def clean(row):
    """Normalize one song's lyrics into a single lowercase, punctuation-free line.

    Parameters
    ----------
    row : mapping
        Any object indexable by ``"lyrics"`` (e.g. a DataFrame row) whose
        value is the raw lyric text as a ``str``.

    Returns
    -------
    str
        The lyric lines joined with single spaces, lowercased using Turkish
        casing rules, with all ASCII punctuation (``string.punctuation``)
        removed.
    """
    satirlar = row["lyrics"].split("\n")
    metin = " ".join(satirlar)

    # str.lower() is locale-independent: it turns "I" into "i" (Turkish needs
    # the dotless "\u0131") and "\u0130" (dotted capital I) into "i" followed
    # by the combining dot U+0307, which later splits tokens apart.
    # Fold the Turkish capitals by hand before the generic lowercase pass.
    metin = (
        metin.replace("I\u0307", "i")   # decomposed dotted capital I
             .replace("\u0130", "i")    # precomposed dotted capital I
             .replace("I", "\u0131")    # dotless capital I -> dotless small i
             .lower()
    )

    return metin.translate(str.maketrans("", "", string.punctuation))


# Row-wise apply: each row is passed straight to clean(), which reads row["lyrics"].
df_subset["clean"] = df_subset.apply(clean, axis=1)


In [None]:
def etiketle(row):
    """Return the binary class label for a row ("etiketle" = "label").

    Rows whose ``"singer"`` equals ``singers[0]`` get 0; every other row gets 1.
    """
    is_first_singer = row["singer"] == singers[0]
    return 0 if is_first_singer else 1


# Row-wise apply: each row is passed straight to etiketle() to get its 0/1 label.
df_subset["labels"] = df_subset.apply(etiketle, axis=1)


In [None]:
# First rows of the subset, now including the "clean" and "labels" columns.
df_subset.head(n=5)


In [None]:
# Last rows of the subset, as a spot check of the other end of the frame.
df_subset.tail(n=5)


In [None]:
# Model inputs: cleaned lyric strings as features, 0/1 singer labels as targets.
X = df_subset["clean"].to_numpy()
y = df_subset["labels"].to_numpy()


In [None]:
# Hold out 20% of the songs for testing, with a fixed seed for reproducibility.
# stratify=y keeps the class ratio of the two singers the same in both splits,
# so the accuracy/F1 numbers are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Sizes of the training and test splits, in that order.
print(f"{len(X_train)} {len(X_test)}")


In [None]:
# Peek at the first training sample. When cells run top to bottom this is
# still the cleaned lyric text, because vectorization happens in a later cell.
X_train[0]

In [None]:
# TF-IDF vectorizer with default settings; it is fitted on the training texts only.
vectorizer = TfidfVectorizer()

# NOTE: X_train / X_test are rebound to their vectorized form here, so this cell
# is not safe to re-run on its own: a second run would feed the already
# vectorized data back into fit_transform. Re-run the train/test split cell first.
X_train = vectorizer.fit_transform(X_train)
# The test texts are only transformed with the fitted vectorizer, never fitted on.
X_test = vectorizer.transform(X_test)


In [None]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the estimator
# itself, so it can be bound directly and then displayed as the cell output.
model_NB = MultinomialNB().fit(X_train, y_train)
model_NB


In [None]:
# Predict once per split up front; the reports below keep the original order.
predictions_train = model_NB.predict(X_train)
predictions_test = model_NB.predict(X_test)

# Accuracy on each split.
print("NB train accuracy:", model_NB.score(X_train, y_train))
print("NB test accuracy:", model_NB.score(X_test, y_test))

# F1 with scikit-learn's defaults (binary average, positive class = label 1).
print("NB Train F1:", f1_score(y_true=y_train, y_pred=predictions_train))

print("NB Test F1:", f1_score(y_true=y_test, y_pred=predictions_test))


In [None]:
# Decision tree on the same TF-IDF features. random_state is fixed because the
# tree permutes candidate features at each split; without a seed, ties between
# equally good splits can be broken differently on every run, which changes
# the fitted tree and the scores reported below.
model_DT = DecisionTreeClassifier(random_state=42)
model_DT.fit(X_train, y_train)


In [None]:
# Predict once per split up front; the reports below keep the original order.
predictions_train = model_DT.predict(X_train)
predictions_test = model_DT.predict(X_test)

# Accuracy on each split.
print("DT train accuracy:", model_DT.score(X_train, y_train))
print("DT test accuracy:", model_DT.score(X_test, y_test))

# F1 with scikit-learn's defaults (binary average, positive class = label 1).
print("DT Train F1:", f1_score(y_true=y_train, y_pred=predictions_train))

print("DT Test F1:", f1_score(y_true=y_test, y_pred=predictions_test))


In [None]:
# Ad-hoc sanity check: classify a hand-written lyric with both models.
# NOTE(review): "karanalıklar" looks like a typo for "karanlıklar"; as written
# the token is probably out of vocabulary and ignored by the vectorizer. Confirm.
lyric = "kahretsin bu hayat batsın karanalıklar içinde çürüyorum"

# Run the input through the same clean() step as the training texts so that
# casing and punctuation are handled identically at train and inference time.
lyric_vec = vectorizer.transform([clean({"lyrics": lyric})])

# Label encoding: 0 = Müslüm Gürses, 1 = Ceza
print("NB:", model_NB.predict(lyric_vec))
print("DT:", model_DT.predict(lyric_vec))

# Class probabilities; columns should follow the label order above (0, then 1).
print("NB Proba:", model_NB.predict_proba(lyric_vec))
print("DT Proba:", model_DT.predict_proba(lyric_vec))
