In [None]:
import numpy as np 
import pandas as pd 
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the lyrics dataset from a CSV file (path is relative to the working directory)
# and preview the first rows. Later cells read the `singer` and `lyrics` columns.
df = pd.read_csv("turkish_song_lyrics.csv")
df.head()

In [None]:
# Count how many rows each singer has in the full dataset.
df.singer.value_counts()

In [None]:
# The two artists to tell apart. Order matters: etiketle() assigns label 0 to
# singers[0] and label 1 to every other row.
singers = ["Zeki Müren", "Tarkan"]

In [None]:
# Keep only the rows of the selected singers; copy so that columns added later
# are written to an independent frame rather than a view of `df`.
df_subset = df.loc[df["singer"].isin(singers)].copy()

In [None]:
# Row counts per selected singer, i.e. the class balance of the subset.
df_subset.singer.value_counts()

In [None]:
# Preview the filtered rows.
df_subset.head()

In [None]:
# Turkish treats dotted and dotless i as different letters. Python's str.lower()
# maps "I" to "i" (should be "ı") and "İ" to "i" + U+0307 (a combining dot that
# later splits tokens), so the capitals are mapped explicitly before lowercasing.
_TR_UPPER_TO_LOWER = str.maketrans({"I": "ı", "İ": "i"})

# Built once instead of on every row: deletes all ASCII punctuation characters.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean(row):
    """Return the row's lyrics as one lowercase, punctuation-free line.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a "lyrics" entry.

    Returns
    -------
    str
        The lyrics with newlines replaced by spaces, lowercased using Turkish
        rules for I/İ, and with ASCII punctuation removed. Returns "" when the
        lyrics value is not a string (e.g. a missing value read as NaN).

    Note: every capital "I" is treated as the Turkish dotless letter, so
    non-Turkish words containing "I" are lowercased with "ı" as well.
    """
    lyrics = row["lyrics"]
    if not isinstance(lyrics, str):
        # Missing lyrics would otherwise fail on .split(); treat them as empty text.
        return ""
    satirlar = lyrics.split("\n")
    return (
        " ".join(satirlar)
        .translate(_TR_UPPER_TO_LOWER)
        .lower()
        .translate(_PUNCT_TABLE)
    )

# Add a "clean" column holding the normalized lyrics of every row.
df_subset["clean"] = df_subset.apply(clean, axis=1)

In [None]:
def etiketle(row):
    """Return the class label for a row: 0 for singers[0], 1 for anyone else."""
    return 0 if row["singer"] == singers[0] else 1

# Add a "labels" column with the numeric class of every row.
df_subset["labels"] = df_subset.apply(etiketle, axis=1)

In [None]:
# Preview the subset, now including the "clean" and "labels" columns.
df_subset.head()

In [None]:
# Look at the last rows as well.
df_subset.tail()

In [None]:
# Features are the cleaned lyric strings; targets are the 0/1 singer labels.
X = df_subset["clean"].to_numpy()
y = df_subset["labels"].to_numpy()

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sizes of the training and test splits.
print(len(X_train), len(X_test))

In [None]:
# Bag-of-words features: learn the vocabulary on the training texts only, then
# reuse that same vocabulary for the test texts (transform, not fit_transform).
# NOTE(review): X_train / X_test are rebound from raw texts to the vectorized
# output here, so re-running this cell on its own would feed already-vectorized
# data back into the vectorizer; re-run the split cell first.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Number of entries in the vocabulary learned from the training texts.
len(vectorizer.vocabulary_)

In [None]:
# Peek at ten (key, value) entries of the learned vocabulary mapping.
list(vectorizer.vocabulary_.items())[:10]

In [None]:
# First 15 feature names. get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2; get_feature_names_out() is its replacement (it returns an
# array rather than a list, which slices the same way).
vectorizer.get_feature_names_out()[:15]

In [None]:
# Display the vectorized training data produced by the vectorizer cell.
X_train

In [None]:
# Fit a multinomial Naive Bayes classifier on the training features and labels.
model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
# Score of the fitted model on the data it was trained on.
model.score(X_train, y_train)

In [None]:
# Score of the fitted model on the held-out test split.
model.score(X_test, y_test)

In [None]:
# Predict both splits first, then report F1 for each so the two numbers print together.
predictions_train = model.predict(X_train)
predictions_test = model.predict(X_test)

print("Train F1:", f1_score(y_train, predictions_train))
print("Test F1:", f1_score(y_test, predictions_test))

In [None]:
# First 15 true training labels, for comparison with the predictions.
y_train[:15]

In [None]:
# First 15 predicted training labels.
predictions_train[:15]

In [None]:
# Hand-written sample line; the variable name suggests it imitates Zeki Müren's style.
zeki_muren_sozu_gibi = "merhaba efendim bu akşam nasılsınız şeref verdiniz"

In [None]:
# Vectorize the sample with the already-fitted vocabulary (transform, not fit).
zm_vec = vectorizer.transform([zeki_muren_sozu_gibi])

In [None]:
# Predicted label for the sample: 0 means singers[0], 1 means the other singer.
model.predict(zm_vec)

In [None]:
# A second sample line (presumably a Tarkan lyric, judging by the variable name),
# vectorized with the already-fitted vocabulary.
t_sozu = "yap bi güzellik be gel kon hadi kalbim dön kavuşalım öp barışalım"
t_vec = vectorizer.transform([t_sozu])

In [None]:
# Predicted label for the second sample: 0 means singers[0], 1 means the other singer.
model.predict(t_vec)

In [None]:
# Show subset rows whose cleaned lyrics contain the opening words of the second
# sample — presumably to check whether that song is already part of the data.
df_subset[df_subset.clean.str.contains("yap bi güzellik")]