In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import string

In [None]:
# Load the lyrics dataset and keep only the songs of the two singers compared below.
singers = ["Zeki Müren", "Tarkan"]
df = pd.read_csv("turkish_song_lyrics.csv")
# .copy() detaches the filtered frame so later column assignments do not hit a view.
df = df.loc[df["singer"].isin(singers)].copy()
df.head()

In [None]:
# Single translation table applied before lowercasing: drops ASCII punctuation
# and maps the two Turkish capitals that str.lower() gets wrong
# (U+0130 dotted capital I -> "i", ASCII "I" -> U+0131 dotless i).
_CLEAN_TABLE = str.maketrans({
    **dict.fromkeys(string.punctuation),
    "\u0130": "i",
    "I": "\u0131",
})


def clean(row):
    """Normalize one song's lyrics into a single lowercase, punctuation-free line.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a "lyrics" entry.

    Returns:
        str: The lyrics with newlines replaced by spaces, ASCII punctuation
        removed and letters lowercased with Turkish casing rules. An empty
        string is returned when the lyrics value is not a string (e.g. NaN
        for a missing field).
    """
    lyrics = row["lyrics"]
    if not isinstance(lyrics, str):
        # Non-string values (NaN / None) have no text to clean.
        return ""
    # Plain str.lower() turns U+0130 into "i" followed by a combining dot
    # (U+0307), leaving a stray mark inside the word, and turns "I" into "i"
    # rather than the Turkish dotless form; the table fixes both up front.
    return lyrics.replace("\n", " ").translate(_CLEAN_TABLE).lower()


# Store the normalized lyrics of every song in a new "clean" column (row-wise apply).
df["clean"] = df.apply(clean, axis=1)

In [None]:
def etiketle(row):
    """Map a song row to its integer class label.

    Returns 0 when the row's "singer" equals the first entry of the
    module-level ``singers`` list, and 1 otherwise.
    """
    return 0 if row["singer"] == singers[0] else 1


# Attach the numeric class label of every song as the "labels" column.
df["labels"] = df.apply(etiketle, axis=1)

In [None]:
# Features are the cleaned lyrics, targets are the 0/1 singer labels.
X = df["clean"].to_numpy()
y = df["labels"].to_numpy()

# Hold out 20% of the songs for testing. Stratifying on y keeps the ratio of
# the two singers the same in both splits, so the reported accuracy is not
# skewed by an unlucky draw; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

vectorizer = TfidfVectorizer()

# Fit the vocabulary and IDF weights on the training texts only, then reuse
# them for the test texts so no test information reaches the features.
# From here on X_train / X_test hold TF-IDF matrices rather than raw text.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Hyperparams: criterion="gini", max_depth=2
# random_state is fixed because scikit-learn permutes the features at every
# split, so an unseeded tree can report a different score on each run.
clf = DecisionTreeClassifier(criterion="gini", max_depth=2, random_state=42)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out test split (displayed as the cell output).
clf.score(X_test, y_test)

In [None]:
# Hyperparams: criterion="entropy", max_depth=5
# random_state is fixed because scikit-learn permutes the features at every
# split, so an unseeded tree can report a different score on each run.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out test split (displayed as the cell output).
clf.score(X_test, y_test)

In [None]:
# CV with 5-folds
# The tree is seeded so the per-fold accuracies are reproducible across runs.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
# cross_val_score clones and refits the estimator on each of the 5 folds of
# the training split; the test split is not touched here.
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores

In [None]:
# Summarize the cross-validation folds: average accuracy and its spread.
mean_score, std_score = scores.mean(), scores.std()
print(mean_score, std_score)

In [None]:
# Grid search over split criterion and tree depth with 5-fold CV.
# "log_loss" is left out on purpose: scikit-learn documents it as the same
# Shannon information gain as "entropy", so it only duplicated every fit
# (and it is rejected by scikit-learn releases older than 1.1).
parameters = {"criterion": ["entropy", "gini"],
              "max_depth": range(2, 6)}
# The trees are seeded so the selected parameters and best score are reproducible.
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), parameters,
                   cv=5, n_jobs=4, verbose=3)
clf.fit(X_train, y_train)
print("Best score:", clf.best_score_, "Best params:", clf.best_params_)
# Estimator with the best parameters, as exposed by the fitted search object.
tree = clf.best_estimator_

In [None]:
# Accuracy of the selected tree, first on the training split and then on the
# held-out test split; a large gap between the two points to overfitting.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(tree.score(features, targets))