In [16]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix,accuracy_score, precision_score, recall_score, f1_score

In [17]:
train = pd.read_csv('./dataset/train.csv')
X_train = train.iloc[:,2:].values
y_train = train.iloc[:,1].values

test = pd.read_csv('./dataset/test.csv')
X_test = test.iloc[:,1:].values

y_test = pd.read_csv('./dataset/gender_submission.csv')
y_test = y_test.iloc[:,1].values

## Data Preprocessing

In [18]:
from sklearn.impute import SimpleImputer
imputerMean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputerMost = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [19]:
#Eksik verilere gerekli işlemleri yapıyorum.
X_train[:,3:4] = imputerMean.fit_transform(X_train[:,3:4])
X_test[:,3:4] = imputerMean.fit_transform(X_test[:,3:4])

X_train = np.delete(X_train, 1, axis=1) #İsimler gereksiz olduğu için kaldırdım
X_test = np.delete(X_test, 1, axis=1) #İsimler gereksiz olduğu için kaldırdım

X_train = imputerMost.fit_transform(X_train)
X_test = imputerMost.fit_transform(X_test)


In [20]:
#Label encoder ile kategorik sütunları sayısal hale getirdim.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

X_train[:,5] = le.fit_transform(X_train[:,5])
X_test[:,5] = le.fit_transform(X_test[:,5])
X_train[:,7] = le.fit_transform(X_train[:,7])
X_test[:,7] = le.fit_transform(X_test[:,7])

In [21]:
#One Hot Encoding ile kategorik sütunları sayısal hale getirdim.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[1,8])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.fit_transform(X_test))

## Standartization

In [22]:
#Standartization sayesinde her sütündaki verileri -3 ile 3 arasında sayılara dönüştürdüm. (Hepsi orantılı olarak dönüşüyor)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train[:,5:] = sc.fit_transform(X_train[:,5:])
X_test[:,5:] = sc.transform(X_test[:,5:])

## Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(random_state=42, max_iter=1000)
log_model.fit(X_train, y_train)

y_pred = log_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(f"Logistic Regression Confusion Matrix:\n {cm}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.2f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Logistic Regression Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Logistic Regression Recall: {recall:.2f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"Logistic Regression F1-Score: {f1:.2f}")

Logistic Regression Confusion Matrix:
 [[253  13]
 [ 16 136]]
Logistic Regression Accuracy: 0.93
Logistic Regression Precision: 0.91
Logistic Regression Recall: 0.89
Logistic Regression F1-Score: 0.90


## K-NN

In [24]:
from sklearn.neighbors import KNeighborsClassifier
knnClassifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knnClassifier.fit(X_train, y_train)

y_pred = knnClassifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(f"K-NN Confusion Matrix:\n {cm}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"K-NN Accuracy: {accuracy:.2f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"K-NN Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"K-NN Recall: {recall:.2f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"K-NN F1-Score: {f1:.2f}")


K-NN Confusion Matrix:
 [[222  44]
 [ 29 123]]
K-NN Accuracy: 0.83
K-NN Precision: 0.74
K-NN Recall: 0.81
K-NN F1-Score: 0.77


## SVM (Linear)

In [25]:
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(f"SVM(Linear) Confusion Matrix:\n {cm}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM(Linear) Accuracy: {accuracy:.2f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"SVM(Linear) Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"SVM(Linear) Recall: {recall:.2f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"SVM(Linear) F1-Score: {f1:.2f}")

SVM(Linear) Confusion Matrix:
 [[266   0]
 [  0 152]]
SVM(Linear) Accuracy: 1.00
SVM(Linear) Precision: 1.00
SVM(Linear) Recall: 1.00
SVM(Linear) F1-Score: 1.00


## SVM (RBF)

In [26]:
from sklearn.svm import SVC
rbfsvm_model = SVC(kernel='rbf', random_state=42)
rbfsvm_model.fit(X_train, y_train)

y_pred = rbfsvm_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(f"SVM(RBF) Confusion Matrix:\n {cm}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM(RBF) Accuracy: {accuracy:.2f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"SVM(RBF) Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"SVM(RBF) Recall: {recall:.2f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"SVM(RBF) F1-Score: {f1:.2f}")

SVM(RBF) Confusion Matrix:
 [[251  15]
 [  8 144]]
SVM(RBF) Accuracy: 0.94
SVM(RBF) Precision: 0.91
SVM(RBF) Recall: 0.95
SVM(RBF) F1-Score: 0.93


## Naive Bayes

In [27]:
from sklearn.naive_bayes import GaussianNB
gaus_model = GaussianNB()
gaus_model.fit(X_train, y_train)

y_pred = gaus_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(f"Naive Bayes Confusion Matrix:\n {cm}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Accuracy: {accuracy:.2f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Naive Bayes Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Naive Bayes Recall: {recall:.2f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"Naive Bayes F1-Score: {f1:.2f}")

Naive Bayes Confusion Matrix:
 [[232  34]
 [  4 148]]
Naive Bayes Accuracy: 0.91
Naive Bayes Precision: 0.81
Naive Bayes Recall: 0.97
Naive Bayes F1-Score: 0.89


## Decision Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dt_model.fit(X_train, y_train)

y_pred = dt_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(f"Decision Tree Confusion Matrix:\n {cm}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy:.2f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Decision Tree Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Decision Tree Recall: {recall:.2f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"Decision Tree F1-Score: {f1:.2f}")

Decision Tree Confusion Matrix:
 [[190  76]
 [ 33 119]]
Decision Tree Accuracy: 0.74
Decision Tree Precision: 0.61
Decision Tree Recall: 0.78
Decision Tree F1-Score: 0.69


## Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(f"Random Forest Confusion Matrix:\n {cm}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.2f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Random Forest Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Random Forest Recall: {recall:.2f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"Random Forest F1-Score: {f1:.2f}")

Random Forest Confusion Matrix:
 [[243  23]
 [ 12 140]]
Random Forest Accuracy: 0.92
Random Forest Precision: 0.86
Random Forest Recall: 0.92
Random Forest F1-Score: 0.89


## Gradient Boosting

In [30]:
from xgboost import XGBClassifier
xgb_model = XGBClassifier(random_state=42,  eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(f"XGB Confusion Matrix:\n {cm}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"XGB Accuracy: {accuracy:.2f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"XGB Precision: {precision:.2f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"XGB Recall: {recall:.2f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"XGB F1-Score: {f1:.2f}")

XGB Confusion Matrix:
 [[210  56]
 [ 17 135]]
XGB Accuracy: 0.83
XGB Precision: 0.71
XGB Recall: 0.89
XGB F1-Score: 0.79
