In [1]:
# Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, classification_report

2023-12-26 19:07:22.770971: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-26 19:07:22.771238: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-26 19:07:22.898242: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-26 19:07:23.174236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ImportError: cannot import name 'classification_rep' from 'sklearn.metrics' (/home/huseyin/.local/lib/python3.10/site-packages/sklearn/metrics/__init__.py)

In [None]:
# Load the pretrained BERT checkpoint named by model_name, plus its tokenizer
model_name = 'dbmdz/bert-base-turkish-128k-uncased'
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
bert_model = TFBertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

In [None]:
# Load the dataset (a language-detection dataset is used as the example)
data = pd.read_csv('/content/drive/MyDrive/Colab çalışma/Language_Detection.csv')

# Duplicate check: a bare expression in the middle of a cell is never
# displayed, so report the count explicitly.
duplicate_count = int(data.duplicated().sum())
print(f"Duplicate rows: {duplicate_count}")

# Remove the duplicated rows (reassignment instead of inplace=True)
data = data.drop_duplicates()
print(data)

In [None]:
# Split the dataset into train and test subsets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Fit the encoder on every label in the dataset. Fitting on the train split
# alone makes transform() raise ValueError whenever a language happens to
# appear only in the test split.
label_encoder = LabelEncoder()
label_encoder.fit(data['Language'])
num_classes = len(label_encoder.classes_)

# Integer-encode the Language column of each split
train_labels_encoded = label_encoder.transform(train_data['Language'])
test_labels_encoded = label_encoder.transform(test_data['Language'])

# One-hot encode the integer labels of each split
train_labels = to_categorical(train_labels_encoded, num_classes=num_classes)
test_labels = to_categorical(test_labels_encoded, num_classes=num_classes)



In [None]:
# Convert a dataset split into fixed-width tokenizer encodings for BERT
def convert_to_input(data, max_length=128, text_tokenizer=None):
    """Tokenize the ``Text`` column of ``data`` into fixed-width encodings.

    Args:
        data: Mapping-like object (e.g. a DataFrame) whose ``'Text'`` entry
            exposes ``.tolist()`` and yields the raw strings.
        max_length: Exact sequence length of every returned row. Longer
            texts are truncated and shorter ones are padded up to it.
        text_tokenizer: Callable tokenizer to use. When ``None`` the
            notebook-level ``tokenizer`` is used.

    Returns:
        Whatever the tokenizer returns for the given texts (requested as
        TensorFlow tensors via ``return_tensors='tf'``).
    """
    if text_tokenizer is None:
        # Fall back to the tokenizer loaded earlier in the notebook.
        text_tokenizer = tokenizer
    # padding='max_length' (instead of padding=True, i.e. "pad to the longest
    # sample in this call") guarantees the same width for every split, so the
    # result always matches a model input declared with a fixed length.
    return text_tokenizer(
        data['Text'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )

# Tokenize both splits once
train_encodings = convert_to_input(train_data)
test_encodings = convert_to_input(test_data)

# Build the fine-tuning model.
# The checkpoint was loaded without num_labels, so its built-in
# classification head emits only 2 logits (the transformers default).
# Stacking the softmax layer on those logits would squeeze every language
# class through a 2-dimensional bottleneck, so the classifier is fed from the
# encoder's pooled output instead.
# NOTE(review): no attention mask is passed to the encoder, so padding tokens
# are attended to; consider adding the mask as a second model input.
input_layer = Input(shape=(128,), dtype='int32')  # width must equal the tokenizer max_length
bert_output = bert_model.bert(input_layer)
output_layer = Dense(len(label_encoder.classes_), activation='softmax')(bert_output.pooler_output)
model = Model(inputs=input_layer, outputs=output_layer)

In [None]:
# Compile the model: Adam with a lowered learning rate,
# categorical cross-entropy loss and accuracy as the tracked metric
optimizer = Adam(learning_rate=1e-4)
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer,
)

In [None]:
# Train the model; the test split doubles as the validation set
history = model.fit(
    train_encodings['input_ids'],
    train_labels,
    batch_size=32,
    epochs=5,  # more epochs can be tried
    validation_data=(test_encodings['input_ids'], test_labels),
)

In [None]:
# Save the trained model to the .h5 file below
model_save_path = '/content/drive/MyDrive/Colab çalışma/language_detector.h5'
model.save(model_save_path)

In [None]:
# Accuracy and loss curves recorded during training, one figure each
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(history.history['accuracy'], label='Training Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
acc_ax.set(title='Training and Validation Accuracy', xlabel='Epoch', ylabel='Accuracy')
acc_ax.legend(loc='lower right')
plt.show()

loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set(title='Training and Validation Loss', xlabel='Epoch', ylabel='Loss')
loss_ax.legend(loc='upper right')
plt.show()

In [None]:
# Predict on the test inputs, then reduce both the predicted probabilities
# and the one-hot test labels to class indices
predictions = model.predict(test_encodings['input_ids'])
predicted_classes = predictions.argmax(axis=1)
true_classes = test_labels.argmax(axis=1)

In [None]:
# Confusion matrix of true vs. predicted class indices
conf_matrix = confusion_matrix(y_true=true_classes, y_pred=predicted_classes)
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
# Classification report, labelled with the original language names instead
# of the encoded integers. Passing `labels` explicitly keeps the report rows
# aligned with `target_names` even if a class is absent from the test split.
class_indices = np.arange(len(label_encoder.classes_))
class_report = classification_report(
    true_classes,
    predicted_classes,
    labels=class_indices,
    target_names=[str(name) for name in label_encoder.classes_],
)
print("\nClassification Report:")
print(class_report)