In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

# Load the dataset of multiple-choice questions into the global `df`.
# NOTE(review): the path looks like a Colab workspace location — confirm or
# adjust it when running in another environment.
df = pd.read_csv('/content/sample_data/clean_questions.csv')

# Data preprocessing
def preprocess_data(df):
    """Build the model-ready ``text``/``label`` frame from raw question rows.

    Each row's question and its four choices are flattened into a single
    prompt string, and the 1-based ``answer`` column is shifted to a 0-based
    ``label``.

    Args:
        df: DataFrame with ``question``, ``choice_1`` .. ``choice_4`` and a
            numeric ``answer`` column.

    Returns:
        A new DataFrame with the columns ``text`` and ``label`` that keeps
        ``df``'s index. ``df`` itself is not modified.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Merge the question and its choices into one prompt per row. Zipping the
    # columns (instead of a row-wise ``DataFrame.apply``) avoids building a
    # Series for every row and also works on an empty frame, where
    # ``apply(axis=1)`` would hand back a DataFrame instead of a Series.
    texts = [
        f"Question: {question} "
        f"Choices: 1) {first} "
        f"2) {second} "
        f"3) {third} "
        f"4) {fourth}"
        for question, first, second, third, fourth in zip(
            df['question'],
            df['choice_1'],
            df['choice_2'],
            df['choice_3'],
            df['choice_4'],
        )
    ]

    # Assemble the result in a fresh frame so the caller's DataFrame does not
    # silently gain ``text``/``label`` columns.
    result = pd.DataFrame(index=df.index)
    result['text'] = texts
    # Shift answer labels from the 1-4 range down to 0-3.
    result['label'] = df['answer'] - 1

    return result

# Build the text/label frame from the raw question rows.
processed_df = preprocess_data(df)

# Split into training and test sets: 20% of the rows are held out, and the
# fixed random_state makes the partition repeatable across runs.
train_df, test_df = train_test_split(processed_df, test_size=0.2, random_state=42)

In [None]:
# Load the BERT tokenizer.
# Note: `model_name` and `tokenizer` are rebound to a DistilBERT checkpoint in
# a later cell, so the values set here only apply to the cells in between.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Convert the data into the example objects used for BERT
def convert_data_to_examples(df):
    """Wrap every row of ``df`` in an ``InputExample``.

    Args:
        df: DataFrame exposing ``text`` and ``label`` columns.

    Returns:
        A list with one ``InputExample`` per row, in row order. ``guid`` is
        always ``None``, ``text_a`` is the row's text and ``label`` its label.
    """
    return [
        InputExample(guid=None, text_a=record['text'], label=record['label'])
        for _, record in df.iterrows()
    ]

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Every example's ``text_a`` is encoded with ``tokenizer.encode_plus``,
    truncated and padded to ``max_length``. All examples are tokenized up
    front; the returned dataset then replays the stored features.

    Args:
        examples: Iterable of objects with ``text_a`` and ``label`` attributes.
        tokenizer: Tokenizer providing ``encode_plus`` and returning
            ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        max_length: Length every encoded sequence is truncated/padded to.

    Returns:
        An unbatched ``tf.data.Dataset`` of ``(inputs, label)`` pairs, where
        ``inputs`` is a dict with int32 ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` vectors and ``label`` is an int64 scalar.
    """
    features = []

    for example in examples:
        input_dict = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True
        )

        features.append(
            InputFeatures(
                input_ids=input_dict['input_ids'],
                attention_mask=input_dict['attention_mask'],
                token_type_ids=input_dict['token_type_ids'],
                label=example.label
            )
        )

    def gen():
        # Replay the pre-tokenized features in their original order.
        for feature in features:
            yield (
                {
                    'input_ids': feature.input_ids,
                    'attention_mask': feature.attention_mask,
                    'token_type_ids': feature.token_type_ids
                },
                feature.label
            )

    # `output_signature` replaces the deprecated `output_types` /
    # `output_shapes` arguments of `from_generator`; dtypes and shapes are the
    # same as before (variable-length int32 vectors, scalar int64 label).
    token_spec = tf.TensorSpec(shape=(None,), dtype=tf.int32)
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                'input_ids': token_spec,
                'attention_mask': token_spec,
                'token_type_ids': token_spec
            },
            tf.TensorSpec(shape=(), dtype=tf.int64)
        )
    )

# Convert both splits into example lists (training split first).
train_examples, test_examples = (
    convert_data_to_examples(split) for split in (train_df, test_df)
)

# Training pipeline: shuffle with a 100-element buffer, batch by 32 and
# repeat the whole pass twice.
train_data = (
    convert_examples_to_tf_dataset(train_examples, tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)

# Evaluation pipeline: batched only, original order kept.
test_data = convert_examples_to_tf_dataset(test_examples, tokenizer).batch(32)

In [None]:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# Faster model and tokenizer: switch to the DistilBERT checkpoint.
# This rebinds the `model_name` and `tokenizer` globals that an earlier cell
# set up for BERT.
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Data preparation (shorter maximum length)
def create_dataset(df, tokenizer, max_length=64, batch_size=32, shuffle=False):
    """Tokenize ``df`` in one call and return a batched ``tf.data.Dataset``.

    Args:
        df: DataFrame with a ``text`` column (strings) and a ``label`` column.
        tokenizer: Callable tokenizer; invoked on the list of texts with
            ``return_tensors='tf'``.
        max_length: Length every sequence is truncated/padded to.
        batch_size: Number of examples per batch. Defaults to 32, the value
            that used to be hard-coded.
        shuffle: When true, shuffle individual examples across the whole
            frame before batching. Shuffling the returned dataset instead
            would only reorder already-formed batches.

    Returns:
        A batched dataset of ``(encodings_dict, labels)`` pairs.
    """
    encodings = tokenizer(
        df['text'].tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf'
    )
    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings),
        df['label'].values
    ))

    # shuffle() rejects a buffer size of 0, so skip it for an empty frame.
    if shuffle and len(df) > 0:
        dataset = dataset.shuffle(len(df))

    return dataset.batch(batch_size)

# Seed TensorFlow so shuffling and weight initialisation are repeatable,
# mirroring the fixed random_state used for the train/test split.
tf.random.set_seed(42)

# create_dataset() returns data that is already batched; shuffling that would
# only reorder fixed batches. Undo the batching, shuffle individual examples
# across the whole training split, then batch again with the same size.
train_dataset = (
    create_dataset(train_df, tokenizer)
    .unbatch()
    .shuffle(max(len(train_df), 1))  # shuffle() rejects a buffer size of 0
    .batch(32)
)
test_dataset = create_dataset(test_df, tokenizer)

# Mixed precision, only when a GPU is actually present: on CPU float16 gives
# no speed-up, so the default float32 policy is kept there.
if tf.config.list_physical_devices('GPU'):
    tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Model: DistilBERT with a 4-way classification head (labels 0-3).
model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Optimizer and loss; the loss is configured for raw logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Short training run, validated on the held-out split after every epoch.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=5,  # Reduced epoch count
    verbose=1
)

In [None]:
import random
import pandas as pd
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf

# Load the model and tokenizer from a local directory.
# NOTE(review): hard-coded, machine-specific Windows path; no earlier cell in
# this notebook is seen writing to it — confirm where the directory is produced.
# These assignments rebind the `tokenizer` and `model` globals from the
# training cell.
model_path = r"C:\Users\PC\Downloads\english_exam_model"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = TFDistilBertForSequenceClassification.from_pretrained(model_path)

# Load the dataset
def load_data(filepath):
    """Read the question CSV and keep only complete rows.

    Args:
        filepath: Location of the CSV file (anything ``pandas.read_csv``
            accepts).

    Returns:
        DataFrame without the rows that lack a question, any of the four
        choices, or the answer. ``answer`` is stored as an integer.

    Raises:
        FileNotFoundError: if ``filepath`` does not exist.
        KeyError: if a required column is missing from the file.
        ValueError: if a remaining ``answer`` value is not numeric.
    """
    required = ['question', 'choice_1', 'choice_2', 'choice_3', 'choice_4', 'answer']
    complete = pd.read_csv(filepath).dropna(subset=required)

    # A single blank answer cell makes pandas parse the whole column as float,
    # so surviving answers would come back as 2.0 and be printed as "2.0)".
    # Cast back to int once the incomplete rows are gone.
    return complete.assign(answer=complete['answer'].astype(int))

# Pick a random question
def get_random_question(df):
    """Draw one row at random and unpack it for the quiz.

    Args:
        df: DataFrame with ``question``, ``choice_1`` .. ``choice_4`` and
            ``answer`` columns.

    Returns:
        A ``(question, choices, correct_answer)`` tuple, where ``choices``
        maps the numbers 1-4 to the matching ``choice_N`` value.
    """
    picked = df.sample(1).iloc[0]
    prompt = picked['question']
    options = {number: picked[f'choice_{number}'] for number in range(1, 5)}
    return prompt, options, picked['answer']

# Run the model on one question
def predict_answer(model, tokenizer, question, choices):
    """Return the choice number the model picks for a question.

    Args:
        model: Callable returning an object with a ``logits`` attribute.
        tokenizer: Callable tokenizer; invoked with ``return_tensors='tf'``,
            truncating to 128 tokens.
        question: Question text.
        choices: Mapping from the numbers 1-4 to the choice texts.

    Returns:
        The 1-based number of the highest-scoring choice (argmax index + 1).
    """
    prompt = f"Question: {question} Choices: 1) {choices[1]} 2) {choices[2]} 3) {choices[3]} 4) {choices[4]}"
    encoded = tokenizer(prompt, return_tensors='tf', truncation=True, max_length=128)
    logits = model(encoded).logits
    # argmax is 0-based, while the quiz numbers its choices from 1.
    best_index = tf.argmax(logits, axis=1).numpy()[0]
    return best_index + 1

# Game loop
def quiz_game(df, model, tokenizer):
    """Run the interactive quiz on the console until the user enters 'q'.

    Every round draws a random question, prints it with its numbered choices,
    reads the user's answer, reports whether it was correct and then shows the
    model's own prediction. Invalid input is reported and a new round starts.

    Args:
        df: Question DataFrame handed to ``get_random_question``.
        model: Classification model handed to ``predict_answer``.
        tokenizer: Tokenizer handed to ``predict_answer``.

    Returns:
        None. All interaction happens through ``input``/``print``.
    """
    print("\n--- İngilizce Test Uygulaması ---")
    print("Çıkmak için 'q' girin\n")

    while True:
        question, choices, correct_answer = get_random_question(df)

        print(f"\nSoru: {question}")
        for num, choice in choices.items():
            print(f"{num}) {choice}")

        # Strip surrounding whitespace so inputs such as " q" still quit.
        user_input = input("\nCevabınız (1-4): ").strip()

        if user_input.lower() == 'q':
            print("Çıkış yapılıyor...")
            break

        # Guard only the conversion: a ValueError raised later by the
        # tokenizer or the model must not be reported as invalid user input.
        try:
            user_answer = int(user_input)
        except ValueError:
            print("Geçersiz giriş! Lütfen numara girin.")
            continue

        if user_answer not in [1, 2, 3, 4]:
            print("Lütfen 1-4 arasında bir numara girin!")
            continue

        # Model prediction
        predicted = predict_answer(model, tokenizer, question, choices)

        if user_answer == correct_answer:
            print("✅ Doğru cevap!")
        else:
            print(f"❌ Yanlış cevap! Doğru cevap: {correct_answer}) {choices[correct_answer]}")

        print(f"Model tahmini: {predicted}) {choices.get(predicted, 'N/A')}")

# Main program: load the question file and start the interactive quiz.
if __name__ == "__main__":
    try:
        # NOTE(review): hard-coded, machine-specific absolute path; this
        # rebinds the global `df` from the first cell.
        df = load_data(r"C:\Users\PC\OneDrive\Belgeler\grammer_classification\clean_questions.csv")
        quiz_game(df, model, tokenizer)
    except FileNotFoundError:
        # The CSV file could not be found at the path above.
        print("Hata: clean_questions.csv dosyası bulunamadı!")
    except Exception as e:
        # Last-resort handler: print the error message instead of a traceback.
        print(f"Beklenmeyen bir hata oluştu: {str(e)}")