<a href="https://colab.research.google.com/github/imgsude/gsk/blob/main/HastalikTahmini1DCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers torch scikit-learn

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from google.colab import files


uploaded = files.upload()
df = pd.read_csv("Duzenlenmis_Hastalik_Verisi.csv")

# Tokenizer ve BioBERT Modeli
tokenizer = AutoTokenizer.from_pretrained("bvanaken/CORe-clinical-diagnosis-prediction")
bert_model = AutoModel.from_pretrained("bvanaken/CORe-clinical-diagnosis-prediction", output_hidden_states=True)


sentences = df.apply(lambda row: " ".join(row.astype(str)), axis=1).tolist()

# Belirti etiketlerini sayisal hale getiriyoruz
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Belirti'].values)

# BERT Embedding
def extract_features(sentences):
    all_embeddings = []
    for text in sentences:
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            outputs = bert_model(**inputs)
            hidden_states = outputs.hidden_states[-1]  # son katmanın çıktısı (batch_size, seq_len, hidden_size)
        all_embeddings.append(hidden_states.squeeze(0).numpy())  # (seq_len, hidden_size)
    return np.array(all_embeddings)

X = extract_features(sentences)  # (num_samples, seq_len, hidden_size)
y = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tensor'lara dönüştür (veri duzenlemenin bir yolu)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

# 1D CNN
class CNNClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=128, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (batch_size, hidden_dim, seq_len)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x).squeeze(-1)
        x = self.fc(x)
        return x

model = CNNClassifier(input_dim=768, num_classes=len(np.unique(y)))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Eğitim döngüsü
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# Test
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    predicted_classes = torch.argmax(predictions, dim=1).numpy()

accuracy = accuracy_score(y_test, predicted_classes)
print(f"Model Doğruluğu (CNN): {accuracy * 100:.2f}%")




Saving Duzenlenmis_Hastalik_Verisi.csv to Duzenlenmis_Hastalik_Verisi (1).csv
Epoch 1/3, Loss: 42.6050
Epoch 2/3, Loss: 27.1685
Epoch 3/3, Loss: 17.8887
Model Doğruluğu (CNN): 91.43%
