In [None]:
## My own model (TF-IDF + logistic regression)

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Train a TF-IDF + logistic-regression classifier on the manually labelled
# comments, report held-out metrics, then pseudo-label the remaining comments
# and write the combined table to CSV.

base_drive_path = "/content/drive/MyDrive/wifo dosyalar/"

# Manually labelled rows and the full pool of comments.
df_labeled = pd.read_csv(base_drive_path + "SON2_sadece_etiketli_veri.csv")
df_unlabeled = pd.read_csv(base_drive_path + "tumdataetiketsiz2.csv")


X = df_labeled["comment_clean"]
y = df_labeled["label"].astype(int)

# Stratified 80/20 split so both classes keep their proportions in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Pipeline definition (class_weight='balanced' reweights classes by frequency).
model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ("clf", LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'))
])

# Fit on the training split ONLY. Fitting on the full (X, y) before scoring
# X_test lets the model see the test rows during training, which inflates
# every metric printed below.
model.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = model.predict(X_test)


print("Meine eigene Model report:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))

# Evaluation is done; refit on every labelled row so the pseudo-labels below
# come from a model that has seen all available manual labels.
# Pipeline.fit re-fits each step from scratch, so nothing carries over.
model.fit(X, y)

# Keep only comments whose id is not already in the labelled set.
df_new = df_unlabeled[~df_unlabeled['id'].isin(df_labeled['id'])].copy()

# Pseudo-label the remaining comments.
df_new["label"] = model.predict(df_new["comment_clean"])
df_new["label"] = df_new["label"].astype(int)

# Record where each label came from.
df_labeled["source"] = "manual"
df_new["source"] = "predicted"

# Manual rows first, predicted rows after; duplicates are not removed here.
df_final = pd.concat([df_labeled, df_new], ignore_index=True)

df_final.to_csv("veri_tahminli_birlesikduplicateli.csv", index=False)
print("\n Tahminli CSV dosyası kaydedildi!")
print("\n Etiket dağılımı:\n", df_final["label"].value_counts())



Meine eigene Model report:
Accuracy: 0.9205776173285198

 Confusion Matrix:
 [[255  16]
 [ 28 255]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92       271
           1       0.94      0.90      0.92       283

    accuracy                           0.92       554
   macro avg       0.92      0.92      0.92       554
weighted avg       0.92      0.92      0.92       554


 Tahminli CSV dosyası kaydedildi!

 Etiket dağılımı:
 label
1.0    14453
0.0     3332
Name: count, dtype: int64


In [None]:
##Linear SVC

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Train a TF-IDF + LinearSVC classifier on the manually labelled comments,
# report held-out metrics, then pseudo-label the remaining comments and write
# the de-duplicated combined table to CSV.

base_drive_path = "/content/drive/MyDrive/wifo dosyalar/"

df_labeled = pd.read_csv(base_drive_path + "SON2_sadece_etiketli_veri.csv")
df_unlabeled = pd.read_csv(base_drive_path + "tumdataetiketsiz2.csv")

# Split features and target.
X = df_labeled["comment_clean"]
y = df_labeled["label"].astype(int)

# Stratified 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Pipeline definition (SVM + TF-IDF, with class_weight).
# random_state is pinned like the other classifiers in this notebook: the dual
# coordinate-descent solver shuffles the data, so without a seed the fitted
# model can differ from run to run.
model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ("clf", LinearSVC(class_weight='balanced', C=1.0, max_iter=10000, random_state=42))
])

# Train on the training split only.
model.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = model.predict(X_test)

# Performance report.
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# Drop comments whose id already appears in the labelled set.
df_new = df_unlabeled[~df_unlabeled['id'].isin(df_labeled['id'])].copy()

# Pseudo-label the remaining comments.
df_new["label"] = model.predict(df_new["comment_clean"])
df_new["label"] = df_new["label"].astype(int)

# Record where each label came from.
df_labeled["source"] = "manual"
df_new["source"] = "predicted"

# Combine and save. Manual rows come first, so keep="first" prefers a manual
# label over a predicted one when the same comment text appears in both.
df_final = pd.concat([df_labeled, df_new], ignore_index=True)
df_final = df_final.drop_duplicates(subset="comment_clean", keep="first")

df_final.to_csv("veri_tahminli_birlesik_svm.csv", index=False)
print("\n📁 Tahminli CSV dosyası (SVM) kaydedildi!")
print("\n🧾 Etiket dağılımı:\n", df_final["label"].value_counts())


✅ Accuracy: 0.8375451263537906

🧮 Confusion Matrix:
 [[234  37]
 [ 53 230]]

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.86      0.84       271
           1       0.86      0.81      0.84       283

    accuracy                           0.84       554
   macro avg       0.84      0.84      0.84       554
weighted avg       0.84      0.84      0.84       554


📁 Tahminli CSV dosyası (SVM) kaydedildi!

🧾 Etiket dağılımı:
 label
1.0    10866
0.0     2824
Name: count, dtype: int64


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Random-forest variant: TF-IDF features -> RandomForestClassifier.
# Evaluates on a stratified hold-out, pseudo-labels the comments that have no
# manual label, and saves the de-duplicated union to CSV.

base_drive_path = "/content/drive/MyDrive/wifo dosyalar/"

df_labeled = pd.read_csv(f"{base_drive_path}SON2_sadece_etiketli_veri.csv")
df_unlabeled = pd.read_csv(f"{base_drive_path}tumdataetiketsiz2.csv")

# Features are the cleaned comment text; target is the integer label.
X, y = df_labeled["comment_clean"], df_labeled["label"].astype(int)

# Stratified hold-out: 20% of the labelled rows are reserved for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Pipeline steps, built separately for readability.
tfidf_step = ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 2)))
forest_step = (
    "clf",
    RandomForestClassifier(n_estimators=150, class_weight='balanced', random_state=42),
)
model = Pipeline([tfidf_step, forest_step])

# Fit on the training split, then score the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# Rows of the unlabelled pool whose id is already manually labelled are skipped.
already_labeled = df_unlabeled['id'].isin(df_labeled['id'])
df_new = df_unlabeled[~already_labeled].copy()

# Predict labels for the remaining comments and tag each row's provenance.
df_new["label"] = model.predict(df_new["comment_clean"]).astype(int)
df_new["source"] = "predicted"
df_labeled["source"] = "manual"

# Manual rows are listed first, so keep="first" retains the manual label when
# the same comment text also occurs among the predicted rows.
df_final = pd.concat([df_labeled, df_new], ignore_index=True).drop_duplicates(
    subset="comment_clean", keep="first"
)

df_final.to_csv("veri_tahminli_birlesik_rf.csv", index=False)
print("\n📁 Tahminli CSV dosyası (Random Forest) kaydedildi!")
print("\n🧾 Etiket dağılımı:\n", df_final["label"].value_counts())


✅ Accuracy: 0.8231046931407943

🧮 Confusion Matrix:
 [[223  48]
 [ 50 233]]

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82       271
           1       0.83      0.82      0.83       283

    accuracy                           0.82       554
   macro avg       0.82      0.82      0.82       554
weighted avg       0.82      0.82      0.82       554


📁 Tahminli CSV dosyası (Random Forest) kaydedildi!

🧾 Etiket dağılımı:
 label
1.0    10619
0.0     3071
Name: count, dtype: int64


In [None]:
## Pretrained model (off-the-shelf Turkish sentiment BERT)

!pip install transformers

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from tqdm import tqdm

# Label every comment in the unlabelled CSV with a pretrained Turkish sentiment
# BERT checkpoint and save the result.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model and tokenizer.
model_name = "savasy/bert-base-turkish-sentiment-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.eval()  # inference mode: disables dropout

base_drive_path = "/content/drive/MyDrive/wifo dosyalar/"
df = pd.read_csv(base_drive_path + "tumdataetiketsiz2.csv")

# astype(str) guarantees the tokenizer only sees strings; note that a missing
# comment becomes the literal text "nan" and is still labelled.
comments = df["comment_clean"].astype(str).tolist()

# Run inference in batches instead of one comment per forward pass; a single
# comment at a time was the bottleneck (~8 comments/s in the recorded run).
batch_size = 32

# Collect predictions.
# NOTE(review): the meaning of each class index should be read from
# model.config.id2label. An earlier comment here claimed three classes
# (0 negative, 1 neutral, 2 positive); this checkpoint looks like a binary
# negative/positive classifier — TODO confirm before using the labels.
predicted_labels = []
with tqdm(total=len(comments), desc="Savasy BERT ile etiketleniyor") as progress:
    for start in range(0, len(comments), batch_size):
        batch = comments[start:start + batch_size]
        # padding=True pads to the longest comment in this batch; the attention
        # mask returned by the tokenizer makes the model ignore the padding.
        inputs = tokenizer(
            batch, return_tensors="pt", truncation=True, padding=True, max_length=512
        ).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        # argmax over logits equals argmax over softmax(logits), so the softmax
        # is skipped.
        predicted_labels.extend(torch.argmax(logits, dim=1).tolist())
        progress.update(len(batch))

# One prediction per input row, in the original order.
assert len(predicted_labels) == len(df)

df["label"] = predicted_labels
df.to_csv("tumdata_etiketli_savasy2.csv", index=False)
print("Etiketleme tamamlandı. Kaydedilen dosya: tumdata_etiketli_savasy2.csv")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Savasy BERT ile etiketleniyor:   5%|▌         | 954/17785 [02:42<34:57,  8.02it/s]