In [1]:
# Mount Google Drive into the Colab filesystem so files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Path of the dataset CSV on the mounted Drive.
file_path = '/content/drive/My Drive/Colab Notebooks/Research Techniques II/DF_cleaned_v1.csv'

In [3]:
import pandas as pd

# Read the dataset into a DataFrame and preview its first rows.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,text,labels
0,"Başa düşmürəm, mən o vaxt onlardan necə gedə b...",0
1,Halal olsun admin heyyətinə. Tək tük səhifələr...,1
2,Dunya seyaheti etmek ucun limitsiz bilet ve pul,1
3,"O, bu məhəbbəti əməlləri ilə qazanmışdı”.",1
4,Maraqlıdır siğə söhbətini aralarında bölüşdürü...,0


In [4]:
# Show the column labels of the loaded frame.
print(df.columns)


Index(['text', 'labels'], dtype='object')


In [5]:
# Load the correction table: 'Column1' supplies the dictionary keys and
# 'Column2' the values. Rows with a blank cell load as NaN and are dropped,
# because a NaN key or value cannot be used as a word or a replacement.
corrections_df = pd.read_excel(
    '/content/drive/My Drive/Colab Notebooks/Research Techniques II/word_correction_dict_2.xlsx'
).dropna(subset=['Column1', 'Column2'])

# Keys are lowercased because the correction function looks matches up by
# their lowercased form; a key containing uppercase letters would never be found.
word_correction_dict_2 = dict(zip(
    corrections_df['Column1'].astype(str).str.lower(),
    corrections_df['Column2'].astype(str),
))

In [6]:
import re

def correct_words_case_insensitive(text, correction_dict):
    """Replace whole-word occurrences of dictionary keys in ``text``, ignoring case.

    Args:
        text: Value to correct. It is converted with ``str()`` before matching.
        correction_dict: Mapping of word -> replacement. Keys are matched
            case-insensitively regardless of how they are cased in the
            mapping. Entries whose key or value is not a string, and entries
            with an empty key, are ignored.

    Returns:
        The corrected string. When the matched text starts with an uppercase
        letter the replacement is passed through ``str.capitalize()``;
        otherwise the replacement is inserted as stored.
    """
    text = str(text)

    # Index replacements by lowercased key, since matches are looked up in
    # lowercased form. A key that is already lowercase takes precedence over
    # a differently cased duplicate.
    lookup = {}
    keys = []
    for key, value in correction_dict.items():
        # Skip entries that cannot be used as a pattern / replacement
        # (e.g. NaN from blank spreadsheet cells) and empty keys, which
        # would match the empty string.
        if not (isinstance(key, str) and key and isinstance(value, str)):
            continue
        keys.append(key)
        lowered = key.lower()
        if lowered not in lookup or key == lowered:
            lookup[lowered] = value

    # Nothing to replace: avoid compiling a pattern that matches empty strings.
    if not keys:
        return text

    def replace(match):
        original = match.group()
        corrected = lookup.get(original.lower())
        if corrected is None:
            return original
        # If the matched word starts with an uppercase letter, capitalize
        # the correction as well.
        if original[0].isupper():
            return corrected.capitalize()
        return corrected

    # Longest keys first so a multi-word key is preferred over a shorter key
    # that is its prefix. Lookarounds are used instead of \b so keys that
    # start or end with a non-word character are still matched as whole tokens.
    keys.sort(key=len, reverse=True)
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(k) for k in keys) + r')(?!\w)',
        flags=re.IGNORECASE,
    )
    return pattern.sub(replace, text)

In [7]:
# Run the dictionary-based word correction over every entry of the text column;
# the dictionary is forwarded to the function as its second positional argument.
df['text'] = df['text'].apply(
    correct_words_case_insensitive,
    args=(word_correction_dict_2,),
)

In [8]:
# Count how many rows carry each label value.
df['labels'].value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
0,13993
1,13939


In [9]:
import re

def basic_cleaning(text):
    """Strip URLs, runs of closing parentheses and redundant whitespace.

    The substitutions are applied in order:
      1. remove web addresses (``http://``, ``https://`` or ``www.`` prefixes);
      2. remove every run of two or more ``)`` characters;
      3. collapse each run of two or more whitespace characters to one space.

    Returns the result with leading and trailing whitespace removed.
    """
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'\){2,}', ''),
        (r'\s{2,}', ' '),
    )

    cleaned = text
    for regex, replacement in substitutions:
        cleaned = re.sub(regex, replacement, cleaned)

    return cleaned.strip()

# Apply the basic cleaning (basic_cleaning itself uses no dictionary).
# Note: df['text'] has already been through the dictionary correction above.
# assign() returns a new frame, so df itself is left unchanged.
df_cleaned = df.assign(text=df['text'].apply(basic_cleaning))

In [24]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model.
# Note: the checkpoint loaded here is paraphrase-multilingual-mpnet-base-v2, not LaBSE.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Take the cleaned texts as a plain Python list
texts = df_cleaned['text'].tolist()

# Compute the embeddings (the batch size can be increased if desired)
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/437 [00:00<?, ?it/s]

In [25]:
import numpy as np

# Make sure the embeddings are a NumPy array. asarray returns the input
# unchanged when it already is an ndarray, avoiding a full copy of the matrix.
embeddings = np.asarray(embeddings)

# Check the shape (number of rows, embedding dimension)
print(embeddings.shape)

(27932, 768)


In [26]:
# Insert a length-1 axis at position 1 so each embedding is presented as a one-step sequence.
X_seq = np.expand_dims(embeddings, axis=1)

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test split; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_seq, df_cleaned['labels'].values, test_size=0.2, random_state=42)

In [28]:
from keras.models import Sequential
from keras.layers import Bidirectional, GRU, Dense, Input

# Binary classifier: a bidirectional GRU over the input followed by one sigmoid unit.
# NOTE: this rebinds `model`, which until now referred to the SentenceTransformer encoder.
model = Sequential()
model.add(Input(shape=(1, X_seq.shape[2])))  # (seq_len=1, embedding_dim=768)
model.add(Bidirectional(GRU(64)))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
# Train for 5 epochs with batches of 32.
# NOTE(review): the test split is passed as validation data, so the val_* metrics
# are computed on the same samples that the final evaluation uses.
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5
[1m699/699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.7609 - loss: 0.4921 - val_accuracy: 0.7963 - val_loss: 0.4322
Epoch 2/5
[1m699/699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7916 - loss: 0.4347 - val_accuracy: 0.7949 - val_loss: 0.4333
Epoch 3/5
[1m699/699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7908 - loss: 0.4311 - val_accuracy: 0.7970 - val_loss: 0.4278
Epoch 4/5
[1m699/699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7958 - loss: 0.4262 - val_accuracy: 0.7947 - val_loss: 0.4282
Epoch 5/5
[1m699/699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7997 - loss: 0.4167 - val_accuracy: 0.7943 - val_loss: 0.4302


<keras.src.callbacks.history.History at 0x7c8e53261690>

In [30]:
# Turn the sigmoid outputs into 0/1 predictions using a 0.5 threshold.
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Print per-class precision, recall and F1 for the test split.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.78      0.81      0.79      2743
           1       0.81      0.78      0.79      2844

    accuracy                           0.79      5587
   macro avg       0.79      0.79      0.79      5587
weighted avg       0.80      0.79      0.79      5587

