In [1]:
# train_nlp.py
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# --- AYARLAR ---
VOCAB_SIZE = 5000    # En çok kullanılan 5000 kelime
MAX_LEN = 200        # Yorumun max uzunluğu (kelime)
EMBEDDING_DIM = 64   # Vektör boyutu
SAMPLE_SIZE = 15000  # Hızlı eğitim için veriyi kısıtlayalım (İstersen artırabilirsin)

# 1. Veri Yükleme ve Temizleme
print("Veri yükleniyor...")
df = pd.read_csv("IMDB Dataset.csv")

# Etiketleri sayıya çevir (positive->1, negative->0)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Temizleme Fonksiyonu
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text) # HTML taglerini sil
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Özel karakterleri sil
    return text

df['review'] = df['review'].apply(clean_text)

# Hız için örneklem al
df = df.sample(SAMPLE_SIZE, random_state=42)
X = df['review']
y = df['sentiment']

# Eğitim/Test Ayrımı
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- MODEL 1: KLASİK (TF-IDF + LOGISTIC REGRESSION) ---
print("Model 1 (Klasik) eğitiliyor...")
tfidf = TfidfVectorizer(max_features=VOCAB_SIZE)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

# Başarı Skoru
y_pred_lr = lr_model.predict(X_test_tfidf)
print(f"Klasik Model Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")

# Kaydet
joblib.dump(lr_model, "nlp_classic_model.pkl")
joblib.dump(tfidf, "nlp_tfidf.pkl")

# --- MODEL 2: DEEP LEARNING (LSTM) ---
print("Model 2 (Deep Learning) eğitiliyor...")
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# LSTM Mimarisi
model = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Eğit
model.fit(X_train_pad, y_train, epochs=77, batch_size=64, validation_data=(X_test_pad, y_test), verbose=1)

# Kaydet
model.save("nlp_dl_model.h5")
joblib.dump(tokenizer, "nlp_tokenizer.pkl")

print("Tüm modeller ve dosyalar başarıyla kaydedildi!")

  if not hasattr(np, "object"):


Veri yükleniyor...
Model 1 (Klasik) eğitiliyor...
Klasik Model Accuracy: 0.8797
Model 2 (Deep Learning) eğitiliyor...




Epoch 1/77
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 59ms/step - accuracy: 0.5096 - loss: 0.6932 - val_accuracy: 0.5040 - val_loss: 0.6909
Epoch 2/77
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 55ms/step - accuracy: 0.5638 - loss: 0.6803 - val_accuracy: 0.5610 - val_loss: 0.6828
Epoch 3/77
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 60ms/step - accuracy: 0.6193 - loss: 0.6723 - val_accuracy: 0.5810 - val_loss: 0.6711
Epoch 4/77
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.5805 - loss: 0.6717 - val_accuracy: 0.6167 - val_loss: 0.6580
Epoch 5/77
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.6144 - loss: 0.6465 - val_accuracy: 0.6107 - val_loss: 0.6344
Epoch 6/77
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.6076 - loss: 0.6566 - val_accuracy: 0.5650 - val_loss: 0.6741
Epoch 7/77
[1m1



Tüm modeller ve dosyalar başarıyla kaydedildi!
