IMDB Dataset of 50K Movie Reviews uygulamamız için modellerimizi kaydedeceğiz

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

In [3]:
# AYARLAR

In [4]:
VOCAB_SIZE = 5000    # En çok kullanılan 5000 kelime

In [5]:
MAX_LEN = 200        # Yorumun max uzunluğu (kelime)

In [6]:
EMBEDDING_DIM = 64   # Vektör boyutu

In [7]:
SAMPLE_SIZE = 15000  # Hızlı eğitim için veriyi kısıtlayalım (İstersen artırabilirsin)

In [8]:
# Veri Yükleme ve Temizleme

In [9]:
df = pd.read_csv("IMDB Dataset.csv")

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [12]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [13]:
# Etiketleri sayıya çeviriyoruz (positive->1, negative->0)

In [14]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [15]:
# Temizleme Fonksiyonu

In [16]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text) # HTML taglerini sil
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Özel karakterleri sil
    return text

In [17]:
import re

In [18]:
df['review'] = df['review'].apply(clean_text)

In [19]:
# Hız için örnekler alıyoruz

In [20]:
df = df.sample(SAMPLE_SIZE, random_state=42)

In [21]:
X = df['review']

In [22]:
y = df['sentiment']

In [23]:
# Train/Test Ayrımı

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# TF-IDF + LOGISTIC REGRESSION

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
tfidf = TfidfVectorizer(max_features=VOCAB_SIZE)

In [29]:
X_train_tfidf = tfidf.fit_transform(X_train)

In [30]:
X_test_tfidf = tfidf.transform(X_test)

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
lr_model = LogisticRegression()

In [33]:
lr_model.fit(X_train_tfidf, y_train)

In [34]:
lr_model.score(X_train_tfidf, y_train)

0.9168333333333333

In [35]:
# Başarı Skoru

In [36]:
y_pred_lr = lr_model.predict(X_test_tfidf)

In [37]:
from sklearn.metrics import accuracy_score

In [38]:
print(f"Klasik Model Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")

Klasik Model Accuracy: 0.8797


In [39]:
# Save

In [40]:
import joblib

In [41]:
joblib.dump(lr_model, "nlp_classic_model.pkl")

['nlp_classic_model.pkl']

In [42]:
joblib.dump(tfidf, "nlp_tfidf.pkl")

['nlp_tfidf.pkl']

In [43]:
# DEEP LEARNING (LSTM)

In [44]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [45]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")

In [46]:
tokenizer.fit_on_texts(X_train)

In [47]:
X_train_seq = tokenizer.texts_to_sequences(X_train)

In [48]:
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [49]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [50]:
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')

In [51]:
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

In [52]:
# LSTM Mimarisi

In [53]:
from tensorflow.keras.models import Sequential

In [54]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [55]:
model = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [56]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [57]:
# Eğitim

In [58]:
model.fit(X_train_pad, y_train, epochs=62, batch_size=64, validation_data=(X_test_pad, y_test), verbose=1)

Epoch 1/62
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 60ms/step - accuracy: 0.5366 - loss: 0.6907 - val_accuracy: 0.5170 - val_loss: 0.7277
Epoch 2/62
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 55ms/step - accuracy: 0.5250 - loss: 0.6931 - val_accuracy: 0.5523 - val_loss: 0.6852
Epoch 3/62
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - accuracy: 0.5922 - loss: 0.6579 - val_accuracy: 0.5903 - val_loss: 0.6776
Epoch 4/62
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 59ms/step - accuracy: 0.6504 - loss: 0.6086 - val_accuracy: 0.7533 - val_loss: 0.5532
Epoch 5/62
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - accuracy: 0.6848 - loss: 0.5747 - val_accuracy: 0.7890 - val_loss: 0.5203
Epoch 6/62
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 54ms/step - accuracy: 0.7653 - loss: 0.5139 - val_accuracy: 0.7593 - val_loss: 0.5863
Epoch 7/62
[1m1

<keras.src.callbacks.history.History at 0x2390b896ae0>

In [59]:
# Save

In [60]:
model.save("nlp_dl_model.h5")



In [61]:
joblib.dump(tokenizer, "nlp_tokenizer.pkl")

['nlp_tokenizer.pkl']

lr_model.score: 0.92
Klasik Model Accuracy: 0.8797
DL LSTM accuracy: 0.99
val_accuracy: 0.82