In [None]:
import numpy as np
import pandas as pd

# Load the train and validation splits and stack them into a single frame.
# NOTE(review): both file names start with a dotless "ı" (U+0131) rather than an
# ASCII "i" -- confirm this matches the files on disk.
train_df=pd.read_csv('ımdb_reviews_train.csv')
valid_df=pd.read_csv('ımdb_reviews_valid.csv')
# Each split is read with its own default 0..n-1 index, so a plain concat would
# leave duplicate row labels in `df`; ignore_index=True renumbers the rows.
df=pd.concat([train_df,valid_df],sort=False,ignore_index=True)

In [None]:
import re

def clean_text(text):
    """Normalize a raw review string.

    Processing order:
      1. HTML tags are replaced with a space.
      2. URLs (``http://``, ``https://`` or ``www.`` prefixed) are replaced with a space.
      3. The text is lowercased.
      4. Every character that is neither a word character nor whitespace is removed.
      5. Whitespace runs are collapsed to single spaces and the ends are trimmed.

    Args:
        text: Raw review text. Non-string values (for example the float NaN that
            pandas uses for a missing CSV cell) are treated as empty text.

    Returns:
        The cleaned, lowercased string with words separated by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Substitute a space (not '') so words separated only by markup such as
    # "good<br />movie" are not glued together.
    text = re.sub(r'<.*?>', ' ', text)

    # Require the scheme separator / "www." so ordinary words that merely
    # contain "http" or "www" are left alone.
    text = re.sub(r'(?:https?://|www\.)\S+', ' ', text)

    text = text.lower()

    # Drop punctuation and other symbols.
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse whitespace last: removing tags, URLs or free-standing punctuation
    # ("a - b") can leave consecutive spaces behind.
    return ' '.join(text.split())

In [None]:
# Run every review through clean_text, then preview the first rows.
df['review'] = df['review'].apply(clean_text)
print(df['review'].head())

In [None]:
import pandas as pd

# Helper that removes repeated words.
def remove_duplicate_words(text):
    """Drop repeated words from ``text``, keeping the first occurrence of each.

    Words are the whitespace-separated tokens of ``text``. Tokens are compared
    case-insensitively, and the token that survives keeps its original casing.

    Args:
        text: Input string.

    Returns:
        The surviving tokens, in their original order, joined by single spaces.
    """
    first_seen = {}
    for token in text.split():
        # setdefault stores a token only the first time its lowercase key shows
        # up; dict insertion order preserves the original word order.
        first_seen.setdefault(token.lower(), token)
    return " ".join(first_seen.values())

# Helper that removes repeated sentences.
def remove_duplicate_sentences(text):
    """Drop repeated sentences from ``text``, keeping the first occurrence of each.

    Sentences are the pieces obtained by splitting ``text`` on ``'.'``. Each
    piece is stripped of surrounding whitespace and compared case-insensitively.
    Empty pieces (produced by a trailing period or by runs such as ``"..."``)
    are skipped so they do not show up as stray ``". "`` separators.

    Args:
        text: Input string.

    Returns:
        The surviving sentences joined by ``". "``. A final period is kept when
        the input ended with one and at least one sentence survived.
    """
    seen = set()
    kept = []
    for fragment in text.split('.'):
        sentence = fragment.strip()
        key = sentence.lower()
        if not sentence or key in seen:
            continue
        seen.add(key)
        kept.append(sentence)

    joined = ". ".join(kept)
    # Restore the terminal period that the split consumed.
    if kept and text.rstrip().endswith('.'):
        joined += '.'
    return joined

In [None]:
# Remove repeated words from every review and preview the result.
# Only remove_duplicate_words is applied here; remove_duplicate_sentences is not called.
df['review'] = df['review'].apply(remove_duplicate_words)
print(df['review'].head())

In [None]:
# Disabled cell: everything below is a string literal, so none of it executes and
# `correct_spelling_autocorrect` is NOT defined when the notebook runs.
# The text describes a helper that passes a review through an
# `autocorrect.Speller(lang='en')` instance.
"""import pandas as pd
from autocorrect import Speller

# Speller nesnesi oluşturma
spell = Speller(lang='en')

# Yazım hatalarını düzeltme fonksiyonu
def correct_spelling_autocorrect(text):
    return spell(text)"""

In [None]:
# Correct spelling mistakes across all reviews.
# `correct_spelling_autocorrect` is only defined inside the disabled
# (triple-quoted) cell above, so an unconditional call raises NameError on a
# fresh kernel. Guard the call so Restart & Run All keeps going.
if 'correct_spelling_autocorrect' in globals():
    df['review'] = df['review'].apply(correct_spelling_autocorrect)
    print(df['review'].head(5))
else:
    print("Skipping spelling correction: correct_spelling_autocorrect is not defined.")

In [None]:
# Disabled cell: everything below is a string literal, so none of it executes and
# `correct_grammar` is NOT defined when the notebook runs.
# The text describes a helper that checks a review with
# `language_tool_python.LanguageTool('en-US')` and applies the suggested corrections.
"""import language_tool_python

# LanguageTool aracı
tool = language_tool_python.LanguageTool('en-US')

# Gramer hatalarını düzeltme fonksiyonu
def correct_grammar(text):
    matches = tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, matches)
    return corrected_text"""


In [None]:
# Apply grammar correction to all reviews.
# `correct_grammar` is only defined inside the disabled (triple-quoted) cell
# above, so an unconditional call raises NameError on a fresh kernel. Guard the
# call so Restart & Run All keeps going.
if 'correct_grammar' in globals():
    df['review'] = df['review'].apply(correct_grammar)
    print(df['review'].head(5))
else:
    print("Skipping grammar correction: correct_grammar is not defined.")

In [None]:
# Disabled cell: everything below is a string literal, so nothing is imported or
# downloaded. The text imports nltk / wordnet and downloads the 'punkt',
# 'wordnet' and 'averaged_perceptron_tagger' NLTK resources.
"""import pandas as pd
import nltk
from nltk.corpus import wordnet

# Gerekli NLTK paketlerini indirme
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')"""

In [None]:
# Disabled cell: everything below is a string literal, so `get_synonym` and
# `replace_with_synonyms` are NOT defined when the notebook runs.
# As written in the text:
#   - get_synonym(word) returns the name of the first lemma of the first WordNet
#     synset, or None when there is no synset or the lemma equals `word`.
#   - replace_with_synonyms(text) tokenizes and POS-tags the text, swaps tokens
#     whose tag starts with NN/VB/JJ/RB for their synonym when one is found, and
#     joins the tokens with single spaces.
# It relies on `nltk` and `wordnet`, which are imported only in the disabled cell above.
"""# Kelimenin sinonimini bulma fonksiyonu
def get_synonym(word):
    synonyms = wordnet.synsets(word)
    if synonyms:
        synonym = synonyms[0].lemmas()[0].name()
        return synonym if synonym != word else None
    return None

# Cümledeki kelimeleri sinonimleri ile değiştirme fonksiyonu
def replace_with_synonyms(text):
    words = nltk.word_tokenize(text)
    pos_tagged = nltk.pos_tag(words)
    new_words = []
    
    for word, tag in pos_tagged:
        # Yalnızca isimler, fiiller, sıfatlar ve zarflar üzerinde değişiklik yapalım
        if tag.startswith('NN') or tag.startswith('VB') or tag.startswith('JJ') or tag.startswith('RB'):
            synonym = get_synonym(word)
            if synonym:
                new_words.append(synonym)
            else:
                new_words.append(word)
        else:
            new_words.append(word)
    return ' '.join(new_words)
"""

In [None]:
# Replace words with synonyms in every review.
# `replace_with_synonyms` is only defined inside the disabled (triple-quoted)
# cell above, so an unconditional call raises NameError on a fresh kernel. Guard
# the call so Restart & Run All keeps going.
if 'replace_with_synonyms' in globals():
    df['review'] = df['review'].apply(replace_with_synonyms)
    print(df['review'].head(5))
else:
    print("Skipping synonym replacement: replace_with_synonyms is not defined.")

In [None]:
# Disabled cell: everything below is a string literal, so nothing is imported or
# downloaded. The text imports torch (nn, optim, DataLoader, Dataset), pandas,
# nltk's word_tokenize and random, and downloads the 'punkt' NLTK resource.
"""import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import random

# NLTK veri setlerini indirme
nltk.download('punkt')"""

In [None]:
# Disabled cell: everything below is a string literal, so `ReviewDataset`,
# `Generator` and `Discriminator` are NOT defined when the notebook runs.
# As written in the text:
#   - ReviewDataset wraps an indexable `reviews` object and returns `reviews[idx]` unchanged.
#   - Generator: Linear(input_size, hidden_size) -> ReLU -> Linear(hidden_size, output_size) -> sigmoid.
#   - Discriminator: Linear(input_size, hidden_size) -> ReLU -> Linear(hidden_size, 1) -> sigmoid.
# It relies on `nn`, `torch` and `Dataset`, which are imported only in the disabled cell above.
"""# Dataset Sınıfı
class ReviewDataset(Dataset):
    def __init__(self, reviews):
        self.reviews = reviews

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        return self.reviews[idx]

# Generator Ağı
class Generator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Generator, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

# Discriminator Ağı
class Discriminator(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x
"""

In [None]:
# Disabled cell: everything below is a string literal, so no model is built or
# trained and 'generated_reviews.csv' is not written.
# The text sets hyperparameters, builds a Generator/Discriminator pair with Adam
# optimizers and BCELoss, trains them over a DataLoader of `df['review']`, then
# writes 1000 generator outputs to 'generated_reviews.csv'.
# NOTE(review) before re-enabling:
#   - `reviews` is `df['review'].tolist()`, i.e. the cleaned review strings, yet
#     the loop calls `real_data.size(0)` and `real_data.float()`. The text would
#     presumably need to be vectorized to `output_size` features first -- confirm.
#   - `batch_size` is reassigned inside the loop, shadowing the hyperparameter.
#   - Each saved "review" is the raw generator output tensor, not decoded text.
"""# Hiperparametreler
input_size = 100  # Rastgele gürültü vektör boyutu
hidden_size = 128  # Gizli katman boyutu
output_size = 200  # Üretilecek cümle vektör boyutu
batch_size = 32
learning_rate = 0.0002
num_epochs = 1000

# Model örnekleri
generator = Generator(input_size, hidden_size, output_size)
discriminator = Discriminator(output_size, hidden_size)

# Optimizasyon
optimizer_g = optim.Adam(generator.parameters(), lr=learning_rate)
optimizer_d = optim.Adam(discriminator.parameters(), lr=learning_rate)

# Kayıp fonksiyonu
criterion = nn.BCELoss()

# Veriyi yükleme
reviews = df['review'].tolist()
dataset = ReviewDataset(reviews)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Eğitim döngüsü
for epoch in range(num_epochs):
    for i, real_data in enumerate(dataloader):
        batch_size = real_data.size(0)

        # Discriminator için gerçek ve sahte etiketler
        real_labels = torch.ones(batch_size, 1)
        fake_labels = torch.zeros(batch_size, 1)

        # Discriminator'ı eğitme
        optimizer_d.zero_grad()
        real_data = real_data.float()
        real_output = discriminator(real_data)
        real_loss = criterion(real_output, real_labels)
        real_loss.backward()

        noise = torch.randn(batch_size, input_size)
        fake_data = generator(noise)
        fake_output = discriminator(fake_data.detach())
        fake_loss = criterion(fake_output, fake_labels)
        fake_loss.backward()
        optimizer_d.step()

        # Generator'ı eğitme
        optimizer_g.zero_grad()
        fake_output = discriminator(fake_data)
        generator_loss = criterion(fake_output, real_labels)
        generator_loss.backward()
        optimizer_g.step()

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], D Loss: {real_loss.item()+fake_loss.item()}, G Loss: {generator_loss.item()}')

# Üretilmiş cümleleri kaydetme
generated_reviews = []
for i in range(1000):  # 1000 tane cümle üretelim
    noise = torch.randn(1, input_size)
    generated_review = generator(noise)
    generated_reviews.append(generated_review)

# Üretilen cümleleri CSV dosyasına kaydetme
generated_df = pd.DataFrame({'review': generated_reviews})
generated_df.to_csv('generated_reviews.csv', index=False)"""


In [None]:
# Disabled cell: everything below is a string literal, so `inject_noise` is NOT
# defined when the notebook runs.
# As written in the text, inject_noise(text, noise_level=0.05) tokenizes the
# text, inserts int(len(words) * noise_level) random 3-letter lowercase "words"
# at random positions within the original word count, and joins the tokens with
# single spaces. No random seed is set in the text, so results would vary between runs.
"""import random
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Gürültü ekleme fonksiyonu
def inject_noise(text, noise_level=0.05):
    words = word_tokenize(text)
    num_words = len(words)
    noise_words = int(num_words * noise_level)
    
    # Rastgele yerlerden kelimeler ekleyelim
    for _ in range(noise_words):
        position = random.randint(0, num_words - 1)
        noise_word = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=3))  # 3 harfli rastgele kelime
        words.insert(position, noise_word)
    
    return ' '.join(words)"""

In [None]:
# Inject random noise words into every review.
# `inject_noise` is only defined inside the disabled (triple-quoted) cell above,
# so an unconditional call raises NameError on a fresh kernel. Guard the call so
# Restart & Run All keeps going.
if 'inject_noise' in globals():
    df['review'] = df['review'].apply(inject_noise)
    print(df['review'].head())
else:
    print("Skipping noise injection: inject_noise is not defined.")

In [None]:
# Disabled cell: everything below is a string literal, so no model is built or
# trained and 'emotion_generated_reviews.csv' is not written.
# The text defines EmotionGenerator / EmotionDiscriminator, trains them with
# CrossEntropyLoss, then writes 1000 generator outputs to 'emotion_generated_reviews.csv'.
# NOTE(review) before re-enabling:
#   - EmotionGenerator.forward ends in fc3, which outputs len(emotions) == 4
#     features, but EmotionDiscriminator is built with input_size=output_size=200.
#     `discriminator(fake_data)` would therefore receive 4 features where fc1
#     expects 200.
#   - The loop unpacks `(real_data, labels)` from `dataloader`, which is created
#     only in the earlier disabled GAN cell, where each item is a single review.
#   - The targets are all-ones / all-zeros tensors of shape (batch, len(emotions)).
#     Presumably CrossEntropyLoss treats float targets as class probabilities --
#     confirm this is the intended objective.
#   - `pd` is used at the end but is not imported inside this text.
"""import torch
import torch.nn as nn
import torch.optim as optim

# Duygusal tonları temsil eden sınıflar
emotions = ['happy', 'angry', 'sad', 'neutral']

# Geliştirilmiş Generator
class EmotionGenerator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(EmotionGenerator, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.fc3 = nn.Linear(output_size, len(emotions))  # Duygusal tonlar için çıktı katmanı

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Geliştirilmiş Discriminator
class EmotionDiscriminator(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EmotionDiscriminator, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, len(emotions))  # Duygusal tonlar için çıktı katmanı

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Hiperparametreler
input_size = 200  # Girdi boyutu
hidden_size = 128  # Gizli katman boyutu
output_size = 200  # Çıktı boyutu
batch_size = 32
learning_rate = 0.0002
num_epochs = 1000

# Model örnekleri
generator = EmotionGenerator(input_size, hidden_size, output_size)
discriminator = EmotionDiscriminator(output_size, hidden_size)

# Optimizasyon
optimizer_g = optim.Adam(generator.parameters(), lr=learning_rate)
optimizer_d = optim.Adam(discriminator.parameters(), lr=learning_rate)

# Kayıp fonksiyonu
criterion = nn.CrossEntropyLoss()

# Eğitim döngüsü
for epoch in range(num_epochs):
    for i, (real_data, labels) in enumerate(dataloader):
        batch_size = real_data.size(0)

        # Discriminator için gerçek ve sahte etiketler
        real_labels = torch.ones(batch_size, len(emotions))
        fake_labels = torch.zeros(batch_size, len(emotions))

        # Discriminator'ı eğitme
        optimizer_d.zero_grad()
        real_data = real_data.float()
        real_output = discriminator(real_data)
        real_loss = criterion(real_output, real_labels)
        real_loss.backward()

        noise = torch.randn(batch_size, input_size)
        fake_data = generator(noise)
        fake_output = discriminator(fake_data.detach())
        fake_loss = criterion(fake_output, fake_labels)
        fake_loss.backward()
        optimizer_d.step()

        # Generator'ı eğitme
        optimizer_g.zero_grad()
        fake_output = discriminator(fake_data)
        generator_loss = criterion(fake_output, real_labels)
        generator_loss.backward()
        optimizer_g.step()

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], D Loss: {real_loss.item()+fake_loss.item()}, G Loss: {generator_loss.item()}')

# Üretilmiş cümleleri kaydetme
generated_reviews = []
for i in range(1000):  # 1000 tane cümle üretelim
    noise = torch.randn(1, input_size)
    generated_review = generator(noise)
    generated_reviews.append(generated_review)

generated_df = pd.DataFrame({'review': generated_reviews})
generated_df.to_csv("emotion_generated_reviews.csv", index=False)
print(generated_df.head())"""

In [None]:
import torch
import pandas as pd
# Disabled block: everything below is a string literal, so no model is loaded
# and 'classified_emotional_sentences.csv' is not written.
# The text loads a generator and discriminator from .pth files, generates 1000
# outputs from random noise, labels each with the discriminator's argmax over
# ['happy', 'angry', 'sad', 'neutral'], and saves columns 'sentence' / 'emotion'.
# NOTE(review) before re-enabling:
#   - No cell visible in this notebook saves 'emotion_generator.pth' or
#     'emotion_discriminator.pth' -- confirm where these files come from.
#   - torch.load presumably unpickles the file; only load checkpoints from a trusted source.
#   - `input_size` is defined only inside the disabled training cells.
#   - The 'sentence' column holds numeric generator outputs, not text.
"""
# Modeli yükleyin
generator = torch.load('emotion_generator.pth')
discriminator = torch.load('emotion_discriminator.pth')

# Üretim için fonksiyon
def generate_sentences(num_sentences=1000):
    generator.eval()
    sentences = []
    for _ in range(num_sentences):
        noise = torch.randn(1, input_size)
        generated_sentence = generator(noise).detach().numpy()
        sentences.append(generated_sentence)
    return sentences

# Cümleleri oluşturun
generated_sentences = generate_sentences()

# Discriminator kullanarak duygusal tonları belirleme
def classify_emotion(sentences):
    discriminator.eval()
    emotions = ['happy', 'angry', 'sad', 'neutral']
    classified_emotions = []
    for sentence in sentences:
        sentence_tensor = torch.tensor(sentence).float()
        output = discriminator(sentence_tensor)
        emotion = emotions[torch.argmax(output)]
        classified_emotions.append(emotion)
    return classified_emotions

# Cümlelerin duygusal tonlarını belirleyin
emotional_tones = classify_emotion(generated_sentences)

# Sonuçları kaydedin
results_df = pd.DataFrame({'sentence': generated_sentences, 'emotion': emotional_tones})
results_df.to_csv("classified_emotional_sentences.csv", index=False)"""


In [None]:
# Disabled cell: everything below is a string literal, so no data is loaded or tokenized.
# The text reads 'classified_emotional_sentences.csv', label-encodes the targets,
# makes an 80/20 train/test split (random_state=42), fits a 10000-word Keras
# Tokenizer on the training texts and pads sequences to length 100.
# NOTE(review) before re-enabling: this text reads columns 'review' and
# 'sentiment', but the disabled cell that writes
# 'classified_emotional_sentences.csv' uses columns 'sentence' and 'emotion'.
# Confirm which file and columns are intended.
"""import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Veri yükleme
data = pd.read_csv("classified_emotional_sentences.csv")

# Metin ve etiketleri ayırma
texts = data['review'].values
labels = data['sentiment'].values

# Etiketleri sayısal hale getirme
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Veriyi eğitim ve test setlerine ayırma
X_train, X_test, y_train, y_test = train_test_split(texts, encoded_labels, test_size=0.2, random_state=42)

# Tokenizer ve dizilere dönüştürme
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, maxlen=100)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)"""


In [None]:
# Disabled cell: everything below is a string literal, so no model is built or trained.
# The text builds Embedding(10000, 128, input_length=100) -> LSTM(128) -> LSTM(64)
# -> Dense(1, sigmoid), compiles it with Adam and binary cross-entropy, trains
# for 5 epochs (batch 64, 10% validation) and prints the test accuracy.
# NOTE(review): the single sigmoid output with binary cross-entropy presumably
# assumes the encoded labels have exactly two classes -- confirm against the label column.
"""from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Modeli oluşturma
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
model_lstm.add(LSTM(128, return_sequences=True))
model_lstm.add(LSTM(64))
model_lstm.add(Dense(1, activation='sigmoid'))

# Modeli derleme
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Modeli eğitme
history = model_lstm.fit(X_train_padded, y_train, epochs=5, batch_size=64, validation_split=0.1)

# Test seti üzerinde değerlendirme
loss, accuracy = model_lstm.evaluate(X_test_padded, y_test)
print(f'LSTM Model Test Accuracy: {accuracy:.4f}')
"""

In [None]:
# Disabled cell: everything below is a string literal, so nothing is loaded or trained.
# The text reloads 'classified_emotional_sentences.csv', encodes its 'emotion'
# labels with the existing `label_encoder`, tokenizes and pads its 'sentence'
# column with the existing `tokenizer`, continues training `model_lstm` on it
# and prints the test accuracy.
# NOTE(review) before re-enabling:
#   - `label_encoder` is fitted on `data['sentiment']` in the disabled prep cell;
#     transforming 'emotion' values it has not seen would presumably fail -- confirm.
#   - `model_lstm.fit` is called on the already-trained model, so this continues
#     training rather than starting a fresh model for comparison.
#   - The 'sentence' column is written from generator outputs, not text, so
#     tokenizing it as text is questionable.
"""# GAN ile üretilmiş veri yükleme
gan_data = pd.read_csv("classified_emotional_sentences.csv")

# Metin ve etiketleri ayırma
gan_texts = gan_data['sentence'].values
gan_labels = gan_data['emotion'].values

# Etiketleri sayısal hale getirme
gan_encoded_labels = label_encoder.transform(gan_labels)

# Veriyi eğitim ve test setlerine ayırma
X_gan_train, X_gan_test, y_gan_train, y_gan_test = train_test_split(gan_texts, gan_encoded_labels, test_size=0.2, random_state=42)

# Tokenizer ve dizilere dönüştürme
X_gan_train_sequences = tokenizer.texts_to_sequences(X_gan_train)
X_gan_test_sequences = tokenizer.texts_to_sequences(X_gan_test)

X_gan_train_padded = pad_sequences(X_gan_train_sequences, maxlen=100)
X_gan_test_padded = pad_sequences(X_gan_test_sequences, maxlen=100)

# Modeli eğitme
history_gan = model_lstm.fit(X_gan_train_padded, y_gan_train, epochs=5, batch_size=64, validation_split=0.1)

# Test seti üzerinde değerlendirme
gan_loss, gan_accuracy = model_lstm.evaluate(X_gan_test_padded, y_gan_test)
print(f'LSTM Model GAN Üretilmiş Veriler ile Test Accuracy: {gan_accuracy:.4f}')"""
