In [1]:
import nltk
# Keep NLTK resources in a dedicated directory and make NLTK search it.
NLTK_DATA_DIR = '/GANsAugmentation-SentimentAnalysis/nltk_data'
nltk.data.path.append(NLTK_DATA_DIR)

# Fetch the NLTK resources into that same directory.
# NOTE(review): 'all' pulls every NLTK package; the cells visible here only use
# the tokenizer and WordNet data - confirm the full download is really needed.
nltk.download('all', download_dir=NLTK_DATA_DIR)

In [2]:
import numpy as np
import pandas as pd

# Read the two review CSVs and stack them into one working frame.
# NOTE(review): the row index is not reset by the concat, so index labels
# coming from the two files can repeat.
train_df = pd.read_csv('ımdb_reviews_train.csv')
valid_df = pd.read_csv('ımdb_reviews_valid.csv')
frames = [train_df, valid_df]
df = pd.concat(frames, sort=False)

In [3]:
# Dataset overview: shape, schema, label distribution, missing values per column.
divider = "###############################################"

print("veri seti boyutu:", df.shape)
print(divider)
# info() writes its report itself; the surrounding print then shows its None return.
print(df.info())
print(divider)
print(df['sentiment'].value_counts())
print(divider)
print(df.isnull().sum())

veri seti boyutu: (74999, 2)
###############################################
<class 'pandas.core.frame.DataFrame'>
Index: 74999 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     74999 non-null  object
 1   sentiment  74999 non-null  object
dtypes: object(2)
memory usage: 1.7+ MB
None
###############################################
sentiment
positive      37488
negative      37487
1;               11
0;                9
negative;;        1
positive"         1
positive;;        1
0"                1
Name: count, dtype: int64
###############################################
review       0
sentiment    0
dtype: int64


In [4]:
# Map each malformed label found in the 'sentiment' column onto its canonical class.
label_fixes = {
    **dict.fromkeys(['1;', 'positive"', 'positive;;'], 'positive'),
    **dict.fromkeys(['0;', 'negative;;', '0"'], 'negative'),
}
df['sentiment'] = df['sentiment'].replace(label_fixes)

# Check the resulting label distribution.
print(df['sentiment'].value_counts())


sentiment
positive    37501
negative    37498
Name: count, dtype: int64


In [5]:
import re

def clean_text(text):
    """Normalize a raw review into lowercase, punctuation-free, single-spaced text.

    HTML tags and URLs are replaced by a space (not deleted outright) so the
    words on either side of them stay separate. Whitespace is collapsed as the
    very last step so that no earlier removal can leave double spaces behind.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string: lowercase, without punctuation, words separated by
        exactly one space, no leading or trailing whitespace.
    """
    # 1. HTML tags -> space, so "movie.<br />The" does not collapse into "moviethe".
    text = re.sub(r'<.*?>', ' ', text)

    # 2. URLs -> space. The word boundary keeps "www" inside ordinary words
    #    (e.g. "awwww") from being treated as the start of a URL.
    text = re.sub(r'\b(?:http|www)\S+', ' ', text)

    # 3. Lowercase.
    text = text.lower()

    # 4. Drop punctuation (anything that is neither a word character nor whitespace).
    text = re.sub(r'[^\w\s]', '', text)

    # 5. Collapse every whitespace run to one space and trim both ends.
    return ' '.join(text.split())

In [6]:
# Clean every review, then preview the first rows.
df['review'] = df['review'].apply(clean_text)
print(df['review'].head())

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically theres a family where a little boy j...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object


In [7]:
# Remove repeated words, keeping only the first occurrence of each
def remove_duplicate_words(text):
    """Return `text` with every repeated word dropped.

    Words are compared case-insensitively; the first occurrence is kept with
    its original casing, and the result is joined with single spaces.
    """
    first_seen = {}
    for token in text.split():
        # setdefault stores the token only the first time its lowercase key shows up
        first_seen.setdefault(token.lower(), token)
    return " ".join(first_seen.values())

# Remove repeated sentences, keeping only the first occurrence of each
def remove_duplicate_sentences(text):
    """Return `text` with repeated sentences dropped.

    The text is split on '.', each fragment is stripped, and fragments are
    compared case-insensitively; the first occurrence of each is kept. Empty
    fragments (from a trailing period or consecutive periods) are skipped so
    the result never contains stray ". " sequences. A closing period is
    restored when the input ended with one.

    Args:
        text: Input string whose sentences are separated by '.'.

    Returns:
        The surviving sentences joined by ". ", or "" when none remain.
    """
    seen = set()
    kept = []
    for fragment in text.split('.'):
        sentence = fragment.strip()
        if not sentence:
            # Artifact of splitting on a trailing or doubled period.
            continue
        key = sentence.lower()
        if key not in seen:
            seen.add(key)
            kept.append(sentence)

    if not kept:
        return ""

    joined = ". ".join(kept)
    # Preserve the terminal period of the original text.
    if text.rstrip().endswith('.'):
        joined += "."
    return joined

In [8]:
# Remove repeated words from every review (only the word-level deduplication is
# applied here), then preview the first rows
df['review'] = df['review'].apply(remove_duplicate_words)
print(df['review'].head())

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically theres a family where little boy jak...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object


In [13]:
import language_tool_python

# Shared LanguageTool checker for US English, used for spelling and grammar fixes
tool = language_tool_python.LanguageTool('en-US')


def correct_spelling_and_grammar(text):
    """Return `text` with the corrections reported by LanguageTool applied.

    The module-level `tool` checker is run over `text`, and the matches it
    reports are handed to `language_tool_python.utils.correct`.
    """
    return language_tool_python.utils.correct(text, tool.check(text))


In [15]:
# Run the spelling/grammar correction on every review, then preview the first rows.
df['review'] = df['review'].apply(correct_spelling_and_grammar)
print(df['review'].head())

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically theres a family where little boy jak...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object


In [16]:
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Synonym replacement
def replace_with_synonyms(text):
    """Replace each token of `text` with the first lemma of its first WordNet synset.

    Tokens for which WordNet returns no synsets are kept unchanged. Multi-word
    lemma names have their underscores turned into spaces so that no
    underscore-joined tokens end up in the output.

    NOTE(review): the first synset is taken regardless of part of speech or
    context, so a replacement may not preserve the original meaning - confirm
    this is acceptable for the augmentation.

    Args:
        text: Input string; tokenized with `word_tokenize`.

    Returns:
        The tokens (replaced where possible) joined by single spaces.
    """
    replaced = []
    for word in word_tokenize(text):
        synsets = wordnet.synsets(word)
        if synsets:
            lemma_name = synsets[0].lemmas()[0].name()
            # WordNet writes multi-word lemmas with underscores (e.g. "ice_cream").
            replaced.append(lemma_name.replace('_', ' '))
        else:
            replaced.append(word)
    return " ".join(replaced)

In [18]:
# Run the synonym replacement on every review, then preview the first rows.
df['review'] = df['review'].apply(replace_with_synonyms)
print(df['review'].head())

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically theres a family where little boy jak...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object


In [19]:
import random

def add_noise(text, noise_words, noise_ratio=0.1, rng=None):
    """Insert randomly chosen filler words at random positions in `text`.

    Args:
        text: Input text; it is split on whitespace.
        noise_words: Sequence of filler strings to sample from. When empty,
            nothing is inserted.
        noise_ratio: Number of insertions expressed as a fraction of the word
            count (truncated to an integer). Defaults to 0.1, i.e. 10%.
        rng: Optional object exposing `randint` and `choice` (for example a
            seeded `random.Random`) to make the result reproducible. Defaults
            to the module-level `random` functions.

    Returns:
        The words of `text`, with the filler words inserted, joined by single
        spaces.
    """
    words = text.split()
    num_words_to_add = int(len(words) * noise_ratio)

    # Nothing to insert, or nothing to insert from: return the text unchanged
    # (apart from the whitespace normalization done by split/join).
    if num_words_to_add <= 0 or not noise_words:
        return ' '.join(words)

    source = random if rng is None else rng
    for _ in range(num_words_to_add):
        # Same draw order as before (position first, then word), so a seeded
        # global `random` still yields the same output.
        random_index = source.randint(0, len(words) - 1)
        noise_word = source.choice(noise_words)
        words.insert(random_index, noise_word)
    return ' '.join(words)

# Filler words sampled when injecting noise. Each entry is listed once so that
# uniform sampling does not favor any particular word.
noise_words = [
    'um', 'uh', 'like', 'so', 'you know', 'actually', 'basically',
    'seriously', 'literally', 'well', 'honestly', 'truly', 'really',
    'I mean', 'just', 'kinda', 'sorta', 'probably',
    'maybe', 'definitely', 'figuratively', 'almost'
]

In [21]:
# Inject filler words into every review, then preview the first rows.
df['review'] = df['review'].apply(add_noise, args=(noise_words,))
print(df['review'].head())

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically theres a family where little boy jak...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object


In [22]:
from transformers import pipeline

# Sentence rewriting with a pretrained T5 model.
# T5 is an encoder-decoder (sequence-to-sequence) model, so it is loaded with
# the "text2text-generation" task rather than the causal-LM "text-generation" task.
paraphrase = pipeline("text2text-generation", model="t5-small")


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import pipeline

# Create a text2text-generation pipeline to use for text rewriting
# NOTE(review): "facebook/bart-large-cnn" looks like a summarization checkpoint -
# confirm it is suitable for paraphrasing.
text_generator = pipeline("text2text-generation", model="facebook/bart-large-cnn")

def rewrite_sentence(sentence):
    """Return the rewritten form of `sentence` produced by `text_generator`.

    Generation uses beam search with 5 beams, early stopping, and a maximum
    length of 100; the text of the first returned candidate is used.
    """
    candidates = text_generator(
        sentence,
        max_length=100,
        num_beams=5,
        early_stopping=True,
    )
    first_candidate = candidates[0]
    return first_candidate['generated_text']

# Usage example: rewrite one sample sentence and print it next to the original
# NOTE(review): the sample sentence is Turkish - confirm the chosen model handles it.
sentence = "Bu bir örnek cümledir."
rewritten_sentence = rewrite_sentence(sentence)
print("Orijinal Cümle:", sentence)
print("Yeniden İfade Edilmiş Cümle:", rewritten_sentence)
