<a href="https://colab.research.google.com/github/knetic0/turkish-store-reviews-sentiment-analysis/blob/master/turkish_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [112]:
import pandas as pd
import chardet
import re
from collections import Counter
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [113]:
file_path = "./sample_data/magaza_yorumlari_duygu_analizi.csv"

with open(file_path, 'rb') as f:
  result = chardet.detect(f.read(1000))

df = pd.read_csv(file_path, encoding=result["encoding"])

df.head()

Unnamed: 0,Görüş,Durum
0,"ses kalitesi ve ergonomisi rezalet, sony olduğ...",Olumsuz
1,hizli teslimat tesekkürler,Tarafsız
2,ses olayı süper....gece çalıştır sıkıntı yok.....,Olumlu
3,geldi bigün kullandık hemen bozoldu hiçtavsiye...,Olumsuz
4,Kulaklığın sesi kaliteli falan değil. Aleti öv...,Olumsuz


In [114]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

print("First 5 Row: ")
print(df.head())

print("*"*25)

print("Describe about Dataset: ")
print(df.describe())

print("*"*25)

print("Unique values of Situation: ")
print(df["Durum"].unique())

First 5 Row: 
                                               Görüş     Durum
0  ses kalitesi ve ergonomisi rezalet, sony olduğ...   Olumsuz
1                         hizli teslimat tesekkürler  Tarafsız
2  ses olayı süper....gece çalıştır sıkıntı yok.....    Olumlu
3  geldi bigün kullandık hemen bozoldu hiçtavsiye...   Olumsuz
4  Kulaklığın sesi kaliteli falan değil. Aleti öv...   Olumsuz
*************************
Describe about Dataset: 
             Görüş   Durum
count        11426   11426
unique       11407       3
top     İdare eder  Olumlu
freq             3    4252
*************************
Unique values of Situation: 
['Olumsuz' 'Tarafsız' 'Olumlu']


- Olumsuz -> 0
- Tarafsiz -> 1
- Olumlu -> 2


In [115]:
label_mapping = {
    "Olumsuz": 0,
    "Tarafsız": 1,
    "Olumlu": 2
}

In [116]:
def simple_tokenizer(text:str):
    text = text.lower()
    tokens = re.findall(r'\b\w+\b', text)
    return tokens

In [117]:
turkish_stopwords = set(stopwords.words("turkish"))

all_tokens = []
for text in df["Görüş"]:
    tokens = simple_tokenizer(text)
    tokens = [token for token in tokens if token not in turkish_stopwords]
    all_tokens.extend(tokens)

token_counts = Counter(all_tokens)

min_freq = 2
vocab_tokens = [token for token, count in token_counts.items() if count >= min_freq]

vocab = {"<pad>": 0, "<unk>": 1}
for token in vocab_tokens:
    vocab[token] = len(vocab)

print("Vocab örneği:", list(vocab.items())[:10])

Vocab örneği: [('<pad>', 0), ('<unk>', 1), ('ses', 2), ('kalitesi', 3), ('ergonomisi', 4), ('rezalet', 5), ('sony', 6), ('olduğu', 7), ('aldım', 8), ('4', 9)]


In [118]:
def collate_fn(batch):
    texts, labels = zip(*batch)
    pad_idx = vocab.get("<pad>", 0)
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=pad_idx)
    labels = torch.stack(labels, dim=0)
    return texts_padded, labels

In [119]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [120]:
class SentimentDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, vocab, label_mapping, device):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.vocab = vocab
    self.label_mapping = label_mapping

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = self.texts[idx]
    label = self.labels[idx]
    tokens = self.tokenizer(text)
    numericalized = [self.vocab.get(token, self.vocab.get("<unk>")) for token in tokens]
    label = self.label_mapping.get(label, 0)
    return torch.tensor(numericalized, dtype=torch.long).to(device), torch.tensor(label, dtype=torch.long).to(device)

In [121]:
train_dataset = SentimentDataset(df["Görüş"], df["Durum"], simple_tokenizer, vocab, label_mapping, device)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [122]:
class SentimentModel(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, device, n_layers=1, dropout=0.5):
    super(SentimentModel, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim).to(device)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout).to(device)
    self.fc = nn.Linear(hidden_dim, output_dim).to(device)
    self.dropout = nn.Dropout(dropout).to(device)

  def forward(self, x):
    embedded = self.dropout(self.embedding(x))
    output, (hidden, cell) = self.lstm(embedded)
    hidden = self.dropout(hidden[-1,:,:])
    return self.fc(hidden)

In [123]:
model = SentimentModel(vocab_size=len(vocab), embedding_dim=300, hidden_dim=128, output_dim=3, device=device)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [124]:
NUM_EPOCHS = 100

for epoch in range(NUM_EPOCHS):
  model.train()
  epoch_loss = 0.0
  epoch_acc = 0.0
  for texts, labels in train_dataloader:
    optimizer.zero_grad()
    predictions = model(texts)
    loss = criterion(predictions, labels)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
  print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_dataloader)}")
torch.save(model.state_dict(), "sentiment_model.pt")

Epoch 1, Loss: 1.0924673396781837
Epoch 2, Loss: 1.0827247270658695
Epoch 3, Loss: 1.023913972204624
Epoch 4, Loss: 0.9071518377551819
Epoch 5, Loss: 0.8328040503589801
Epoch 6, Loss: 0.7825445965681662
Epoch 7, Loss: 0.7651596622094096
Epoch 8, Loss: 0.7357134685836024
Epoch 9, Loss: 0.6997901503433729
Epoch 10, Loss: 0.6853223716080522
Epoch 11, Loss: 0.6554227840967018
Epoch 12, Loss: 0.6354333477812772
Epoch 13, Loss: 0.6163309894127553
Epoch 14, Loss: 0.5939028101801539
Epoch 15, Loss: 0.5656518188185532
Epoch 16, Loss: 0.5534069820322804
Epoch 17, Loss: 0.5337797910308039
Epoch 18, Loss: 0.5199457596383947
Epoch 19, Loss: 0.5077113557877487
Epoch 20, Loss: 0.48265085937507324
Epoch 21, Loss: 0.47477657089852754
Epoch 22, Loss: 0.46383624528040435
Epoch 23, Loss: 0.437183819086858
Epoch 24, Loss: 0.427659462377762
Epoch 25, Loss: 0.4178388813645813
Epoch 26, Loss: 0.4037461483736611
Epoch 27, Loss: 0.3951620422928027
Epoch 28, Loss: 0.3831531784221447
Epoch 29, Loss: 0.37564968028

In [125]:
test_input = input("Enter a any text: ")
tokens = simple_tokenizer(test_input)
numericalized = [vocab.get(token, vocab.get("<unk>")) for token in tokens]
input_tensor = torch.tensor(numericalized, dtype=torch.long).unsqueeze(0)
input_tensor = input_tensor.to(device)
model.to(device)
with torch.no_grad():
    output = model(input_tensor)
    predicted_class = torch.argmax(output, dim=1).item()
label_mapping_reverse = {
  0: "Olumsuz",
  1: "Tarafsız",
  2: "Olumlu"
}
print(label_mapping_reverse[predicted_class])

Enter a any text: ürün elime çok hızlı ulaştı
Olumlu
