In [1]:
import os
import pandas as pd

In [2]:
directory = "Reviews/"
merged_file = os.path.join(directory, "Reviews.csv")

header_written = False

for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)

        ax = pd.read_csv(file_path)

        if ax.shape[1] == 3:
            ax = ax.drop(ax.columns[0], axis=1)

        ax.columns = ['Review', 'Label']

        ax.to_csv(merged_file, mode='a', index=False, header=not header_written)
        header_written = True

In [3]:
all = pd.read_csv('Reviews/Reviews.csv')
all.shape

(12618, 2)

In [200]:
all

Unnamed: 0,Review,Label
0,фотоға адемы жер,positive
1,Очень круто но очередь большая,positive
2,"Приятное место для прогулок, с мужем часто пос...",positive
3,❤️😎,positive
4,Атрикционы мощные прям адреналин😅🔥,positive
...,...,...
3211,"Если за мясом, то точно в это заведение! Огром...",positive
3212,"Зашел пообедать. Бизнес-ланча нет, ну ладно , ...",negative
3213,"Пришли по рекомендации знакомых, Пробовали реб...",positive
3214,Невероятно атмосферное место с безумно вкусным...,positive


Preprocessing

In [201]:
all['Label'] = all['Label'].str.strip().str.lower()

label_mapping = {
    "good": "positive",
    "bad": "negative"
}
all['Label'] = all['Label'].replace(label_mapping)
all

Unnamed: 0,Review,Label
0,фотоға адемы жер,positive
1,Очень круто но очередь большая,positive
2,"Приятное место для прогулок, с мужем часто пос...",positive
3,❤️😎,positive
4,Атрикционы мощные прям адреналин😅🔥,positive
...,...,...
3211,"Если за мясом, то точно в это заведение! Огром...",positive
3212,"Зашел пообедать. Бизнес-ланча нет, ну ладно , ...",negative
3213,"Пришли по рекомендации знакомых, Пробовали реб...",positive
3214,Невероятно атмосферное место с безумно вкусным...,positive


In [202]:
print(all["Label"].unique())


['positive' 'neutral' 'negative']


In [203]:
valid_labels = {"positive", "neutral", "negative"}
all = all[all["Label"].isin(valid_labels)]

In [204]:
emo1 = pd.read_csv('emojiData/1.csv')
emo1.head(3)

Unnamed: 0,emoji,code_points,category,description
0,⏰,23F0,Basic_Emoji,alarm clock
1,⏳,23F3,Basic_Emoji,hourglass not done
2,♿,267F,Basic_Emoji,wheelchair symbol


In [205]:
emo2 = pd.read_csv('emojiData/2.csv')
emo2.head(3)

Unnamed: 0,emoji,code_points,status,name
0,😀,1F600 ...,fully-qualified,grinning face
1,😃,1F603 ...,fully-qualified,grinning face with big eyes
2,😄,1F604 ...,fully-qualified,grinning face with smiling eyes


In [206]:
emo3 = pd.read_csv('emojiData/3.csv')
emo3.head(3)

Unnamed: 0,emoji,code_points,description
0,👨‍❤️‍👨,1F468 200D 2764 FE0F 200D 1F468,"couple with heart: man, man"
1,👨‍❤️‍💋‍👨,1F468 200D 2764 FE0F 200D 1F48B 200D 1F468,"kiss: man, man"
2,👨‍👦,1F468 200D 1F466,"family: man, boy"


In [207]:
emo1 = emo1[["emoji", "code_points", "description"]]
emo2 = emo2.rename(columns={"name": "description"})[["emoji", "code_points", "description"]]
emo3 = emo3[["emoji", "code_points", "description"]]

In [208]:
merge = pd.concat([emo1, emo2, emo3], ignore_index=True).drop_duplicates(subset=['emoji'], keep='first')
merge = merge.groupby("emoji", as_index=False).first()

In [209]:
merge.columns

Index(['emoji', 'code_points', 'description'], dtype='object')

In [210]:
merge["description"] = merge["description"].str.replace(" ", "_")
merge

Unnamed: 0,emoji,code_points,description
0,#⃣,0023 20E3 ...,keycap:_#
1,#️⃣,0023 FE0F 20E3,keycap:_\x{23}
2,*⃣,002A 20E3 ...,keycap:_*
3,*️⃣,002A FE0F 20E3,keycap:_*
4,0⃣,0030 20E3 ...,keycap:_0
...,...,...,...
5037,🫸🏻,1FAF8 1F3FB,rightwards_pushing_hand:_light_skin_tone
5038,🫸🏼,1FAF8 1F3FC,rightwards_pushing_hand:_medium-light_skin_tone
5039,🫸🏽,1FAF8 1F3FD,rightwards_pushing_hand:_medium_skin_tone
5040,🫸🏾,1FAF8 1F3FE,rightwards_pushing_hand:_medium-dark_skin_tone


In [211]:
merge.to_csv('all.csv', index=False, encoding='utf-8')

In [212]:
emoji_dict = dict(zip(merge['emoji'], merge['description']))

def replace_emojis_with_description(text):
    if not isinstance(text, str):
        return ""

    for emoji_char, description in emoji_dict.items():
        text = text.replace(emoji_char, f" {description} ")

    return text.strip()

all['Review'] = all['Review'].apply(replace_emojis_with_description)
all

Unnamed: 0,Review,Label
0,фотоға адемы жер,positive
1,Очень круто но очередь большая,positive
2,"Приятное место для прогулок, с мужем часто пос...",positive
3,red_heart ️ smiling_face_with_sunglasses,positive
4,Атрикционы мощные прям адреналин grinning_face...,positive
...,...,...
3211,"Если за мясом, то точно в это заведение! Огром...",positive
3212,"Зашел пообедать. Бизнес-ланча нет, ну ладно , ...",negative
3213,"Пришли по рекомендации знакомых, Пробовали реб...",positive
3214,Невероятно атмосферное место с безумно вкусным...,positive


In [183]:
import torch
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder

def tokenize(text):
    return text.lower().split()

word_counts = Counter()
for text in all['Review']:
    word_counts.update(tokenize(text))

vocab = {"<PAD>": 0, "<UNK>": 1}
for idx, word in enumerate(word_counts.keys(), start=2):
    vocab[word] = idx

def numericalize(text):
    return torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in tokenize(text)], dtype=torch.long)

sequences = [numericalize(text) for text in all['Review']]

padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=vocab["<PAD>"])

label_encoder = LabelEncoder()
labels = torch.tensor(label_encoder.fit_transform(all['Label']), dtype=torch.long)

In [184]:
ax = pd.DataFrame({
    "Numericalized_Review": [seq.tolist() for seq in padded_sequences],
    "Encoded_Label": labels.tolist()
})

ax

Unnamed: 0,Numericalized_Review,Encoded_Label
0,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 0, 0, 0,...",2
1,"[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2...",2
2,"[13, 21, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
3,"[25, 26, 27, 28, 4, 29, 4, 30, 31, 32, 33, 34,...",2
4,"[50, 51, 52, 53, 54, 12, 55, 56, 57, 58, 59, 6...",2
...,...,...
136003,"[1197, 70, 8707, 651, 120, 107, 259, 2533, 870...",2
136004,"[7479, 8717, 8718, 429, 482, 5127, 55, 125, 87...",0
136005,"[685, 224, 8063, 8726, 6278, 8548, 4, 7911, 54...",2
136006,"[905, 1595, 749, 27, 2508, 6383, 8729, 919, 11...",2


In [185]:
all

Unnamed: 0,Review,Label
0,Странный смузи и вкусные бельгийские вафли) w...,positive
1,Все понравилось кухня вкусная спасибо персонал...,positive
2,Кухня на 3,neutral
3,"Заказала фетучини с курицей и грибами, и она о...",positive
4,"Блин у вас ,всё очень понравилось , но особенн...",positive
...,...,...
136065,"Если за мясом, то точно в это заведение! Огром...",positive
136066,"Зашел пообедать. Бизнес-ланча нет, ну ладно , ...",negative
136067,"Пришли по рекомендации знакомых, Пробовали реб...",positive
136068,Невероятно атмосферное место с безумно вкусным...,positive


In [None]:
from torch.utils.data import DataLoader, Dataset

class ReviewDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
    def __len__(self):
        return len(self.x)
    def __getitem__(self, item):
        return self.x[item], self.y[item]

dataset = ReviewDataset(padded_sequences, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [187]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

train_dataset = ReviewDataset(x_train, y_train)
test_dataset = ReviewDataset(x_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [188]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout=0.5):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers=2, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, hidden = self.rnn(embedded)
        out = self.fc(self.dropout(hidden[-1]))
        return out

In [189]:
model = RNN(len(vocab), 256, 64, 3)
print(model)

RNN(
  (embedding): Embedding(9612, 256, padding_idx=0)
  (rnn): RNN(256, 64, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)


In [190]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RNN(
  (embedding): Embedding(9612, 256, padding_idx=0)
  (rnn): RNN(256, 64, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)

In [191]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [192]:
num_epochs = 20

for epoch in range(num_epochs):
    total_loss = 0
    model.train()

    for batch in dataloader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

Epoch 1/20, Loss: 0.6223508849042467
Epoch 2/20, Loss: 0.6179882463070399
Epoch 3/20, Loss: 0.6177187017996545
Epoch 4/20, Loss: 0.6170504284140588
Epoch 5/20, Loss: 0.6154430702984178
Epoch 6/20, Loss: 0.6146759385713211
Epoch 7/20, Loss: 0.6145187849803297
Epoch 8/20, Loss: 0.6146421983217638
Epoch 9/20, Loss: 0.6151009911377271
Epoch 10/20, Loss: 0.6145917976871431
Epoch 11/20, Loss: 0.6150720615024933
Epoch 12/20, Loss: 0.612934754395928
Epoch 13/20, Loss: 0.6129411464192059
Epoch 14/20, Loss: 0.6116139306983677
Epoch 15/20, Loss: 0.6111367502736361
Epoch 16/20, Loss: 0.6114660944694128
Epoch 17/20, Loss: 0.6120335899452916
Epoch 18/20, Loss: 0.6126130648646122
Epoch 19/20, Loss: 0.6149150423724857
Epoch 20/20, Loss: 0.6134508970035606


In [196]:
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in test_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs)
        predicted = torch.argmax(outputs, dim=1)

        total_correct += (predicted == targets).sum().item()
        total_samples += targets.size(0)

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.7915
