## Text Transformation

In [None]:
import torch
import zipfile
import pandas as pd
from transformers import BertTokenizer
import numpy as np
import emoji
import re
from kan import KAN
import string
import pickle
import json
import nltk
nltk.download('stopwords')

In [None]:
zip_path = './media/datasets/archive.zip'
extract_dir = './media/datasets/csv/'

with zipfile.ZipFile(zip_path, 'r') as zip_file:
    zip_file.extractall(extract_dir)

dataframe_train = pd.read_csv(extract_dir+'twitter_training.csv', names=['ID','user','SC','Comment'])
dataframe_test = pd.read_csv(extract_dir+'twitter_validation.csv', names=['ID','user','SC','Comment'])

In [None]:
dataframe_train.iloc[1250:1257]

In [None]:
def dropRowValue(dataframe,column,values):
    return dataframe[~dataframe[column].isin(values)]

def sentimentFilter(sentence, category):
    """
    By default the category entered to the sentimentFilter function will be transformed to 1
    """
    sentiment_num_list = []
    for sentiment in sentence:
        if sentiment == category:
            sentiment_num_list.append(1)
        else:
            sentiment_num_list.append(0)
    return sentiment_num_list

# training
filter_dataframe_train = dropRowValue(dataframe_train,'SC',['Neutral','Irrelevant']).drop(['ID','user'], axis=1)
list_x_train = filter_dataframe_train['Comment'].to_list()
list_y_train = filter_dataframe_train['SC'].to_list()
y_train = sentimentFilter(list_y_train,'Positive')

# testing
filter_dataframe_test = dropRowValue(dataframe_test,'SC',['Neutral','Irrelevant']).drop(['ID','user'], axis=1)
list_x_test = filter_dataframe_test['Comment'].to_list()
list_y_test = filter_dataframe_test['SC'].to_list()
y_test = sentimentFilter(list_y_test,'Positive')

In [None]:
import emoji

emoji.replace_emoji("Hello! ☀️ I'm currently enjoying a beautiful day on a tropical island 🏝️ while practicing some yoga 🧘‍♂️ and sipping on a refreshing watermelon juice 🍉🍹. The sound of the waves 🌊 and the warmth of the sun ☀️ make it the perfect day to relax in my swimsuit 👙 and listen to the birds 🦜 singing in the palm trees 🌴. Don't forget your sunglasses 🕶️!",
                        replace=lambda chars,
                        data_dict: chars.encode('ascii', 'namereplace').decode())

In [None]:
def emojiMask(sentence):
    emoji_mask_sentence = emoji.replace_emoji(sentence,
                        replace=lambda chars,
                        data_dict: chars.encode('ascii', 'namereplace').decode())
    emoji_mask_sentence = re.sub(r"\\N\{(.+?)\}", r"\1", emoji_mask_sentence)
    return emoji_mask_sentence


x_train_emojimask = [emojiMask(str(row)) for row in list_x_train]
x_test_emojimask = [emojiMask(str(row)) for row in list_x_test]

In [None]:
def cleanSentence(sentence, stopwords=True):
    stopwords_vocabulary = nltk.corpus.stopwords.words('english')
    stopwords_pattern = r'\b(?:' + '\s*|'.join(map(re.escape, stopwords_vocabulary)) + r')\b' if stopwords else ''
    # clean_sentence = re.sub(stopwords_pattern + r'|[^\w\s]', '', sentence)
    lower_sentence = sentence.lower()
    clean_sentence = re.sub(stopwords_pattern + r'|[^\w\s]', '', lower_sentence)
    return clean_sentence


x_train_clean = [cleanSentence(row) for row in x_train_emojimask]
x_test_clean = [cleanSentence(row) for row in x_test_emojimask]

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

encoded_x_train = list(map(lambda x: tokenizer.encode(x), x_train_clean))
encoded_x_test = list(map(lambda x: tokenizer.encode(x), x_test_clean))

In [None]:
# Determine the maximum length of sequences
max_train_length = max(len(seq) for seq in encoded_x_train)
max_test_length = max(len(seq) for seq in encoded_x_test)
max_length = max(max_test_length,max_train_length)


# Pad sequences to ensure they all have the same length
padded_encoded_x_train = np.array([seq + [0]*(max_length - len(seq)) for seq in encoded_x_train])
padded_encoded_x_test = np.array([seq + [0]*(max_length - len(seq)) for seq in encoded_x_test])

In [None]:
dataset = {}

dataset['train_input'] = torch.from_numpy(padded_encoded_x_train)
dataset['test_input'] = torch.from_numpy(padded_encoded_x_test)
dataset['train_label'] = torch.from_numpy(np.array(y_train))
dataset['test_label'] = torch.from_numpy(np.array(y_test))

In [None]:
KAN_model = KAN(width=[max_length,2], grid=3, k=3)

In [None]:
def train_acc():
    return torch.mean((torch.argmax(KAN_model(dataset['train_input']), dim=1) == dataset['train_label']).float())

def test_acc():
    return torch.mean((torch.argmax(KAN_model(dataset['test_input']), dim=1) == dataset['test_label']).float())

results = KAN_model.train(dataset, opt="LBFGS", steps=20, metrics=(train_acc, test_acc), loss_fn=torch.nn.CrossEntropyLoss());

In [None]:
with open('./media/models/result_kan1.obj', 'wb') as f:
    pickle.dump(results, f)

In [None]:
with open('./media/models/result_kan1.obj', 'rb') as fr:
    loaded_result = pickle.load(fr)

In [28]:
KAN.load_ckpt(KAN_model, name='KAN_model', folder='./media/models/')

In [None]:
example = torch.argmax(KAN_model(dataset['train_input']), dim=1)

In [None]:
np.where(example.numpy() == 0)[0]

In [None]:
KAN_model(dataset['train_input'])

In [None]:
with open('./media/datasets/MMcomments/comments.json') as json_file:
    comment_dict = json.load(json_file)


comment_list = comment_dict['comments']
comment_list[0].items()

threshold = 0.5  

filter_comment_list = []
for _ in range(len(comment_list)):
    for sentence, sentiment in comment_list[_].items():
        sentence_emojiMask = emojiMask(sentence)
        clean_sentence = str(cleanSentence(sentence_emojiMask))
        filter_comment_list.append(clean_sentence)

In [None]:
filter_tokens_comments = list(map(lambda x: tokenizer.encode(x), filter_comment_list))

# Determine the maximum length of sequences
filter_comment_dict = {}
max_comments_length = max(len(seq) for seq in filter_tokens_comments)
padded_encoded_filter_tokens = np.array([seq + [0]*(max_comments_length - len(seq)) for seq in filter_tokens_comments])
filter_comment_dict['comments'] = torch.from_numpy(padded_encoded_filter_tokens)

In [29]:
KAN_model(filter_comment_dict['comments'])

RuntimeError: shape '[286, 400]' is invalid for input of size 20020

## Continuous Learning

In [None]:
CL_KAN_model = KAN(width=[max_length,2], k=3, noise_scale=0.1, bias_trainable=False, sp_trainable=False, sb_trainable=False)

def train_acc():
    return torch.mean((torch.argmax(CL_KAN_model(dataset['train_input']), dim=1) == dataset['train_label']).float())

def test_acc():
    return torch.mean((torch.argmax(CL_KAN_model(dataset['test_input']), dim=1) == dataset['test_label']).float())

CL_results = CL_KAN_model.train(dataset, opt="LBFGS", steps=20, update_grid=False, metrics=(train_acc, test_acc), loss_fn=torch.nn.CrossEntropyLoss());

In [None]:
torch.cuda.is_available()