In [28]:
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [29]:
RANDOM_SEED = 42
NUMBER_OF_CATEGORIZED_WORDS=2000

# Wstępne przetwarzenie danych

## Wczytywanie danych

In [30]:
df = pd.read_csv("data/file.csv", index_col=0)

In [31]:
df.head()

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad


## Przetwarzanie zbioru

In [32]:
stopwords  = stopwords.words("english")

In [33]:
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [34]:
stemmer = nltk.stem.PorterStemmer()

In [35]:

def preprocessTweets(t):
    # usuwa url
    preprocessedTweet = re.sub('(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', ' ', t) 
    # usuwa \\n
    preprocessedTweet = re.sub(r'\\n', ' ', preprocessedTweet) 
    # tylko litery - usuwa emotikony 
    preprocessedTweet = re.sub(r"[^a-zA-Z']", ' ', preprocessedTweet) 
    # male litery
    preprocessedTweet = preprocessedTweet.lower() 

    preprocessedTweet = preprocessedTweet.split()

    #usuwa zbedne slowa
    preprocessedTweet = [ word for word in preprocessedTweet if word not in stopwords]

    #dokonaj "stemming" - zamienia słowo na jego rdzeń
    preprocessedTweet = [ stemmer.stem(word) for word in preprocessedTweet]

    #sklej z powrotem
    return " ".join(preprocessedTweet)

In [36]:
df.tweets = df.tweets.apply(preprocessTweets)

In [37]:
df.iloc[19][0]

'ask chatgpt new ai system optim dialogu teach seo minut result'

In [38]:
df = shuffle(df, random_state=RANDOM_SEED)

## Podział danych

In [39]:
dataset_len = df.shape[0]
train_dataset_len = int(dataset_len*0.75)
test_dataset_len = int(dataset_len*0.25*0.5)

In [40]:
X_train, y_train = df.iloc[:train_dataset_len, 0], df.iloc[:train_dataset_len,1]
X_valid, y_valid = df.iloc[train_dataset_len:train_dataset_len+test_dataset_len, 0], df.iloc[train_dataset_len:train_dataset_len+test_dataset_len,1]
X_test, y_test = df.iloc[train_dataset_len+test_dataset_len:, 0], df.iloc[train_dataset_len + test_dataset_len:, 1]

## Przetwarzanie zbioru pod Bayens i Logistic Regression

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

cv = TfidfVectorizer(binary=True, max_features=NUMBER_OF_CATEGORIZED_WORDS)
cv.fit(df.tweets)

X_vectorized_train = cv.transform(X_train)
X_vectorized_valid = cv.transform(X_valid)
X_vectorized_test = cv.transform(X_test)

In [42]:
X_vectorized_test.shape

(27413, 2000)

## Przetwarzanie zbioru pod RNN

In [43]:
words = tf.constant(cv.get_feature_names_out())
word_ids = tf.range(len(cv.get_feature_names_out()), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [44]:
X_train_tokenized = table.lookup(tf.strings.split(X_train)).to_tensor()

In [45]:
y_train_tokenized = tf.convert_to_tensor(np.where(y_train=="good", 2, np.where(y_train=="neutral", 1, 0)), dtype=tf.int64)

In [46]:
X_valid_tokenized = table.lookup(tf.strings.split(X_valid)).to_tensor()

In [47]:
y_valid_tokenized = tf.convert_to_tensor(np.where(y_valid=="good", 2, np.where(y_valid=="neutral", 1, 0)),dtype=tf.int64)

In [48]:
X_test_tokenized = table.lookup(tf.strings.split(X_test)).to_tensor()

In [49]:
y_test_tokenized = tf.convert_to_tensor(np.where(y_test=="good", 2, np.where(y_test=="neutral", 1, 0)), dtype=tf.int64)

# Bayens 

In [50]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_vectorized_train.toarray(), y_train);

In [51]:
accuracy_score(gnb.predict(X_vectorized_test.toarray()), y_test)

0.6248860029912815

# Logistic regression

In [52]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver="liblinear", random_state=RANDOM_SEED, C=5, penalty="l2", max_iter=1000)
model = lr.fit(X_vectorized_train,y_train)

In [53]:
accuracy_score(lr.predict(X_vectorized_test), y_test)

0.8037792288330354

# RNN

In [54]:


embed_size = 128
rnn_model = keras.models.Sequential([
    keras.layers.Embedding(NUMBER_OF_CATEGORIZED_WORDS + num_oov_buckets, embed_size,
                           mask_zero=True, 
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(3, activation="sigmoid")
])
rnn_model.compile(loss="SparseCategoricalCrossentropy", optimizer="adam", metrics=["accuracy"])
rnn_history = rnn_model.fit(x=X_train_tokenized, y=y_train_tokenized, epochs=5, validation_data=(X_valid_tokenized, y_valid_tokenized))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
rnn_model.evaluate(X_test_tokenized,y_test_tokenized)



[0.39708229899406433, 0.8540473580360413]

# BERT

In [56]:
def build_bert_model(preprocessor_url, model_url):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(preprocessor_url, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(model_url, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(3, activation="softmax", name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [57]:
medium_bert_model_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/2"
medium_bert_preprocessor_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [58]:
medium_bert_model = build_bert_model(medium_bert_preprocessor_url, medium_bert_model_url)

2023-04-15 19:11:58.012293: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-04-15 19:11:58.012672: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-04-15 19:11:58.012770: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-04-15 19:11:58.013079: I tensorflow/core/

In [59]:
medium_bert_model.compile(optimizer="adam", loss="SparseCategoricalCrossentropy", metrics=['accuracy'])

medium_bert_history = medium_bert_model.fit(x=X_train, y=y_train_tokenized, epochs=10, validation_data=(X_valid, y_valid_tokenized))

Epoch 1/10


2023-04-15 19:12:06.352491: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-04-15 19:12:06.352554: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_1' with dtype int32 and shape [?,128]
	 [[{{node inputs_1}}]]
2023-04-15 19:12:07.329916: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/bert_encoder/StatefulPartitionedCall_grad/bert_encoder/StatefulPart

Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 