In [None]:
# Pin the third-party packages installed for this notebook (%pip targets the kernel's environment).
# NOTE(review): transformers, imbalanced-learn, inflect, wordcloud, nltk, spacy and seaborn are
# imported below but are not installed or pinned here - confirm the runtime already provides them.
%pip install tensorflow==2.15.0
%pip install contractions==0.1.73
%pip install word2number==1.1
%pip install Unidecode==1.3.7
%pip install num2words==0.5.13

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers as L
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
import re
import spacy
import collections
import nltk
from nltk.stem import WordNetLemmatizer
import wordcloud
from wordcloud import WordCloud
from collections import Counter
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
import contractions
from word2number import w2n
import unidecode
from sklearn.metrics import ConfusionMatrixDisplay
import num2words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import inflect
from tensorflow.keras.layers import Dropout
from sklearn.metrics import roc_curve, auc
from nltk import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
#from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import precision_recall_curve, average_precision_score
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# Notebook-wide display configuration: default figure size and untruncated text columns.
plt.rcParams.update({'figure.figsize': (12, 5)})

# NLTK resources requested for the stop-word list, the tokenizer and the lemmatizer.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' - confirm against the installed version.
for nltk_resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(nltk_resource)
pd.set_option('display.max_colwidth', None)

# 1. Cel
Celem projektu jest stworzenie modelu analizy sentymentu, a dokładniej klasyfikatora binarnego wykrywającego agresję językową (obraźliwy język) we wpisach na portalu Twitter.


# 2. Pozyskanie danych

Dane pobrano ze strony Kaggle.com - [Hate Speech and Offensive Language Dataset](https://www.kaggle.com/datasets/mrmorj/hate-speech-and-offensive-language-dataset). W pierwszej kolejności pobrano dane i usunięto niepotrzebne kolumny, zostawiając tylko class i tweet, czyli klasyfikację wpisu i jego treść. Domyślnie wpisy neutralne mają klasę 2, a agresywne 1, zaś klasa 0, czyli tzw. mowa nienawiści została całkowicie usunięta, wobec czego zmieniono klasę wpisów neutralnych na 0.

In [None]:
# Load the labelled tweets and keep only the label ("class") and the text ("tweet").
df = (
    pd.read_csv("./labeled_data.csv")
    .drop(columns=['Unnamed: 0', "count", "hate_speech", "offensive_language", "neither"])
)
# Drop the rows labelled 0 (hate speech). The explicit .copy() makes df an independent
# frame, so the column assignments in later cells do not raise SettingWithCopyWarning.
df = df[df["class"] != 0].copy()
# Relabel neutral tweets from 2 to 0 so the target is binary: 0 = neutral, 1 = offensive.
df["class"] = df["class"].replace({2: 0})

df.head()

# 3. Wstępna ocena danych

Jak widać po poniższych wynikach operacja przebiegła pomyślnie. Ramka danych składa się teraz z dwóch kolumn, class przyjmuje wartość 0 lub 1, a w zbiorze danych nie ma wartości brakujących ani zduplikowanych wpisów.

In [None]:
# Summary of the filtered frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Check that only the expected class labels remain.
df['class'].unique()

In [None]:
# Check for duplicated tweets; keep=False lists every copy of a repeated text.
df[df.duplicated(['tweet'], keep=False)]

In [None]:
# Check for missing values (count per column).
df.isnull().sum()

Poniżej wyświetlono przykładowe wpisy neutralne oraz agresywne. Można zauważyć, że wpisy agresywne charakteryzują się dużo większą liczbą wulgaryzmów oraz wyzwiskami.

In [None]:
# First five tweets of class 0 (neutral).
df[df['class'] == 0].head(5)

In [None]:
# First five tweets of class 1 (offensive).
df[df['class'] == 1].head(5)

Następnie sprawdzono unikalne znaki. Jak widać oprócz liter znajduje się również wiele znaków interpunkcyjnych oraz cyfr. W dodatku litery występują zarówno jako duże, jak i małe.

In [None]:
# Distinct characters used across all tweets. "".join builds the combined text in one
# pass, whereas Series.sum() concatenates the strings pairwise.
print("Lista unikalnych znaków: ", set("".join(df['tweet'])))

Liczba unikalnych słów w całym zbiorze danych to 51350. Jednakże wiele z nich to tak zwane "stop words" lub słowa charakterystyczne dla Twittera, np. rt, czyli retweet.

In [None]:
# Collect the distinct lower-cased, whitespace-separated tokens across all tweets.
results = set()
for tweet_tokens in df['tweet'].str.lower().str.split():
    results.update(tweet_tokens)
print("Liczba unikalnych słów: ", len(results))

In [None]:
# Check the 100 most frequent lower-cased tokens and their counts.
print("Najpopularniejsze słowa i ich liczność: ", Counter(" ".join(df['tweet'].str.lower()).split()).most_common(100))

### Liczność klas

Sprawdzono liczność klas, jak widać na poniższym wykresie zbiór jest niezbalansowany i jest widoczna kilkukrotnie większa liczba wpisów agresywnych, niż neutralnych.

In [None]:
# Check the class balance: number of tweets per class.
sns.countplot(x = "class", data = df)
plt.title("Liczność klas")
plt.show()

## Najczęściej występujące słowa

Następnie sprawdzono najpopularniejsze słowa dla całego zbioru danych, agresywnych wpisów i wpisów neutralnych.

In [None]:
def plot_top_words(top_words, title):
    """Draw a bar chart of word frequencies.

    Args:
        top_words: Sequence of (word, count) pairs, e.g. the result of
            ``Counter.most_common``.
        title: Title shown above the chart.
    """
    labels, heights = [], []
    for entry in top_words:
        labels.append(entry[0])
        heights.append(entry[1])

    plt.bar(labels, heights)
    plt.title(title)
    plt.xlabel("Words")
    plt.ylabel("Counts")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# 30 most frequent lower-cased tokens across the whole dataset.
all_tweets_tokens = " ".join(df['tweet'].str.lower()).split()
top_words_for_whole_dataset = Counter(all_tweets_tokens).most_common(30)
plot_top_words(top_words_for_whole_dataset, "The most common 30 words in dataset")

In [None]:
# 30 most frequent lower-cased tokens among tweets of class 1.
agressive_tweets_tokens = " ".join(df[df['class']==1]["tweet"].str.lower()).split()
top_words_for_agressive_tweets = Counter(agressive_tweets_tokens).most_common(30)
plot_top_words(top_words_for_agressive_tweets, "The most common 30 words in agressive tweets")

In [None]:
# 30 most frequent lower-cased tokens among tweets of class 0.
neutral_tweets_tokens = " ".join(df[df['class']==0]["tweet"].str.lower()).split()
top_words_for_neutral_tweets = Counter(neutral_tweets_tokens).most_common(30)
plot_top_words(top_words_for_neutral_tweets, "The most common 30 words in neutral tweets")

W celu lepszego zwizualizowania najczęściej występujących słów utworzono chmury słów ("word cloud") dla całego zbioru danych, wpisów agresywnych i neutralnych.

In [None]:
def plot_word_cloud(words, title):
    """Render a word cloud for a block of text.

    Args:
        words: A single string containing all the text to visualise.
        title: Title shown above the figure.
    """
    cloud = WordCloud(
        width=400,
        height=300,
        random_state=100,  # fixed seed so the layout is reproducible
        max_font_size=100,
    )
    cloud_image = cloud.generate(words)

    plt.figure(figsize=(11, 9))
    plt.axis('off')
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.title(title)
    plt.show()

def join_tweets_by_class(data, class_label=None):
    """Concatenate tweets into one space-separated string.

    Args:
        data: Either a DataFrame with ``'tweet'`` and ``'class'`` columns, or a
            Series of tweet texts.
        class_label: When ``data`` is a DataFrame and this is not ``None``, only
            tweets whose ``'class'`` equals this value are joined. Ignored for a
            Series.

    Returns:
        All selected tweets joined with single spaces.

    Raises:
        TypeError: If ``data`` is neither a DataFrame nor a Series (previously
            this surfaced as an UnboundLocalError on ``tweets``).
    """
    if isinstance(data, pd.DataFrame):
        if class_label is not None:
            selected = data.loc[data['class'] == class_label, 'tweet']
        else:
            selected = data['tweet']
    elif isinstance(data, pd.Series):
        selected = data
    else:
        raise TypeError(
            f"data must be a pandas DataFrame or Series, got {type(data).__name__}"
        )

    return ' '.join(selected)

In [None]:
# Word cloud built from every tweet in the dataset.
all_words = join_tweets_by_class(df)
plot_word_cloud(all_words, "Word cloud for whole dataset")

In [None]:
# Word cloud built from tweets of class 1 only.
agressive_tweets_words = join_tweets_by_class(df, 1)
plot_word_cloud(agressive_tweets_words, "Word cloud for agressive tweets")

In [None]:
# Word cloud built from tweets of class 0 only.
neutral_tweets_words = join_tweets_by_class(df, 0)
plot_word_cloud(neutral_tweets_words, "Word cloud for neutral tweets")

W związku z przewagą wpisów agresywnych w zbiorze danych najczęstsze słowa w całym zbiorze są zdominowane przez słowa charakterystyczne dla wpisów agresywnych. Uwagę zwraca również obecność dużej liczby tagów Twittera - "rt" oraz innych słów niemających wpływu na końcowe znaczenie zdania.

## Rozkład długości wpisów pod względem liczby znaków
Obliczono statystyki opisowe oraz utworzono histogramy i wykresy pudełkowe w celu prześledzenia rozkładu długości wpisów.

In [None]:
# Character length of every tweet, then descriptive statistics overall and per class.
df['tweet_length'] = df['tweet'].map(len)
tweet_length_reports = [
    ("Statystyki opisowe dla długości wpisów w całym zbiorze danych: \n", df["tweet_length"]),
    ("Statystyki opisowe dla długości wpisów agresywnych: \n", df[df['class'] == 1]["tweet_length"]),
    ("Statystyki opisowe dla długości wpisów neutralnych: \n", df[df['class'] == 0]["tweet_length"]),
]
for report_header, lengths in tweet_length_reports:
    print(report_header, lengths.describe(), "\n")

In [None]:
# Check the distribution of tweet lengths measured in characters.
sns.displot(df['tweet_length'])
plt.title("Distribution of tweet lengths")
plt.show()

In [None]:
# Check the distribution of tweet lengths within each class (one histogram per class).
graph = sns.FacetGrid(data=df, col='class')
graph.map(plt.hist, 'tweet_length', bins=40)
graph.set_titles("Class - {col_name}")
plt.suptitle("Distribution of tweet lenghts by class", y=1.1)

plt.show()

In [None]:
# Box plot of tweet length (characters) per class.
sns.boxplot(y='tweet_length', x='class', data=df)
plt.title("Tweet length by class")
plt.xlabel("Class")
plt.ylabel("Tweet Length")

plt.show()

Rozkłady długości wpisów w całym zbiorze danych oraz w poszczególnych klasach są zbliżone. Uwagę zwraca większa liczba wartości odstających w przypadku wpisów agresywnych.

In [None]:
# Number of whitespace-separated words per tweet, then descriptive statistics overall and per class.
df['number_of_words'] = df['tweet'].str.split().map(len)
word_count_reports = [
    ("Statystyki opisowe dla długości wpisów pod względem liczby słów w całym zbiorze danych: \n", df["number_of_words"]),
    ("Statystyki opisowe dla długości wpisów agresywnych pod względem liczby słów: \n", df[df['class'] == 1]["number_of_words"]),
    ("Statystyki opisowe dla długości wpisów neutralnych pod względem liczby słów: \n", df[df['class'] == 0]["number_of_words"]),
]
for report_header, word_counts in word_count_reports:
    print(report_header, word_counts.describe(), "\n")

## Rozkład długości wpisów pod względem liczby słów
Obliczono statystyki opisowe oraz utworzono histogramy i wykresy pudełkowe w celu prześledzenia rozkładu liczby słów we wpisach.

In [None]:
# Distribution of the number of words per tweet across the whole dataset.
sns.displot(df['number_of_words'])
plt.title("Distribution of number of words in tweets")
plt.show()

In [None]:
# Distribution of the number of words per tweet within each class (one histogram per class).
graph = sns.FacetGrid(data=df, col='class')
graph.map(plt.hist, 'number_of_words', bins=30)
graph.set_titles("Class - {col_name}")
plt.suptitle("Distribution of number of words in tweet by class", y=1.1)

plt.show()

In [None]:
# Box plot of the number of words per tweet, split by class.
sns.boxplot(y='number_of_words', x='class', data=df)
plt.title("Number of words in tweet by class")
plt.xlabel("Class")
plt.ylabel("Number of words in tweet")

plt.show()

Również w przypadku liczby słów we wpisach rozkłady w całym zbiorze danych oraz w poszczególnych klasach są zbliżone.

# 4. Przygotowanie danych do modelowania

Po wstępnej ocenie danych rozpoczęto ich przetwarzanie. W pierwszej kolejności podzielono ramkę danych na X i y.

### Podział na X i y

In [None]:
# Separate the tweet text (features) from the class label (target); copies leave df untouched.
X = df['tweet'].copy()
y = df['class'].copy()


### Przygotowanie funkcji oczyszczających dane

Po podziale zbioru przygotowano funkcje mające na celu przeprowadzenie poszczególnych elementów normalizacji oraz lematyzacji.

In [None]:
def lower(tweet: str) -> str:
    """Return the tweet converted to lower case."""
    lowered = tweet.lower()
    return lowered

def clean_tweet(tweet: str) -> str:
    """Strip URLs, HTML tags, hashtags and user mentions from a tweet.

    Each removed element is replaced by a single space, so the result may
    contain runs of spaces. HTML character references (``&amp;``, ``&lt;``,
    ``&#8220;`` ...) are decoded to the characters they stand for; without that
    step a named entity such as ``&amp;`` survives later punctuation stripping
    as the junk token ``amp``.

    Args:
        tweet: Raw (typically already lower-cased) tweet text.

    Returns:
        The tweet with the elements above removed.
    """
    import html  # stdlib; imported locally so this cell does not depend on the import cell

    tweet = re.sub(r'http\S+', ' ', tweet)  # URLs
    tweet = re.sub(r'<.*?>', ' ', tweet)  # HTML tags
    # Decode entities only after tag stripping, so a decoded "<" (e.g. from "&lt;3")
    # cannot be mistaken for the start of a tag.
    tweet = html.unescape(tweet)
    tweet = re.sub(r'#\w+', ' ', tweet)  # hashtags
    tweet = re.sub(r'@\w+', ' ', tweet)  # user mentions

    return tweet

def remove_punctuation(tweet: list) -> list:
    """Keep only ASCII letters and digits in every token.

    Every character outside ``A-Z``, ``a-z`` and ``0-9`` is deleted from each
    token (punctuation, hyphens, underscores, whitespace, non-ASCII letters).
    Tokens that become empty are dropped.

    Args:
        tweet: List of tokens.

    Returns:
        A new list with the cleaned, non-empty tokens in their original order.
    """
    # A single pass is enough: the earlier substitutions only deleted subsets of
    # the characters this pattern deletes, so the result is the same.
    stripped_tokens = (re.sub(r'[^A-Za-z0-9]+', '', word) for word in tweet)
    return [word for word in stripped_tokens if word]

def remove_twitter_tags(tweet: list) -> list:
    """Drop the Twitter-specific tokens "ff" and "rt" from a token list."""
    ignored_tokens = ("ff", "rt")
    kept = []
    for token in tweet:
        if token in ignored_tokens:
            continue
        kept.append(token)
    return kept

# Extra English stop words merged with NLTK's list.
# NOTE(review): entries containing an apostrophe can only match tokens that still carry
# their punctuation - confirm where this step sits relative to punctuation removal.
_EXTENDED_STOP_WORDS = frozenset([
    "a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully",
    "b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by",
    "c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently",
    "d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during",
    "e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except",
    "f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore",
    "g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings",
    "h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however",
    "i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself",
    "j","just",
    "k","keep","keeps","kept","know","known","knows",
    "l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd",
    "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself",
    "n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere",
    "o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own",
    "p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides",
    "q","que","quite","qv",
    "r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right",
    "s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure",
    "t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two",
    "u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp",
    "v","value","various","very","via","viz","vs",
    "w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't",
    "x",
    "y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves",
    "z","zero",
])

# Combined stop-word set, built on first use and then reused for every tweet.
_STOP_WORDS_CACHE = None


def _get_stop_words() -> frozenset:
    """Return NLTK's English stop words merged with the extended list (cached)."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english')) | _EXTENDED_STOP_WORDS
    return _STOP_WORDS_CACHE


def remove_stop_words(tweet: list) -> list:
    """Remove English stop words from a token list.

    The stop-word set is NLTK's English list plus the extended list above. It is
    built once and cached, instead of re-reading the NLTK corpus and rebuilding
    the set for every tweet.

    Args:
        tweet: List of tokens (matching is exact, so tokens are expected to be
            lower-cased already).

    Returns:
        A new list containing the tokens that are not stop words, in order.
    """
    stop_words_set = _get_stop_words()
    return [word for word in tweet if word not in stop_words_set]

def stemming(tweet: list) -> list:
    """Return a new list with every token replaced by its Porter stem."""
    stem = PorterStemmer().stem
    return list(map(stem, tweet))

def tokenize_tweet(tweet: str) -> list:
    """Split a tweet into tokens with NLTK's ``word_tokenize``."""
    return nltk.word_tokenize(tweet)

def remove_extra_whitespaces(tweet: str) -> str:
    """Collapse every run of space characters into a single space.

    Only the space character is affected; tabs and newlines are left as they are.
    """
    # Runs of two or more spaces are the only ones that actually change.
    return re.sub(r' {2,}', ' ', tweet)

def expand_contractions(tweet: str) -> str:
    """Expand contracted forms in the tweet using ``contractions.fix``."""
    return contractions.fix(tweet)

def unidecode_characters(tweet: str) -> str:
    """Transliterate the tweet with ``unidecode.unidecode``."""
    return unidecode.unidecode(tweet)

def join_list_to_sentence(list_of_words: list) -> str:
    """Join tokens with single spaces and trim surrounding whitespace."""
    return ' '.join(list_of_words).strip()

def transform_numbers_to_words(word_list: list) -> list:
    """Replace all-digit tokens with their English spelling.

    A number may expand to several words ("21" -> "twenty-one", "100" ->
    "one hundred"). Each word of the expansion becomes its own token; keeping
    the whole expansion as one token lets a later punctuation-stripping step
    fuse it into a single artificial word such as "twentyone".

    Args:
        word_list: List of tokens.

    Returns:
        A new list in which every token made only of digits is replaced by the
        word tokens of its spelling; all other tokens are kept unchanged.
    """
    engine = inflect.engine()
    transformed_list = []

    for word in word_list:
        if word.isdigit():
            spelled_number = engine.number_to_words(word)
            # Keep only the alphabetic words, dropping hyphens, commas and spaces.
            transformed_list.extend(re.findall(r'[a-z]+', spelled_number.lower()))
        else:
            transformed_list.append(word)

    return transformed_list

def lemmatize_words(word_list: list) -> list:
    """Return a new list with every token lemmatized by NLTK's ``WordNetLemmatizer``."""
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in word_list]

### Normalizacja
Po przygotowaniu odpowiednich funkcji przystąpiono do normalizacji danych.

In [None]:
# Reference sample: the first raw tweet, shown again after each step below.
X.head(1)

Normalizację rozpoczęto od ujednolicenia wielkości liter poprzez zamianę wszystkich liter na małe.

In [None]:
# lower-casing
X_lowed = X.apply(lower)
X_lowed.head(1)

Następnie usunięto zbędne elementy, takie jak tagi HTML, adresy URL, hasztagi oraz oznaczenia użytkowników.

In [None]:
# removing unneeded elements: URLs, HTML tags, hashtags and mentions
X_cleaned = X_lowed.apply(clean_tweet)
X_cleaned.head(1)

Po usunięciu zbędnych elementów przeprowadzono unidecoding, czyli zmianę formatu unicode na format ASCII.

In [None]:
# unidecoding (transliteration via unidecode)
X_without_accented_characters = X_cleaned.apply(unidecode_characters)
X_without_accented_characters.head(1)

Następnie usunięto ze zbioru nadmierne odstępy między słowami.

In [None]:
# removing redundant spaces
X_without_extra_whitespaces = X_without_accented_characters.apply(remove_extra_whitespaces)
X_without_extra_whitespaces.head(1)

Kolejnym krokiem była zamiana skróconych wersji słów na ich pełne formy, przykładowo I've -> I have.

In [None]:
# expanding contractions to their full forms
X_with_expanded_contractions = X_without_extra_whitespaces.apply(expand_contractions)
X_with_expanded_contractions.head(1)

Następnie przeprowadzono tokenizację.

In [None]:
# tokenization: from here on each tweet is a list of tokens
X_tokenized = X_with_expanded_contractions.apply(tokenize_tweet)
X_tokenized.head(1)

Następnie zamieniono wartości liczbowe na odpowiadające im słowa.



In [None]:
# converting numbers to words
X_without_numbers = X_tokenized.apply(transform_numbers_to_words)
X_without_numbers.head(1)

W kolejnym kroku usunięto znaki interpunkcyjne.

In [None]:
# removing punctuation
X_without_punctuation = X_without_numbers.apply(remove_punctuation)
X_without_punctuation.head(1)

Następnie usunięto słowa ff i rt, czyli tagi używane na Twitterze.

In [None]:
# removing Twitter tags ("ff", "rt")
X_without_twitter_tags = X_without_punctuation.apply(remove_twitter_tags)
X_without_twitter_tags.head(1)

Po usunięciu tagów usunięto tzw. "stop words", czyli słowa mające mały wpływ na końcowe znaczenie zdania, na przykład: a, an, the.

In [None]:
# removing stop words
X_without_stop_words = X_without_twitter_tags.apply(remove_stop_words)
X_without_stop_words.head(1)

### Stemming

Na znormalizowanym zbiorze danych można przeprowadzić stemming lub lematyzację, czyli redukcję słów do ich bazowej formy. W eksperymencie zdecydowano się na wykorzystanie lematyzacji, która służy do tego samego zadania co stemming, ale zachowuje znaczenie słów, np. nie ujednolica training i train do tego samego znaczenia.

In [None]:
# stemming - intentionally disabled: the next cell lemmatizes X_without_stop_words instead
# X_stemmed = X_without_stop_words.apply(stemming)
# X_stemmed.head(3)

### Lematyzacja

In [None]:
# lemmatization
X_lemmatized = X_without_stop_words.apply(lemmatize_words)
X_lemmatized.head(1)

In [None]:
# Join each token list back into one space-separated string per tweet.
X_preprocessed = X_lemmatized.apply(join_list_to_sentence)
X_preprocessed.head(1)

## Sprawdzenie skuteczności przeprowadzonych operacji

Po przeprowadzeniu normalizacji oraz lematyzacji ponownie obliczono statystyki oraz wygenerowano wykresy użyte w poprzedniej fazie eksperymentu w celu upewnienia się, że zastosowane zabiegi były skuteczne.

In [None]:
# Check the distinct characters left after cleaning. "".join builds the combined text
# in one pass, whereas Series.sum() concatenates the strings pairwise.
print("Lista unikalnych znaków po przetworzeniu danych: ", set("".join(X_preprocessed)))

In [None]:
# Check the number of distinct words after preprocessing.
results = set()
for preprocessed_tokens in X_preprocessed.str.lower().str.split():
    results.update(preprocessed_tokens)
print("Liczba unikalnych słów po przetworzeniu danych: ", len(results))

In [None]:
# Check the 10 most frequent words after preprocessing.
Counter(" ".join(X_preprocessed).split()).most_common(10)

In [None]:
# Word cloud built from all preprocessed tweets.
all_words_preprocessed = join_tweets_by_class(X_preprocessed)
plot_word_cloud(all_words_preprocessed, "Word cloud for whole dataset")

In [None]:
# 30 most frequent tokens in the preprocessed tweets.
top_words_for_whole_dataset_preprocessed = Counter(" ".join(X_preprocessed.str.lower()).split()).most_common(30)
plot_top_words(top_words_for_whole_dataset_preprocessed, "The most common 30 words in dataset")

In [None]:
# Independent one-column frame, so the helper columns added below never touch X_preprocessed.
X_preprocessed_copy = pd.DataFrame(X_preprocessed.copy())

In [None]:
# Character length of each preprocessed tweet, shown as a single box plot.
X_preprocessed_copy['tweet_length'] = X_preprocessed_copy["tweet"].apply(len)

sns.boxplot(y='tweet_length', data=X_preprocessed_copy)
plt.title("Tweet length")
# NOTE(review): the x label says "Class", but this plot has no class dimension - confirm the intended label.
plt.xlabel("Class")
plt.ylabel("Tweet Length")
plt.show()

In [None]:
# Number of words per preprocessed tweet and its distribution.
X_preprocessed_copy['number_of_words'] = X_preprocessed_copy["tweet"].str.split().apply(len)
sns.displot(X_preprocessed_copy['number_of_words'])
plt.title("Distribution of number of words in tweets")
plt.show()

## Rozkład klas

In [None]:
# Class counts of the target before splitting.
sns.countplot(x=y)
plt.title("Liczność klas")
plt.show()

## Podział na zbiór treningowy, walidacyjny i testowy

Następnie dokonano podziału na zbiór treningowy, walidacyjny i testowy w proporcji 70:15:15.

In [None]:
# 70 / 15 / 15 split into train, test and validation sets.
# The classes are imbalanced, so both splits are stratified on the label; this keeps the
# class ratio the same in all three subsets instead of leaving it to chance.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_preprocessed, y, test_size=0.3, random_state=101, stratify=y
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=101, stratify=y_holdout
)

In [None]:
# Sizes of the train, validation and test sets.
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

## Oversampling

Jak zauważono na początku klasy są niezbalansowane, więc zdecydowano się na oversampling wpisów neutralnych dla zbioru treningowego.

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the training set only, by randomly oversampling it.
# fit_resample is given 2-D inputs, hence the reshape into single-column arrays.
X_train_reshaped = X_train.to_numpy().reshape(-1, 1)
y_train_reshaped = np.asarray(y_train).reshape(-1, 1)

oversampler = RandomOverSampler(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train_reshaped, y_train_reshaped)

# Back to flat Series for the following cells.
X_train = pd.Series(X_train.ravel())
y_train = pd.Series(y_train.ravel())

In [None]:
# Class counts of the training labels after oversampling.
sns.countplot(x=y_train)
plt.title("Liczność klas")
plt.show()

In [None]:
# Convert the label containers to TensorFlow tensors.
y_train = tf.convert_to_tensor(y_train)
y_val = tf.convert_to_tensor(y_val)
y_test = tf.convert_to_tensor(y_test)

## Word embedding

In [None]:
# The model trained below is the pretrained 'distilbert-base-uncased' checkpoint. Its
# embedding table is indexed by the ids of its own WordPiece vocabulary, so the texts must
# be encoded with the matching DistilBertTokenizer. Ids from a Keras Tokenizer fitted on
# this corpus would point at unrelated embeddings.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def _encode_for_distilbert(texts, max_length=128):
    """Encode texts into fixed-length DistilBERT inputs.

    Args:
        texts: Iterable of strings.
        max_length: Every sequence is padded or truncated to this many tokens.

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` tensors. The attention
        mask lets the model ignore padding positions.
    """
    encoded = tokenizer(
        [str(text) for text in texts],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }


X_train = _encode_for_distilbert(X_train)
X_val = _encode_for_distilbert(X_val)
X_test = _encode_for_distilbert(X_test)

vocabulary_size = tokenizer.vocab_size

# 5. Modelowanie

In [None]:
def plot_training_history(model_history):
    """Plot training and validation loss per epoch.

    Args:
        model_history: Either the ``History`` object returned by ``model.fit``
            or its ``history`` dict. It must contain the ``'loss'`` and
            ``'val_loss'`` series.
    """
    # model.fit returns a History object, which is not subscriptable; the
    # per-epoch values live in its ``history`` dict. A plain dict is used as is.
    history = getattr(model_history, 'history', model_history)
    loss = history['loss']
    validation_loss = history['val_loss']
    epochs = range(1, len(loss) + 1)

    plt.figure(figsize=(9, 5))
    plt.title('Wartość straty w zależności od epoki')
    plt.plot(epochs, validation_loss, 'r', label='Wartość straty dla zbioru walidacyjnego')
    plt.plot(epochs, loss, 'b', label='Wartość straty dla zbioru treningowego')
    plt.ylabel('Wartość straty')
    plt.xlabel('Epoka')
    plt.grid(True)
    plt.legend()

    # One tick per epoch.
    plt.xticks(epochs)

    plt.show()

def plot_accuracy_history(model_history):
    """Plot training and validation accuracy per epoch.

    Args:
        model_history: Either the ``History`` object returned by ``model.fit``
            or its ``history`` dict. It must contain the ``'accuracy'`` and
            ``'val_accuracy'`` series.
    """
    # model.fit returns a History object, which is not subscriptable; the
    # per-epoch values live in its ``history`` dict. A plain dict is used as is.
    history = getattr(model_history, 'history', model_history)
    accuracy = history['accuracy']
    validation_accuracy = history['val_accuracy']
    epochs = range(1, len(accuracy) + 1)

    plt.figure(figsize=(9, 5))
    plt.title('Dokładność w zależności od epoki')
    plt.plot(epochs, validation_accuracy, 'r', label='Dokładność dla zbioru walidacyjnego')
    plt.plot(epochs, accuracy, 'b', label='Dokładność dla zbioru treningowego')
    plt.ylabel('Dokładność')
    plt.xlabel('Epoka')
    plt.legend()
    plt.grid(True)

    # One tick per epoch.
    plt.xticks(epochs)

    plt.show()


## Fine-tuning modelu języka

In [None]:
# Stop when validation loss has not improved for 3 epochs and restore the best weights.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

ft_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# Fine-tuning a pretrained transformer needs a small learning rate. The string 'adam'
# uses Keras' default of 1e-3, which is large enough to wreck the pretrained weights.
# The model outputs raw logits, hence from_logits=True.
ft_model.compile(optimizer=Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# NOTE(review): X_train / X_val must hold ids produced by the tokenizer that matches this
# checkpoint - confirm against the encoding cell.
ft_model_history = ft_model.fit(X_train, y_train, epochs=20,
                                validation_data=(X_val, y_val), batch_size=16,
                                callbacks=[callback])


In [None]:
# fit() returns a History object; the per-epoch metrics are in its .history dict.
plot_training_history(ft_model_history.history)

In [None]:
# fit() returns a History object; the per-epoch metrics are in its .history dict.
plot_accuracy_history(ft_model_history.history)

# 6. Ewaluacja


In [None]:
def prepare_confusion_matrix(preds, title, y_true=None):
    """Draw the confusion matrix of binary predictions as a heat map.

    Args:
        preds: Predicted scores or class ids; they are rounded and cast to int32.
        title: Title shown above the heat map.
        y_true: True labels. Defaults to the notebook-level ``y_test``.
    """
    if y_true is None:
        y_true = y_test

    class_names = ['neutralny', 'agresywny']
    predictions_result = tf.cast(tf.squeeze(tf.round(preds)), dtype=tf.int32)
    # labels=[0, 1] fixes the matrix to 2x2 even when one class never occurs;
    # otherwise its shape would not match the two row/column names below.
    conf = confusion_matrix(y_true, predictions_result, labels=[0, 1])
    cm = pd.DataFrame(
        conf,
        index=[f'Wpis w rzeczywistości {name}' for name in class_names],
        columns=[f'Wpis przewidziany jako {name}' for name in class_names],
    )
    sns.heatmap(cm, annot=True, fmt="d", cmap='Blues')
    plt.ylabel('Rzeczywista wartość')
    plt.xlabel('Przewidziana wartość')
    plt.title(title)
    plt.show()

def prepare_class_report(preds, message, y_true=None):
    """Print a header followed by the classification report for binary predictions.

    Args:
        preds: Predicted scores or class ids; they are rounded and cast to int32.
        message: Text printed before the report.
        y_true: True labels. Defaults to the notebook-level ``y_test``.
    """
    if y_true is None:
        y_true = y_test

    print(message)
    predictions_result = tf.cast(tf.squeeze(tf.round(preds)), dtype=tf.int32)
    # labels=[0, 1] keeps the two target names valid even when one class is
    # absent from both the true labels and the predictions.
    print(classification_report(y_true, predictions_result, labels=[0, 1],
                                target_names=['Neutralny', 'Agresywny']))


## Model języka

In [None]:
# Run the fine-tuned model on the test inputs; the bare name displays the raw model output.
ft_y_pred = ft_model.predict(X_test)
ft_y_pred

In [None]:
# Turn the logits into class ids. This cell overwrites ft_y_pred, so the guard keeps it
# re-runnable: after the first run ft_y_pred already holds class ids and has no .logits.
if hasattr(ft_y_pred, "logits"):
    ft_y_pred = tf.argmax(ft_y_pred.logits, axis=1).numpy()
report = classification_report(y_test, ft_y_pred)
print(report)

In [None]:
# Confusion matrix for the fine-tuned language model on the test set.
# NOTE(review): the title ends in "językowegp" - looks like a typo for "językowego"; confirm before changing the string.
prepare_confusion_matrix(ft_y_pred, 'Macierz omyłek dla dużego modelu językowegp')