Preprocessing FIFA World Cup 2022 tweets

In [None]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

# Load the labelled tweets; later cells read the 'Tweet' and 'Sentiment' columns.
file_path = 'fifa_world_cup_2022_tweets.csv'
tweets_df = pd.read_csv(file_path)

# Fetch the NLTK data needed by word_tokenize, pos_tag and WordNetLemmatizer.
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older resource names, so both generations are requested.
# NOTE(review): on older NLTK versions the newer ids are presumably just
# reported as not found by nltk.download - confirm against the pinned version.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

def clean_tweet_text(text):
    """Return a tweet reduced to ASCII letters separated by single spaces.

    URLs and @mentions are removed entirely; for hashtags only the '#' sign
    is dropped, so the tag word itself is kept. Every remaining character
    that is not an ASCII letter or whitespace is deleted.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN or None coming from
            empty CSV cells) are treated as an empty tweet.

    Returns:
        The cleaned string; empty when nothing is left or the input was not a
        string.
    """
    # pandas represents missing cells as float NaN, which re.sub cannot handle.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)  # remove URLs (before letters are filtered)
    text = re.sub(r'@\w+', '', text)  # remove user mentions
    text = re.sub(r'#', '', text)  # drop the hashtag sign, keep the word
    text = re.sub(r'\n', ' ', text)  # replace newlines with spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # collapse redundant whitespace
    return text

# Clean every raw tweet and keep the result in a separate column so the
# original text stays available for comparison.
tweets_df['Cleaned Tweet'] = tweets_df['Tweet'].apply(clean_tweet_text)
print("\nData after cleaning:")
print(tweets_df[['Tweet', 'Cleaned Tweet']].head())

# Single lemmatizer instance shared by tokenize_and_lemmatize below.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Only the first character of the tag is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag on the noun fallback.
    return pos_by_prefix.get(treebank_tag[:1], wordnet.NOUN)

def tokenize_and_lemmatize(text):
    """Tokenize ``text``, POS-tag it and return the lemmas joined by spaces.

    Each token is lemmatized with the WordNet POS derived from its Treebank
    tag via ``get_wordnet_pos``.
    """
    lemmas = []
    for token, treebank_tag in pos_tag(word_tokenize(text)):
        wordnet_pos = get_wordnet_pos(treebank_tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return ' '.join(lemmas)

# Lemmatize the cleaned text; the result is stored as a space-joined string
# so it can be fed directly to a vectorizer later.
tweets_df['Tokenized and Lemmatized Tweet'] = tweets_df['Cleaned Tweet'].apply(tokenize_and_lemmatize)
print("\nData after tokenization and lemmatization:")
print(tweets_df[['Cleaned Tweet', 'Tokenized and Lemmatized Tweet']].head())

# Convert the textual sentiment labels to integer codes.
# label_encoder is kept because later cells use label_encoder.classes_ to
# name the classes in reports and plots.
label_encoder = LabelEncoder()
tweets_df['Sentiment Code'] = label_encoder.fit_transform(tweets_df['Sentiment'])

Sentiment distribution and WordClouds for the dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud  # provides the WordCloud class used below

# Bar chart of how many tweets carry each sentiment label.
plt.figure(figsize=(8, 6))
sns.countplot(x='Sentiment', data=tweets_df, palette='viridis')
plt.title('Distribution of Sentiment')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

# Distinct sentiment labels present in the data, in order of first appearance.
sentiments = tweets_df['Sentiment'].dropna().unique()
selected_sentiments = sentiments[:3]

# One word cloud per sentiment, built from the lemmatized tweets of that class.
for sentiment in selected_sentiments:
    plt.figure(figsize=(8, 6))
    subset = tweets_df[tweets_df['Sentiment'] == sentiment]
    text = " ".join(review for review in subset['Tokenized and Lemmatized Tweet'])
    wordcloud = WordCloud(background_color="white", max_words=200, contour_width=3, contour_color='steelblue').generate(text)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'WordCloud for {sentiment} Sentiment')
    plt.axis("off")
    plt.show()

Training models without lemmatization and tokenization

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    tweets_df['Cleaned Tweet'],  # features: cleaned (not lemmatized) text
    tweets_df['Sentiment Code'],  # target: integer sentiment code
    test_size=0.2,  # size of the test set
    random_state=42  # seed, so the split is reproducible
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialise the TF-IDF vectorizer, capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# Fit on the training split only; the test split is transformed with the
# vocabulary learned from training data.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Naive Bayes without lemmatization and tokenization

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train a multinomial Naive Bayes classifier on the TF-IDF features.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Predict on the held-out test split.
y_pred = nb_classifier.predict(X_test_tfidf)

# Report overall accuracy and per-class metrics, with classes named after
# the original sentiment labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Logistic Regression without lemmatization and tokenization

In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression on the TF-IDF features; max_iter is raised to
# 1000 and the seed is fixed.
log_reg_classifier = LogisticRegression(max_iter=1000, random_state=42)
log_reg_classifier.fit(X_train_tfidf, y_train)

# Predict on the held-out test split and score the predictions.
y_pred_log_reg = log_reg_classifier.predict(X_test_tfidf)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)

print(f"Accuracy: {accuracy_log_reg}")
print(classification_report(y_test, y_pred_log_reg, target_names=label_encoder.classes_))

Support Vector Machines without lemmatization and tokenization

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Train a support vector classifier with a linear kernel on the TF-IDF features.
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train_tfidf, y_train)

# Predict on the held-out test split and score the predictions.
y_pred_svm = svm_classifier.predict(X_test_tfidf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy_svm}")
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

BERT 

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
# AdamW is taken from PyTorch: the copy exported by transformers is deprecated
# and is not available in recent transformers releases.
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every cleaned tweet (truncated to 512 token ids) and right-pad all
# sequences with 0 to the length of the longest one.
encoded_data = [tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True) for text in tweets_df['Cleaned Tweet']]
max_len = max([len(sent) for sent in encoded_data])
input_ids = [sent + [0] * (max_len - len(sent)) for sent in encoded_data]

# Attention mask: 1.0 for real tokens, 0.0 for the zero padding added above.
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(tweets_df['Sentiment Code'].values).long()

# Split into training and validation sets. Ids, masks and labels are split in
# ONE call so the three stay row-aligned by construction rather than by
# relying on two separate calls sharing the same seed.
(
    train_inputs, validation_inputs,
    train_masks, validation_masks,
    train_labels, validation_labels,
) = train_test_split(input_ids, attention_masks, labels, random_state=42, test_size=0.2)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,  # one output per sentiment class
    output_attentions=False,
    output_hidden_states=False,
)

# Optimizer and scheduler setup. weight_decay=0.0 is passed explicitly
# because torch.optim.AdamW defaults to 0.01.
# NOTE(review): 0.0 is meant to match the previous transformers.AdamW
# default - confirm.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Model training
for epoch_i in range(0, epochs):
    print(f"{'='*8} Epoch {epoch_i + 1} / {epochs} {'='*8}")

    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_labels = batch[2]

        model.zero_grad()
        # Passing labels makes the model return the loss with its outputs.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

print("Training complete.")

In [None]:
import matplotlib.pyplot as plt

# Named epoch_numbers (not `epochs`) so the integer `epochs` used by the
# training loop is not overwritten when this cell runs.
epoch_numbers = [1, 2, 3, 4]
# NOTE(review): hard-coded values, presumably copied from the output of a
# previous training run - update after retraining.
training_losses = [0.5781036733593851, 0.34983963131528545, 0.2245372372976143, 0.15559972962364554]

plt.figure(figsize=(8, 5))
plt.plot(epoch_numbers, training_losses, marker='o', linestyle='-', color='blue')
plt.title('Training Loss per Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(visible=True)
plt.xticks(epoch_numbers)  # one tick per epoch
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Wrap the validation tensors in a loader that preserves their order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)

model.eval()

predictions, true_labels = [], []

for b_input_ids, b_input_mask, b_labels in validation_dataloader:
    # No labels are passed and gradients are disabled: only the logits are needed.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    predictions.append(outputs.logits.detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

# Stitch the per-batch arrays together; the predicted class is the arg-max logit.
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
predicted_labels = np.argmax(predictions, axis=1)

print("Validation Accuracy: {:.2f}".format(accuracy_score(true_labels, predicted_labels)))
# Class names come from the encoder that produced 'Sentiment Code', the same
# way the other classifier reports in this notebook name their classes.
print("\nClassification Report:\n", classification_report(true_labels, predicted_labels, target_names=label_encoder.classes_))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np  

# Confusion matrix of the BERT predictions on the validation set.
conf_mat_bert = confusion_matrix(true_labels, predicted_labels)

# Heat map with integer cell annotations.
# NOTE(review): tick labels are hard-coded; assumes codes 0/1/2 correspond to
# Negative/Neutral/Positive - confirm against label_encoder.classes_.
plt.figure(figsize=(10, 7))
sns.heatmap(conf_mat_bert, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Neutral', 'Positive'], yticklabels=['Negative', 'Neutral', 'Positive'])
plt.title('Confusion Matrix for BERT Classifier')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Directory for the fine-tuned model and its tokenizer. It must be exactly
# the path passed to from_pretrained() when the model is reloaded
# ("/Plocha/bp"); a trailing space would name a different directory.
model_save_path = "/Plocha/bp"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

Training models with lemmatization and tokenization

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    tweets_df['Tokenized and Lemmatized Tweet'],  # features: lemmatized text
    tweets_df['Sentiment Code'],  # target: integer sentiment code
    test_size=0.2,  # size of the test set
    random_state=42  # seed, so the split is reproducible
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features capped at 5000 terms; fitted on the training split only and
# then applied unchanged to the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Naive Bayes with lemmatization and tokenization

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train a multinomial Naive Bayes classifier on the TF-IDF features.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Predict on the test split and report accuracy plus per-class metrics.
y_pred = nb_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the Naive Bayes predictions on the test split.
conf_mat = confusion_matrix(y_test, y_pred)

# Heat map with integer cell annotations; axes are labelled with the encoder's
# class names.
plt.figure(figsize=(10, 7))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix for Naive Bayes')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Logistic Regression with lemmatization and tokenization

In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression on the TF-IDF features; max_iter is raised to
# 1000 and the seed is fixed.
log_reg_classifier = LogisticRegression(max_iter=1000, random_state=42)
log_reg_classifier.fit(X_train_tfidf, y_train)
# Predict on the test split and report accuracy plus per-class metrics.
y_pred_log_reg = log_reg_classifier.predict(X_test_tfidf)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f"Accuracy: {accuracy_log_reg}")
print(classification_report(y_test, y_pred_log_reg, target_names=label_encoder.classes_))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the logistic regression predictions on the test split.
conf_mat_log_reg = confusion_matrix(y_test, y_pred_log_reg)

# Heat map with integer cell annotations; axes are labelled with the encoder's
# class names.
plt.figure(figsize=(10, 7))
sns.heatmap(conf_mat_log_reg, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix for Logistic Regression')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Support Vector Machines with lemmatization and tokenization

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Train a support vector classifier with a linear kernel on the TF-IDF features.
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train_tfidf, y_train)

# Predict on the test split and report accuracy plus per-class metrics.
y_pred_svm = svm_classifier.predict(X_test_tfidf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy_svm}")
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the SVM predictions on the test split.
conf_mat_svm = confusion_matrix(y_test, y_pred_svm)

# Heat map with integer cell annotations; axes are labelled with the encoder's
# class names.
plt.figure(figsize=(10, 7))
sns.heatmap(conf_mat_svm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix for SVM Classifier')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Preprocessing WWC23

In [None]:
import pandas as pd

# Scraper exports that together make up the WWC23 tweet set.
file_paths = [
    'dataset_tweet-scraper_2024-03-27_20-40-06-746.csv',
    'dataset_tweet-scraper_2024-03-27_20-39-24-804.csv',
    'dataset_tweet-scraper_2024-03-27_20-38-14-709.csv',
    'dataset_tweet-scraper_2024-03-27_20-37-19-059.csv',
    'dataset_tweet-scraper_2024-03-27_20-36-29-379.csv',
    'dataset_tweet-scraper_2024-03-27_20-35-46-867.csv',
    'dataset_tweet-scraper_2024-03-27_20-35-15-063.csv',
    'dataset_tweet-scraper_2024-03-27_20-34-05-633.csv',
    'dataset_tweet-scraper_2024-03-25_21-20-38-158.csv',
    'dataset_tweet-scraper_2024-03-25_20-55-59-951.csv',
]

# Read every export and stack them into one frame with a fresh 0..n-1 index.
dataframes = [pd.read_csv(path) for path in file_paths]
merged_df = pd.concat(dataframes, ignore_index=True)

from nltk.corpus import stopwords
import nltk
import re
nltk.download('stopwords')

# Detect the column holding the tweet text: prefer a column named "text",
# otherwise fall back to the first object-dtype column.
if "text" in merged_df.columns:
    text_column = "text"
else:
    text_column = next(
        (col for col in merged_df.columns if merged_df[col].dtype == "object"),
        None,
    )
    if text_column is None:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError("No text column found in the merged tweet data")

import functools


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return the English NLTK stop words as a set, loaded once on first use."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a tweet: lowercase, strip URLs/mentions/'#', drop stop words.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN or None coming from
            empty CSV cells) are treated as an empty tweet.

    Returns:
        The remaining words joined by single spaces; empty when nothing is
        left or the input was not a string.
    """
    # pandas represents missing cells as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()  # convert the text to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # remove user mentions and the hashtag sign
    text = re.sub(r'[^\w\s]', '', text)  # remove special characters
    # The stop-word set is cached, so it is not rebuilt for every tweet.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Store the preprocessed text next to the original column, then show the
# first rows of both (notebook cell output).
merged_df[text_column + '_preprocessed'] = merged_df[text_column].apply(preprocess_text)
merged_df[[text_column, text_column + '_preprocessed']].head()

Adding sentiments to tweets

In [None]:
import nltk
import re
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder

def sentiment_analysis(tweet_df):
    """Add TextBlob sentiment columns to ``tweet_df`` in place and return it.

    Reads the ``<text_column>_preprocessed`` column (``text_column`` is the
    notebook-level variable) and writes three columns:

    * ``TextBlob_Subjectivity`` - TextBlob subjectivity score
    * ``TextBlob_Polarity`` - TextBlob polarity score
    * ``TextBlob_Analysis`` - 'Negative' / 'Neutral' / 'Positive' for
      polarity < 0 / == 0 / > 0

    Args:
        tweet_df: DataFrame containing the preprocessed text column.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    def getAnalysis(score):
        """Turn a polarity score into a sentiment label by its sign."""
        if score < 0:
            return 'Negative'
        elif score == 0:
            return 'Neutral'
        else:
            return 'Positive'

    # Build one TextBlob per tweet and read both scores from it, instead of
    # parsing every tweet twice.
    scores = [TextBlob(text).sentiment for text in tweet_df[text_column + '_preprocessed']]
    tweet_df['TextBlob_Subjectivity'] = [score.subjectivity for score in scores]
    tweet_df['TextBlob_Polarity'] = [score.polarity for score in scores]

    # Assign the sentiment label from the polarity.
    tweet_df['TextBlob_Analysis'] = tweet_df['TextBlob_Polarity'].apply(getAnalysis)
    return tweet_df

merged_df = sentiment_analysis(merged_df)

# Convert the textual sentiment labels to numeric codes.
# The encoder is fitted on the full, fixed label set rather than on the labels
# that happen to occur, so Negative/Neutral/Positive always map to 0/1/2 even
# if one class is absent from this data set. Shifted codes would no longer
# line up with the class names used when the predictions are evaluated.
label_encoder = LabelEncoder()
label_encoder.fit(['Negative', 'Neutral', 'Positive'])
merged_df['Sentiment Code'] = label_encoder.transform(merged_df['TextBlob_Analysis'])

merged_df[[text_column, text_column + '_preprocessed', 'TextBlob_Analysis', 'Sentiment Code']].head()

Applying trained BERT on WWC23

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score 
from torch.utils.data import DataLoader, TensorDataset

# Directory the fine-tuned model and tokenizer are loaded from.
model_path = "/Plocha/bp"

# Load the BERT tokenizer and model from local storage.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def encode_texts(texts):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Every sequence is padded to, and truncated at, 128 tokens so that all rows
    have the same length and can be stacked into a single tensor.

    Args:
        texts: List of strings to encode.

    Returns:
        The tokenizer's encoding, containing (among others) ``input_ids`` and
        ``attention_mask`` tensors.
    """
    encoding = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,  # without this, longer inputs keep their full length
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoding

# Data preparation: preprocessed texts and their integer sentiment codes.
texts = merged_df[text_column + '_preprocessed'].tolist()
labels = merged_df['Sentiment Code'].tolist()  

# Tokenize all texts at once and pull out the two tensors the model needs.
encoded_texts = encode_texts(texts)
input_ids = encoded_texts['input_ids']
attention_masks = encoded_texts['attention_mask']

labels = torch.tensor(labels)

# No sampler/shuffle is passed, so batches follow the row order of merged_df.
dataset = TensorDataset(input_ids, attention_masks, labels)
data_loader = DataLoader(dataset, batch_size=32)  

def predict(data_loader):
    """Run the notebook-level ``model`` over ``data_loader`` without gradients.

    Each batch is expected to unpack into (input ids, attention mask, labels);
    all three are moved to ``device`` before the forward pass.

    Returns:
        A ``(logits, labels)`` pair of NumPy arrays, each the concatenation of
        the per-batch results along axis 0.
    """
    model.eval()
    logit_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in data_loader:
            ids, mask, targets = (tensor.to(device) for tensor in batch)
            result = model(ids, token_type_ids=None, attention_mask=mask)
            # Bring everything back to the CPU as NumPy for later concatenation.
            logit_chunks.append(result.logits.detach().cpu().numpy())
            label_chunks.append(targets.to('cpu').numpy())

    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)
    return all_logits, all_labels

# Run the prediction; the predicted class is the arg-max over the logits.
predictions, real_values = predict(data_loader)
predicted_labels = np.argmax(predictions, axis=1)

# Accuracy of the model's predictions against the 'Sentiment Code' labels.
accuracy = accuracy_score(real_values, predicted_labels)
print("Accuracy:", accuracy)

# NOTE(review): class names are hard-coded; assumes codes 0/1/2 correspond to
# Negative/Neutral/Positive and that all three occur - confirm.
report = classification_report(real_values, predicted_labels, target_names=['Negative', 'Neutral', 'Positive'])
print(report)