<a href="https://colab.research.google.com/github/javmencia/COBWEBfiles/blob/main/Bert2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import VotingClassifier
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import os
import pandas as pd
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk

from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [15]:

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def mark_links_binary(text):
    text = text.encode('ascii', 'ignore').decode()

    # Regex patterns
    youtube_pattern = r'(?:https?://)?(?:www\.)?(?:youtube\.com|youtu\.be)'
    external_pattern = r'\b(?:http[s]?://|www\.)\S+\.(?:com|net|org|io|co|biz|info|me|us|ca|edu|gov|tv|uk|au|de|se|in|jp|cn|ru|fr|es|it|nl)(/\S*)?\b'

    contains_youtube_link = 0
    contains_external_link = 0

    # Check for YouTube links
    if re.search(youtube_pattern, text):
        contains_youtube_link = 1

    # Check for external links (excluding YouTube)
    external_links = re.findall(external_pattern, text)
    if any("youtube.com" not in url and "youtu.be" not in url for url in external_links):
        contains_external_link = 1

    return contains_youtube_link, contains_external_link

def is_person_name(author):
    # Either First Last or just First
    person_pattern = r'^[A-Z][a-z]+(\s[A-Z][a-z]+)?$'

    if re.match(person_pattern, author):
        return 1
    else:
        return 0

# For spam comments

def clean_and_tokenize(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS and word not in stopwords.words('english')]
    return tokens

def count_top_spam_words(comment, spam_words):
    comment = comment.lower()
    count = sum([1 for word in spam_words if re.search(rf'\b{re.escape(word)}\b', comment)])
    return count


In [29]:

# Load Training Data
train_data = pd.read_csv("train.csv") #"../detect-spam-youtube-comment/train.csv")
train_data['CONTENT'] = train_data['CONTENT'].str.replace('\ufeff', '', regex=False)
train_data['CLASS'] = train_data['CLASS'].astype(int)

texts = train_data['CONTENT']
labels = train_data['CLASS']
video_names = train_data['VIDEO_NAME']
authors = train_data['AUTHOR']
train_data['is_date_na'] = train_data['DATE'].isna().astype(int)  # 1 if DATE is NA, 0 otherwise

# Cleaning
train_data['contains_youtube_link'], train_data['contains_external_link'] = zip(*train_data['CONTENT'].apply(mark_links_binary))
train_data['caps_count'] = train_data['CONTENT'].str.count(r'[A-Z]')
train_data['content_length'] = train_data['CONTENT'].str.len()
train_data['is_person_name'] = train_data['AUTHOR'].apply(is_person_name)

# Load Testing Data
test_data = pd.read_csv("test.csv") #"../detect-spam-youtube-comment/test.csv")
test_data['CONTENT'] = test_data['CONTENT'].str.replace('\ufeff', '', regex=False)
video_names_test = test_data['VIDEO_NAME']
authors_test = test_data['AUTHOR']
test_data['is_date_na'] = test_data['DATE'].isna().astype(int)  # 1 if DATE is NA, 0 otherwise

# Cleaning
test_data['contains_youtube_link'], test_data['contains_external_link'] = zip(*test_data['CONTENT'].apply(mark_links_binary))
test_data['caps_count'] = test_data['CONTENT'].str.count(r'[A-Z]')
test_data['content_length'] = test_data['CONTENT'].str.len()
test_data['is_person_name'] = test_data['AUTHOR'].apply(is_person_name)

# SPAM words counter

spam_comments = train_data[train_data['CLASS'] == 1]['CONTENT']
spam_words = spam_comments.apply(clean_and_tokenize).sum()
word_freq = Counter(spam_words)
top5spam = [word for word, count in word_freq.most_common(5)]
print("Top 5 Spam Words:", top5spam)

train_data['containstop5spam'] = train_data['CONTENT'].apply(lambda x: count_top_spam_words(x, top5spam))
test_data['containstop5spam'] = test_data['CONTENT'].apply(lambda x: count_top_spam_words(x, top5spam))



Top 5 Spam Words: ['check', 'video', 'subscribe', 'youtube', 'channel']


In [30]:


# Load BERT Model
model = SentenceTransformer('all-mpnet-base-v2')

# Generate Embeddings for Training Data
content_embeddings_train = model.encode(texts.tolist(), show_progress_bar=True)
author_embeddings_train = model.encode(authors.tolist(), show_progress_bar=True)
video_name_embeddings_train = model.encode(video_names.tolist(), show_progress_bar=True)

combined_embeddings_train = np.array([
    np.hstack((content, author, video,
               [train_data['is_date_na'].iloc[i],
                train_data['caps_count'].iloc[i],
                train_data['content_length'].iloc[i]],
               train_data['contains_youtube_link'].iloc[i],
               train_data['contains_external_link'].iloc[i],
               train_data['is_person_name'].iloc[i],
               train_data['containstop5spam'].iloc[i]
               ))
    for i, (content, author, video) in enumerate(zip(content_embeddings_train, author_embeddings_train, video_name_embeddings_train))
])

# Generate Embeddings for Testing Data
content_embeddings_test = model.encode(test_data['CONTENT'].tolist(), show_progress_bar=True)
author_embeddings_test = model.encode(authors_test.tolist(), show_progress_bar=True)
video_name_embeddings_test = model.encode(video_names_test.tolist(), show_progress_bar=True)

combined_embeddings_test = np.array([
    np.hstack((content, author, video,
               [test_data['is_date_na'].iloc[i],
                test_data['caps_count'].iloc[i],
                test_data['content_length'].iloc[i]],
               test_data['contains_youtube_link'].iloc[i],
               test_data['contains_external_link'].iloc[i],
               test_data['is_person_name'].iloc[i],
               test_data['containstop5spam'].iloc[i]
               ))
    for i, (content, author, video) in enumerate(zip(content_embeddings_test, author_embeddings_train, video_name_embeddings_train))
])

# Define Voting Classifier
voting_clf = VotingClassifier(
            estimators=[
                        ('logistic', LogisticRegression(max_iter=1000, random_state=42)),
                        ('svm', SVC(kernel='linear', C=1, probability=True, random_state=42)),
                        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
                        ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
            ],
            voting='soft'  # Use 'soft' for probabilities, 'hard' for majority vote
            )




Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

# Results

## Original

Code that we submitted last night

In [5]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform Cross-Validation Predictions
kf_predictions = cross_val_predict(
            voting_clf, combined_embeddings_train, labels,
                cv=kf, n_jobs=-1  # Parallel processing for efficiency
                )

# Compute Confusion Matrix
kf_conf_matrix = confusion_matrix(labels, kf_predictions)

# Compute Accuracy
kf_accuracy = accuracy_score(labels, kf_predictions)

# Print Results
print("Confusion Matrix (Stratified K-Fold):\n", kf_conf_matrix)
print("\nClassification Report (Stratified K-Fold):\n", classification_report(labels, kf_predictions))
print("\nStratified K-Fold Accuracy:", kf_accuracy)

# Train Voting Classifier on Full Training Data
voting_clf.fit(combined_embeddings_train, labels)



Confusion Matrix (Stratified K-Fold):
 [[648  11]
 [ 26 684]]

Classification Report (Stratified K-Fold):
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       659
           1       0.98      0.96      0.97       710

    accuracy                           0.97      1369
   macro avg       0.97      0.97      0.97      1369
weighted avg       0.97      0.97      0.97      1369


Stratified K-Fold Accuracy: 0.972972972972973


Parameters: { "use_label_encoder" } are not used.



# With containstop5spam

Added this variable but performance got a bit worse

In [22]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform Cross-Validation Predictions
kf_predictions = cross_val_predict(
            voting_clf, combined_embeddings_train, labels,
                cv=kf, n_jobs=-1  # Parallel processing for efficiency
                )

# Compute Confusion Matrix
kf_conf_matrix = confusion_matrix(labels, kf_predictions)

# Compute Accuracy
kf_accuracy = accuracy_score(labels, kf_predictions)

# Print Results
print("Confusion Matrix (Stratified K-Fold):\n", kf_conf_matrix)
print("\nClassification Report (Stratified K-Fold):\n", classification_report(labels, kf_predictions))
print("\nStratified K-Fold Accuracy:", kf_accuracy)

# Train Voting Classifier on Full Training Data
voting_clf.fit(combined_embeddings_train, labels)



Confusion Matrix (Stratified K-Fold):
 [[647  12]
 [ 28 682]]

Classification Report (Stratified K-Fold):
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       659
           1       0.98      0.96      0.97       710

    accuracy                           0.97      1369
   macro avg       0.97      0.97      0.97      1369
weighted avg       0.97      0.97      0.97      1369


Stratified K-Fold Accuracy: 0.970781592403214


Parameters: { "use_label_encoder" } are not used.



## With regularization on original code

Basically the same results as the original

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Define Stratified K-Fold Cross-Validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define Voting Classifier with Penalization
voting_clf = VotingClassifier(
    estimators=[
        # Logistic Regression with L2 (ridge) regularization
        ('logistic', LogisticRegression(max_iter=1000, penalty='l2', C=1.0, random_state=42)),

        # Support Vector Classifier with L2 regularization (default for SVC)
        ('svm', SVC(kernel='linear', C=1, probability=True, random_state=42)),

        # XGBoost with default regularization (lambda = 1 for L2)
        ('xgb', XGBClassifier(
            use_label_encoder=False,
            eval_metric='logloss',
            reg_lambda=1.0,  # L2 regularization
            reg_alpha=0.0,   # No L1 regularization
            random_state=42
        )),

        # Random Forest with implicit penalization through depth control
        ('rf', RandomForestClassifier(
            n_estimators=100,
            max_depth=10,  # Limit tree depth to prevent overfitting
            min_samples_split=5,  # Increase minimum samples for splitting
            random_state=42
        ))
    ],
    voting='soft'  # Use soft voting for probability averaging
)

# Perform Cross-Validation Predictions
kf_predictions = cross_val_predict(
    voting_clf, combined_embeddings_train, labels, cv=kf, n_jobs=-1
)

# Compute Confusion Matrix
kf_conf_matrix = confusion_matrix(labels, kf_predictions)

# Compute Accuracy
kf_accuracy = accuracy_score(labels, kf_predictions)

# Print Results
print("Confusion Matrix (Stratified K-Fold):\n", kf_conf_matrix)
print("\nClassification Report (Stratified K-Fold):\n", classification_report(labels, kf_predictions))
print("\nStratified K-Fold Accuracy:", kf_accuracy)

# Train Voting Classifier on Full Training Data
voting_clf.fit(combined_embeddings_train, labels)

# Make Predictions on Test Data
test_predictions = voting_clf.predict(combined_embeddings_test)

# Output Predictions to CSV
output = pd.DataFrame({'COMMENT_ID': test_data['COMMENT_ID'], 'CLASS': test_predictions})
output.to_csv("penalized_test_predictions.csv", index=False)


Confusion Matrix (Stratified K-Fold):
 [[648  11]
 [ 27 683]]

Classification Report (Stratified K-Fold):
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       659
           1       0.98      0.96      0.97       710

    accuracy                           0.97      1369
   macro avg       0.97      0.97      0.97      1369
weighted avg       0.97      0.97      0.97      1369


Stratified K-Fold Accuracy: 0.9722425127830533


Parameters: { "use_label_encoder" } are not used.



## Lasso

Performs worse, i don't really know what the features are

In [31]:
# Define Stratified K-Fold for Cross-Validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

lasso_clf = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features
    ('lasso_cv', LogisticRegressionCV(
        penalty='l1',  # L1 regularization for feature selection
        solver='liblinear',  # Use liblinear solver
        cv=kf,  # Cross-validation
        max_iter=1000,
        random_state=42,
        Cs=10**np.linspace(0, 4, 20)  # Reduce regularization strength
    ))
])

# Fit Lasso with Cross-Validation
lasso_clf.fit(combined_embeddings_train, labels)

# Extract selected features
lasso_model = lasso_clf.named_steps['lasso_cv']
selected_features = np.where(np.any(lasso_model.coef_ != 0, axis=0))[0]

print("Number of Selected Features:", len(selected_features))
print("Selected Feature Indices:", selected_features)

# Evaluate on Training Data (Cross-Validation Accuracy)
kf_predictions = cross_val_predict(lasso_clf, combined_embeddings_train, labels, cv=kf, n_jobs=-1)

# Confusion Matrix and Accuracy
from sklearn.metrics import confusion_matrix
kf_conf_matrix = confusion_matrix(labels, kf_predictions)
kf_accuracy = accuracy_score(labels, kf_predictions)

print("\nConfusion Matrix (Cross-Validation):\n", kf_conf_matrix)
print("\nClassification Report (Cross-Validation):\n", classification_report(labels, kf_predictions))
print("\nCross-Validation Accuracy:", kf_accuracy)


Number of Selected Features: 249
Selected Feature Indices: [   3    6   14   19   25   27   29   32   33   35   41   65   66   84
   86   89   94   95  107  118  122  132  134  140  152  153  155  157
  169  176  183  186  188  192  194  197  199  201  203  211  218  221
  222  224  232  237  238  246  249  270  272  278  296  298  300  317
  318  320  322  333  341  343  344  353  362  363  368  369  370  375
  381  382  383  388  400  405  406  410  416  418  437  438  446  455
  457  463  472  479  480  482  485  488  491  508  511  513  521  525
  548  560  566  567  568  577  578  581  583  584  592  597  598  601
  604  605  610  611  620  622  629  630  631  634  648  650  655  657
  662  664  670  672  673  676  681  687  699  700  701  706  710  711
  716  722  728  756  758  766  780  785  791  793  799  834  841  846
  864  869  884  897  900  903  907  909  911  913  916  918  932  933
  938  952  957  975  997 1008 1009 1010 1022 1032 1050 1077 1079 1101
 1111 1127 1141 11

## variable importance

In [33]:
# Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(combined_embeddings_train, labels)

# Get feature importance
importances = rf.feature_importances_
feature_names = ['is_date_na', 'caps_count', 'content_length',
                 'contains_youtube_link', 'contains_external_link',
                 'is_person_name', 'containstop5spam']
# Assuming the last few features are your specified ones:
selected_features_importances = importances[-len(feature_names):]

# Print results
for name, importance in zip(feature_names, selected_features_importances):
    print(f"{name}: {importance}")


is_date_na: 0.006565963856232835
caps_count: 0.0041531569066052265
content_length: 0.010927052717944717
contains_youtube_link: 5.157909648794955e-05
contains_external_link: 0.008201523484206557
is_person_name: 0.0
containstop5spam: 0.010947828228453646


### checking coefficients of logistic regression

In [35]:
logistic = LogisticRegression(max_iter=1000, random_state=42)
logistic.fit(combined_embeddings_train, labels)

coefficients = logistic.coef_[0][-7:]
for name, coef in zip(feature_names, coefficients):
    print(f"{name}: {coef}")


is_date_na: 3.1830482488123404
caps_count: 0.017403620815376112
content_length: 0.011951594487397625
contains_youtube_link: -1.0657277952042903
contains_external_link: 3.6223452340776143
is_person_name: -0.16593924574328467
containstop5spam: 1.4886175596775053
