<a href="https://colab.research.google.com/github/javmencia/COBWEBfiles/blob/main/STA314project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
from sklearn.ensemble import VotingClassifier
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix


In [22]:
import re
import pandas as pd

def mark_links_binary(text):
    text = text.encode('ascii', 'ignore').decode()

    # Regex patterns
    youtube_pattern = r'(?:https?://)?(?:www\.)?(?:youtube\.com|youtu\.be)'
    external_pattern = r'\b(?:http[s]?://|www\.)\S+\.(?:com|net|org|io|co|biz|info|me|us|ca|edu|gov|tv|uk|au|de|se|in|jp|cn|ru|fr|es|it|nl)(/\S*)?\b'

    contains_youtube_link = 0
    contains_external_link = 0

    # Check for YouTube links
    if re.search(youtube_pattern, text):
        contains_youtube_link = 1

    # Check for external links (excluding YouTube)
    external_links = re.findall(external_pattern, text)
    if any("youtube.com" not in url and "youtu.be" not in url for url in external_links):
        contains_external_link = 1

    return contains_youtube_link, contains_external_link

def is_person_name(author):
    # Either First Last or just First
    person_pattern = r'^[A-Z][a-z]+(\s[A-Z][a-z]+)?$'

    if re.match(person_pattern, author):
        return 1
    else:
        return 0

In [23]:

# Load Training Data
train_data = pd.read_csv("train.csv")
train_data['CONTENT'] = train_data['CONTENT'].str.replace('\ufeff', '', regex=False)
train_data['CLASS'] = train_data['CLASS'].astype(int)

texts = train_data['CONTENT']
labels = train_data['CLASS']
video_names = train_data['VIDEO_NAME']
authors = train_data['AUTHOR']
train_data['is_date_na'] = train_data['DATE'].isna().astype(int)  # 1 if DATE is NA, 0 otherwise

# Cleaning
train_data['contains_youtube_link'], train_data['contains_external_link'] = zip(*train_data['CONTENT'].apply(mark_links_binary))
train_data['caps_count'] = train_data['CONTENT'].str.count(r'[A-Z]')
train_data['content_length'] = train_data['CONTENT'].str.len()
train_data['is_person_name'] = train_data['AUTHOR'].apply(is_person_name)

# Load Testing Data
test_data = pd.read_csv("test.csv")
test_data['CONTENT'] = test_data['CONTENT'].str.replace('\ufeff', '', regex=False)
video_names_test = test_data['VIDEO_NAME']
authors_test = test_data['AUTHOR']
test_data['is_date_na'] = test_data['DATE'].isna().astype(int)  # 1 if DATE is NA, 0 otherwise


Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,VIDEO_NAME,CLASS,is_date_na,contains_youtube_link,contains_external_link,caps_count,content_length,is_person_name
10,11,Leonel Hernandez,2014-11-14 12:35:38,"Something to dance to, even if your sad JUST d...",PSY - GANGNAM STYLE(?????) M/V,0,0,1,0,23,115,1
11,12,Pedro Lombillo,2015-03-13 00:27:41.285,Check out this playlist on YouTube:,"LMFAO - Party Rock Anthem ft. Lauren Bennett, ...",1,0,0,0,3,35,1
12,13,FreshX,,Share this video.. This song can beat PSY - Ga...,Eminem - Love The Way You Lie ft. Rihanna,1,1,0,0,7,58,0
13,14,OverSpace33,2014-11-06 19:40:59,For Christmas Song visit my channel! ;),PSY - GANGNAM STYLE(?????) M/V,1,0,0,0,3,39,0
14,15,Adela Korman,,Check out my SEXY VIDEO :*,Eminem - Love The Way You Lie ft. Rihanna,1,1,0,0,10,26,1
15,16,TLouX music,2014-11-05 22:41:42,Add me here...https://www.facebook.com/TLouXmusic,PSY - GANGNAM STYLE(?????) M/V,1,0,0,1,4,49,0
16,17,asd ad,2014-01-21 08:22:06,psy=korean,PSY - GANGNAM STYLE(?????) M/V,0,0,0,0,0,10,0
17,18,Chinsoman Films,2014-11-07 01:22:02,Please subscribe to me,PSY - GANGNAM STYLE(?????) M/V,1,0,0,0,1,22,1
18,19,railn j sander,2015-05-26 05:32:15.041,I guss this song is one of my worst fears in l...,Eminem - Love The Way You Lie ft. Rihanna,0,0,0,0,1,115,0
19,20,Joao Victor Canassa,2015-05-20 17:41:30.002,Charlie from Lost!,Eminem - Love The Way You Lie ft. Rihanna,0,0,0,0,2,18,0


In [24]:
# Load BERT Model
model = SentenceTransformer('all-mpnet-base-v2')

# Generate Embeddings for Training Data
content_embeddings = model.encode(texts.tolist(), show_progress_bar=True)
author_embeddings = model.encode(authors.tolist(), show_progress_bar=True)
video_name_embeddings = model.encode(video_names.tolist(), show_progress_bar=True)

# Combine embeddings and additional features (is_date_na, caps_count, content_length)
combined_embeddings = np.array([
    np.hstack((content, author, video,
               [train_data['is_date_na'].iloc[i],
                train_data['caps_count'].iloc[i],
                train_data['content_length'].iloc[i]],
               train_data['contains_youtube_link'].iloc[i],
               train_data['contains_external_link'].iloc[i],
               train_data['is_person_name'].iloc[i]
               ))
    for i, (content, author, video) in enumerate(zip(content_embeddings, author_embeddings, video_name_embeddings))
])


Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

In [25]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    combined_embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)

# Define Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('logistic', LogisticRegression(max_iter=1000, random_state=42)),
        ('svm', SVC(kernel='linear', C=1, probability=True, random_state=42)),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
    ],
    voting='soft'  # Use 'soft' for probabilities, 'hard' for majority vote
)

# Train Voting Classifier on Training Data
voting_clf.fit(X_train, y_train)

# Evaluate on Validation Data
y_val_pred = voting_clf.predict(X_val)
y_val_pred_proba = voting_clf.predict_proba(X_val)[:, 1]

# Calculate evaluation metrics
accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_val_pred_proba)
report = classification_report(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)

# Print results
print("Validation Accuracy:", accuracy)
print("Validation ROC AUC Score:", roc_auc)
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 0.9890510948905109
Validation ROC AUC Score: 0.9963188220230473

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       132
           1       1.00      0.98      0.99       142

    accuracy                           0.99       274
   macro avg       0.99      0.99      0.99       274
weighted avg       0.99      0.99      0.99       274

Confusion Matrix:
 [[132   0]
 [  3 139]]


Original code results

In [11]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    combined_embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)

# Define Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('logistic', LogisticRegression(max_iter=1000, random_state=42)),
        ('svm', SVC(kernel='linear', C=1, probability=True, random_state=42)),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
    ],
    voting='soft'  # Use 'soft' for probabilities, 'hard' for majority vote
)

# Train Voting Classifier on Training Data
voting_clf.fit(X_train, y_train)

# Evaluate on Validation Data
y_val_pred = voting_clf.predict(X_val)
y_val_pred_proba = voting_clf.predict_proba(X_val)[:, 1]

# Calculate evaluation metrics
accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_val_pred_proba)
report = classification_report(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)

# Print results
print("Validation Accuracy:", accuracy)
print("Validation ROC AUC Score:", roc_auc)
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 0.9708029197080292
Validation ROC AUC Score: 0.9934912505335041

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       132
           1       1.00      0.94      0.97       142

    accuracy                           0.97       274
   macro avg       0.97      0.97      0.97       274
weighted avg       0.97      0.97      0.97       274

Confusion Matrix:
 [[132   0]
 [  8 134]]


Test output

In [None]:

# Generate Embeddings for Testing Data
content_embeddings_test = model.encode(test_data['CONTENT'].tolist(), show_progress_bar=True)
author_embeddings_test = model.encode(authors_test.tolist(), show_progress_bar=True)
video_name_embeddings_test = model.encode(video_names_test.tolist(), show_progress_bar=True)

combined_embeddings_test = np.array([
        np.hstack((content, author, video, [test_data['is_date_na'].iloc[i]]))
            for i, (content, author, video) in enumerate(zip(content_embeddings_test, author_embeddings_test, video_name_embeddings_test))
            ])

# Define Voting Classifier
voting_clf = VotingClassifier(
            estimators=[
                        ('logistic', LogisticRegression(max_iter=1000, random_state=42)),
                        ('svm', SVC(kernel='linear', C=1, probability=True, random_state=42)),
                        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
                        ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
            ],
            voting='soft'  # Use 'soft' for probabilities, 'hard' for majority vote
            )

# Train Voting Classifier on Full Training Data
voting_clf.fit(combined_embeddings, labels)

# Predict Labels for Testing Data
test_predictions = voting_clf.predict(combined_embeddings_test)

# Output Predictions to CSV
output = pd.DataFrame({'COMMENT_ID': test_data['COMMENT_ID'], 'CLASS': test_predictions})
output.to_csv("test_predictions.csv", index=False)
print("Predictions saved to 'test_predictions.csv'")