In [17]:
import json
import os
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
# Fetch the tokenizer models that word_tokenize needs. Newer NLTK releases
# look for 'punkt_tab' rather than the older 'punkt' package, so request both;
# a package that is already installed is just reported as up-to-date.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abiwaqasyasir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Function to analyze the data
def analyze_data(file_path):
    """Print summary statistics for a JSON-lines dataset.

    Reads one JSON object per non-blank line of ``file_path`` and prints the
    number of samples, the column names, descriptive statistics of
    ``len(postText)``, the ten most common lowercase whitespace-separated
    words found in the first ``postText`` entry of each record, and the
    distribution of the ``tags`` values.

    Parameters
    ----------
    file_path : str or path-like
        Path to the ``.jsonl`` file.

    Returns
    -------
    None
        All results are reported through ``print``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform default, and skip blank lines (e.g. a trailing newline),
    # which json.loads would reject.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)
    print(f"Total samples: {len(df)}")
    if df.empty:
        # No records means no columns to describe.
        return

    print("\nColumns in the dataset:")
    print(df.columns)

    print("\nSample sizes:")
    print(df['postText'].apply(len).describe())

    print("\nMost common words in postText:")
    # `if post` skips records whose postText is empty, where post[0] would fail.
    all_words = [word for post in df['postText'] if post for word in post[0].lower().split()]
    print(Counter(all_words).most_common(10))

    print("\nSpoiler types distribution:")
    if 'tags' in df.columns:
        # The tags appear to be stored as lists (preprocess_data reads
        # item['tags'][0]); lists are unhashable, so flatten them to one
        # label per row before counting.
        print(df['tags'].explode().value_counts())
    else:
        print("No 'tags' column found (unlabelled data).")


In [4]:
# Function to preprocess text
def preprocess_text(text):
    """Normalise a string for vectorisation.

    Lowercases ``text``, deletes every character that is not an ASCII letter,
    a digit, whitespace or ``?``, tokenises what is left with
    ``word_tokenize`` and returns the tokens joined by single spaces.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s?]', '', text.lower())
    return ' '.join(word_tokenize(cleaned))

In [22]:
# Function to preprocess data
def preprocess_data(file_path):
    """Load a JSON-lines file and return one cleaned record per line.

    Parameters
    ----------
    file_path : str or path-like
        Path to a file holding one JSON object per line.

    Returns
    -------
    list of dict
        One dict per record with the keys ``id``, ``postText``,
        ``targetTitle``, ``targetParagraphs`` (text passed through
        ``preprocess_text``; paragraphs joined with single spaces) and
        ``spoilerType`` (first entry of ``tags``, or ``None`` when the record
        carries no tags).
    """
    processed_data = []
    # Explicit UTF-8 keeps decoding independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not JSON.
                continue
            item = json.loads(line)

            # Prefer 'id'; fall back to 'postId' only when 'id' is missing,
            # null or empty. A plain `or` would also discard a valid id of 0.
            record_id = item.get('id')
            if record_id is None or record_id == '':
                record_id = item.get('postId')

            tags = item.get('tags')
            processed_data.append({
                'id': record_id,
                'postText': preprocess_text(item['postText'][0]),
                'targetTitle': preprocess_text(item['targetTitle']),
                'targetParagraphs': ' '.join(preprocess_text(p) for p in item['targetParagraphs']),
                # Handle missing (test data), null or empty 'tags'.
                'spoilerType': tags[0] if tags else None,
            })
    return processed_data

In [23]:
# Function to engineer features
def engineer_features(data):
    """Add length and overlap features to every record, in place.

    For each dict in ``data`` this sets ``postLength``, ``titleLength`` and
    ``paragraphLength`` (whitespace-token counts of ``postText``,
    ``targetTitle`` and ``targetParagraphs``) and ``titleInPost`` (1 when the
    non-empty title occurs, ignoring case, as a substring of the post text,
    otherwise 0).

    Returns the same list object that was passed in.
    """
    for item in data:
        item['postLength'] = len(item['postText'].split())
        item['titleLength'] = len(item['targetTitle'].split())
        item['paragraphLength'] = len(item['targetParagraphs'].split())

        title = item['targetTitle'].lower()
        post = item['postText'].lower()
        # '' is a substring of every string, so without the emptiness check a
        # record with no title would be flagged as "title found in post".
        item['titleInPost'] = int(bool(title.strip()) and title in post)
    return data

In [24]:
# Function to combine TF-IDF vectors with additional features
def combine_features(tfidf_vectors, features):
    """Append extra feature columns to the right of a TF-IDF matrix.

    Parameters
    ----------
    tfidf_vectors : scipy sparse matrix
        The vectorizer output, one row per sample.
    features : array-like
        Extra numeric features, one row per sample (the callers in this
        notebook pass a list of equal-length lists).

    Returns
    -------
    scipy sparse matrix in CSR format
        ``tfidf_vectors`` with the columns of ``features`` stacked after it.
    """
    feature_array = np.asarray(features)
    # Request CSR explicitly: by default hstack yields a COO matrix, which
    # supports neither row indexing nor slicing.
    return hstack([tfidf_vectors, feature_array], format='csr')

In [25]:
# Load and preprocess data
# The dataset directory can be overridden through the MSCI_PROJECT_DATA_DIR
# environment variable; the default is the original local path, so behaviour
# is unchanged when the variable is not set.
DATA_DIR = os.environ.get(
    "MSCI_PROJECT_DATA_DIR",
    "/Users/abiwaqasyasir/Desktop/UWaterloo_Academics/TERM_3/MSCI_641/MSCI_Project/Dataset",
)
train_file_path = os.path.join(DATA_DIR, "train.jsonl")
val_file_path = os.path.join(DATA_DIR, "val.jsonl")


In [26]:
# Run both splits through the same chain: clean the text, then add the
# hand-crafted features.
train_data = engineer_features(preprocess_data(train_file_path))

val_data = engineer_features(preprocess_data(val_file_path))

In [27]:
# Prepare text data for vectorization.
# Short variant: post text followed by the article title.
x_train = [' '.join((item['postText'], item['targetTitle'])) for item in train_data]
x_val = [' '.join((item['postText'], item['targetTitle'])) for item in val_data]

# Full variant: post text, title and the article paragraphs.
x_train_texts = [
    ' '.join((item['postText'], item['targetTitle'], item['targetParagraphs']))
    for item in train_data
]
x_val_texts = [
    ' '.join((item['postText'], item['targetTitle'], item['targetParagraphs']))
    for item in val_data
]

# Labels for each split.
y_train = [item['spoilerType'] for item in train_data]
y_val = [item['spoilerType'] for item in val_data]


In [28]:
# Vectorize text data
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Include bi-grams
    max_features=5000,
)
# Fit on the training texts only; the validation texts are transformed with
# the already-fitted vectorizer.
x_train_vec = vectorizer.fit_transform(x_train_texts)
x_val_vec = vectorizer.transform(x_val_texts)

In [29]:
# Prepare additional features
# Column order of the hand-crafted feature rows (identical for both splits).
_extra_keys = ('postLength', 'titleLength', 'paragraphLength', 'titleInPost')
train_features = [[item[key] for key in _extra_keys] for item in train_data]
val_features = [[item[key] for key in _extra_keys] for item in val_data]

In [30]:
# Combine TF-IDF vectors with additional features
x_train_combined = combine_features(tfidf_vectors=x_train_vec, features=train_features)
x_val_combined = combine_features(tfidf_vectors=x_val_vec, features=val_features)

In [14]:
# Train Naive Bayes classifier with combined features
# (TF-IDF columns plus the length counts and the title-in-post flag).
clf = MultinomialNB()
clf.fit(X=x_train_combined, y=y_train)

In [15]:
# Predict and evaluate
y_pred = clf.predict(x_val_combined)
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an UndefinedMetricWarning for every affected metric.
print(classification_report(y_val, y_pred, zero_division=0))

              precision    recall  f1-score   support

       multi       0.00      0.00      0.00        84
     passage       0.57      0.52      0.54       154
      phrase       0.51      0.81      0.63       162

    accuracy                           0.53       400
   macro avg       0.36      0.44      0.39       400
weighted avg       0.43      0.53      0.46       400



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
# Train SVM classifier with combined features and class weights
# NOTE: this rebinds `clf`, replacing the Naive Bayes model trained above.
clf = SVC(class_weight='balanced')
clf.fit(X=x_train_combined, y=y_train)

In [19]:
# Predict and evaluate
# Uses whichever model `clf` is currently bound to.
y_pred = clf.predict(x_val_combined)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

       multi       0.34      0.19      0.24        84
     passage       0.50      0.28      0.36       154
      phrase       0.46      0.76      0.57       162

    accuracy                           0.46       400
   macro avg       0.43      0.41      0.39       400
weighted avg       0.45      0.46      0.42       400



In [32]:
# Predict and evaluate again, now with the target-paragraph text and bi-grams
# included in the TF-IDF features.
y_pred = clf.predict(x_val_combined)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

       multi       0.30      0.49      0.37        84
     passage       0.29      0.01      0.02       154
      phrase       0.47      0.74      0.57       162

    accuracy                           0.41       400
   macro avg       0.35      0.41      0.32       400
weighted avg       0.36      0.41      0.32       400



Implementation using Random Forests with Grid Search for hyperparameter tuning

In [33]:
import json
import re
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from scipy.sparse import hstack
import numpy as np
import pandas as pd
from collections import Counter

In [34]:
def analyze_data(file_path):
    """Print summary statistics for a JSON-lines dataset.

    Reads one JSON object per non-blank line of ``file_path`` and prints the
    number of samples, the column names, descriptive statistics of
    ``len(postText)``, the ten most common lowercase whitespace-separated
    words found in the first ``postText`` entry of each record, and the
    distribution of the ``tags`` values.

    Parameters
    ----------
    file_path : str or path-like
        Path to the ``.jsonl`` file.

    Returns
    -------
    None
        All results are reported through ``print``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform default, and skip blank lines (e.g. a trailing newline),
    # which json.loads would reject.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)
    print(f"Total samples: {len(df)}")
    if df.empty:
        # No records means no columns to describe.
        return

    print("\nColumns in the dataset:")
    print(df.columns)

    print("\nSample sizes:")
    print(df['postText'].apply(len).describe())

    print("\nMost common words in postText:")
    # `if post` skips records whose postText is empty, where post[0] would fail.
    all_words = [word for post in df['postText'] if post for word in post[0].lower().split()]
    print(Counter(all_words).most_common(10))

    print("\nSpoiler types distribution:")
    if 'tags' in df.columns:
        # The tags appear to be stored as lists (preprocess_data reads
        # item['tags'][0]); lists are unhashable, so flatten them to one
        # label per row before counting.
        print(df['tags'].explode().value_counts())
    else:
        print("No 'tags' column found (unlabelled data).")


In [35]:
# Function to preprocess text
def preprocess_text(text):
    """Return ``text`` lowercased, stripped of unwanted characters and re-tokenised.

    Every character other than ASCII letters, digits, whitespace and ``?`` is
    removed; the remainder is split with ``word_tokenize`` and the tokens are
    joined back together with single spaces.
    """
    lowered = text.lower()
    stripped = re.sub(r'[^a-zA-Z0-9\s?]', '', lowered)
    return ' '.join(word_tokenize(stripped))

# Function to preprocess data
def preprocess_data(file_path):
    """Load a JSON-lines file and return one cleaned record per line.

    Parameters
    ----------
    file_path : str or path-like
        Path to a file holding one JSON object per line.

    Returns
    -------
    list of dict
        One dict per record with the keys ``id``, ``postText``,
        ``targetTitle``, ``targetParagraphs`` (text passed through
        ``preprocess_text``; paragraphs joined with single spaces) and
        ``spoilerType`` (first entry of ``tags``, or ``None`` when the record
        carries no tags).
    """
    processed_data = []
    # Explicit UTF-8 keeps decoding independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not JSON.
                continue
            item = json.loads(line)

            # Prefer 'id'; fall back to 'postId' only when 'id' is missing,
            # null or empty. A plain `or` would also discard a valid id of 0.
            record_id = item.get('id')
            if record_id is None or record_id == '':
                record_id = item.get('postId')

            tags = item.get('tags')
            processed_data.append({
                'id': record_id,
                'postText': preprocess_text(item['postText'][0]),
                'targetTitle': preprocess_text(item['targetTitle']),
                'targetParagraphs': ' '.join(preprocess_text(p) for p in item['targetParagraphs']),
                # Handle missing (test data), null or empty 'tags'.
                'spoilerType': tags[0] if tags else None,
            })
    return processed_data

In [36]:
# Function to engineer features
def engineer_features(data):
    """Add length and overlap features to every record, in place.

    For each dict in ``data`` this sets ``postLength``, ``titleLength`` and
    ``paragraphLength`` (whitespace-token counts of ``postText``,
    ``targetTitle`` and ``targetParagraphs``) and ``titleInPost`` (1 when the
    non-empty title occurs, ignoring case, as a substring of the post text,
    otherwise 0).

    Returns the same list object that was passed in.
    """
    for item in data:
        item['postLength'] = len(item['postText'].split())
        item['titleLength'] = len(item['targetTitle'].split())
        item['paragraphLength'] = len(item['targetParagraphs'].split())

        title = item['targetTitle'].lower()
        post = item['postText'].lower()
        # '' is a substring of every string, so without the emptiness check a
        # record with no title would be flagged as "title found in post".
        item['titleInPost'] = int(bool(title.strip()) and title in post)
    return data

# Function to combine TF-IDF vectors with additional features
def combine_features(tfidf_vectors, features):
    """Append extra feature columns to the right of a TF-IDF matrix.

    Parameters
    ----------
    tfidf_vectors : scipy sparse matrix
        The vectorizer output, one row per sample.
    features : array-like
        Extra numeric features, one row per sample (the callers in this
        notebook pass a list of equal-length lists).

    Returns
    -------
    scipy sparse matrix in CSR format
        ``tfidf_vectors`` with the columns of ``features`` stacked after it.
    """
    feature_array = np.asarray(features)
    # Request CSR explicitly: by default hstack yields a COO matrix, which
    # supports neither row indexing nor slicing.
    return hstack([tfidf_vectors, feature_array], format='csr')


In [37]:
# Load and preprocess data
# The dataset directory can be overridden through the MSCI_PROJECT_DATA_DIR
# environment variable; the default is the original local path, so behaviour
# is unchanged when the variable is not set.
DATA_DIR = os.environ.get(
    "MSCI_PROJECT_DATA_DIR",
    "/Users/abiwaqasyasir/Desktop/UWaterloo_Academics/TERM_3/MSCI_641/MSCI_Project/Dataset",
)
train_file_path = os.path.join(DATA_DIR, "train.jsonl")
val_file_path = os.path.join(DATA_DIR, "val.jsonl")

train_data = preprocess_data(train_file_path)
train_data = engineer_features(train_data)

val_data = preprocess_data(val_file_path)
val_data = engineer_features(val_data)

# Prepare text data for vectorization: post text, title and article
# paragraphs concatenated into one string per sample.
x_train_texts = [item['postText'] + ' ' + item['targetTitle'] + ' ' + item['targetParagraphs'] for item in train_data]
y_train = [item['spoilerType'] for item in train_data]

x_val_texts = [item['postText'] + ' ' + item['targetTitle'] + ' ' + item['targetParagraphs'] for item in val_data]
y_val = [item['spoilerType'] for item in val_data]


In [38]:
# Vectorize text data
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Include bi-grams
    max_features=5000,
)
# Fit on the training texts only; the validation texts reuse that vocabulary.
x_train_vec = vectorizer.fit_transform(x_train_texts)
x_val_vec = vectorizer.transform(x_val_texts)

# Prepare additional features (same column order for both splits)
_extra_keys = ('postLength', 'titleLength', 'paragraphLength', 'titleInPost')
train_features = [[item[key] for key in _extra_keys] for item in train_data]
val_features = [[item[key] for key in _extra_keys] for item in val_data]

# Combine TF-IDF vectors with additional features
x_train_combined = combine_features(tfidf_vectors=x_train_vec, features=train_features)
x_val_combined = combine_features(tfidf_vectors=x_val_vec, features=val_features)


In [39]:
# Define the parameter grid for Grid Search
# 3 values for each of 4 hyperparameters -> 81 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [40]:
# Initialize Random Forest Classifier
rf = RandomForestClassifier(
    random_state=42,           # fixed seed so repeated runs build the same forest
    class_weight='balanced',
)


In [41]:
# Perform Grid Search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,        # 3 folds per candidate
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X=x_train_combined, y=y_train)


Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [42]:
# Get the best estimator
# best_estimator_ is the model GridSearchCV exposes for the winning parameter
# combination; it is available here because `refit` was left at its default
# in the GridSearchCV call.
best_rf = grid_search.best_estimator_

In [43]:
# Predict and evaluate
# Score the tuned random forest on the validation split.
y_pred = best_rf.predict(x_val_combined)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

       multi       0.68      0.32      0.44        84
     passage       0.47      0.52      0.49       154
      phrase       0.52      0.61      0.56       162

    accuracy                           0.52       400
   macro avg       0.56      0.48      0.50       400
weighted avg       0.53      0.52      0.51       400



First submission

In [50]:
import pandas as pd
import json
import re
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
import numpy as np

# Fetch the tokenizer models that word_tokenize needs. Newer NLTK releases
# look for 'punkt_tab' rather than the older 'punkt' package, so request both;
# a package that is already installed is just reported as up-to-date.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

# Function to preprocess text
def preprocess_text(text):
    """Clean ``text`` and return it as a space-separated token string.

    Steps: lowercase; drop everything except ASCII letters, digits,
    whitespace and ``?``; tokenise with ``word_tokenize``; join the tokens
    with single spaces.
    """
    normalised = re.sub(r'[^a-zA-Z0-9\s?]', '', text.lower())
    words = word_tokenize(normalised)
    return ' '.join(words)

# Function to preprocess data
def preprocess_data(file_path):
    """Load a JSON-lines file and return one cleaned record per line.

    Parameters
    ----------
    file_path : str or path-like
        Path to a file holding one JSON object per line.

    Returns
    -------
    list of dict
        One dict per record with the keys ``id``, ``postText``,
        ``targetTitle``, ``targetParagraphs`` (text passed through
        ``preprocess_text``; paragraphs joined with single spaces) and
        ``spoilerType`` (first entry of ``tags``, or ``None`` when the record
        carries no tags).
    """
    processed_data = []
    # Explicit UTF-8 keeps decoding independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not JSON.
                continue
            item = json.loads(line)

            # Prefer 'id'; fall back to 'postId' only when 'id' is missing,
            # null or empty. A plain `or` would also discard a valid id of 0,
            # which would corrupt the 'id' column of the submission file.
            record_id = item.get('id')
            if record_id is None or record_id == '':
                record_id = item.get('postId')

            tags = item.get('tags')
            processed_data.append({
                'id': record_id,
                'postText': preprocess_text(item['postText'][0]),
                'targetTitle': preprocess_text(item['targetTitle']),
                'targetParagraphs': ' '.join(preprocess_text(p) for p in item['targetParagraphs']),
                # Handle missing (test data), null or empty 'tags'.
                'spoilerType': tags[0] if tags else None,
            })
    return processed_data

# Function to engineer features
def engineer_features(data):
    """Add length and overlap features to every record, in place.

    For each dict in ``data`` this sets ``postLength``, ``titleLength`` and
    ``paragraphLength`` (whitespace-token counts of ``postText``,
    ``targetTitle`` and ``targetParagraphs``) and ``titleInPost`` (1 when the
    non-empty title occurs, ignoring case, as a substring of the post text,
    otherwise 0).

    Returns the same list object that was passed in.
    """
    for item in data:
        item['postLength'] = len(item['postText'].split())
        item['titleLength'] = len(item['targetTitle'].split())
        item['paragraphLength'] = len(item['targetParagraphs'].split())

        title = item['targetTitle'].lower()
        post = item['postText'].lower()
        # '' is a substring of every string, so without the emptiness check a
        # record with no title would be flagged as "title found in post".
        item['titleInPost'] = int(bool(title.strip()) and title in post)
    return data

# Function to combine TF-IDF vectors with additional features
def combine_features(tfidf_vectors, features):
    """Append extra feature columns to the right of a TF-IDF matrix.

    Parameters
    ----------
    tfidf_vectors : scipy sparse matrix
        The vectorizer output, one row per sample.
    features : array-like
        Extra numeric features, one row per sample (the callers in this
        notebook pass a list of equal-length lists).

    Returns
    -------
    scipy sparse matrix in CSR format
        ``tfidf_vectors`` with the columns of ``features`` stacked after it.
    """
    feature_array = np.asarray(features)
    # Request CSR explicitly: by default hstack yields a COO matrix, which
    # supports neither row indexing nor slicing.
    return hstack([tfidf_vectors, feature_array], format='csr')

# Load and preprocess training data
import os  # imported here because this cell carries its own imports

# Dataset and output locations default to the original local paths and can be
# redirected with the MSCI_PROJECT_DATA_DIR / MSCI_PROJECT_OUTPUT_DIR
# environment variables.
DATA_DIR = os.environ.get(
    "MSCI_PROJECT_DATA_DIR",
    "/Users/abiwaqasyasir/Desktop/UWaterloo_Academics/TERM_3/MSCI_641/MSCI_Project/Dataset",
)
OUTPUT_DIR = os.environ.get(
    "MSCI_PROJECT_OUTPUT_DIR",
    "/Users/abiwaqasyasir/Desktop/UWaterloo_Academics/TERM_3/MSCI_641/MSCI_Project",
)

train_file_path = os.path.join(DATA_DIR, "train.jsonl")
train_data = preprocess_data(train_file_path)
train_data = engineer_features(train_data)

# Prepare text data for vectorization
x_train_texts = [item['postText'] + ' ' + item['targetTitle'] + ' ' + item['targetParagraphs'] for item in train_data]

# Vectorize text data
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
x_train_vec = vectorizer.fit_transform(x_train_texts)  # Fit and transform training data

# Prepare additional features
train_features = [[item['postLength'], item['titleLength'], item['paragraphLength'], item['titleInPost']] for item in train_data]

# Combine TF-IDF vectors with additional features for the training set
x_train_combined = combine_features(x_train_vec, train_features)

# Train the RandomForestClassifier
# NOTE: this rebinds `best_rf` to a forest with fixed example hyperparameters;
# it does not reuse the grid-search result. random_state is pinned so the
# submission can be regenerated exactly.
best_rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)  # Example, replace with your actual model
best_rf.fit(x_train_combined, [item['spoilerType'] for item in train_data])


# Load and preprocess test data
test_file_path = os.path.join(DATA_DIR, "test.jsonl")
test_data = preprocess_data(test_file_path)
test_data = engineer_features(test_data)

# Prepare text data for vectorization
x_test_texts = [item['postText'] + ' ' + item['targetTitle'] + ' ' + item['targetParagraphs'] for item in test_data]

# Vectorize text data (reuse the vocabulary fitted on the training set)
x_test_vec = vectorizer.transform(x_test_texts)  # Transform test data

# Prepare additional features
test_features = [[item['postLength'], item['titleLength'], item['paragraphLength'], item['titleInPost']] for item in test_data]

# Combine TF-IDF vectors with additional features for the test set
x_test_combined = combine_features(x_test_vec, test_features)

# Predict on the test set
y_test_pred = best_rf.predict(x_test_combined)

# Prepare the submission DataFrame
submission_df = pd.DataFrame({
    'id': [item['id'] for item in test_data],
    'spoilerType': y_test_pred
})

# Save to CSV
submission_csv_path = os.path.join(OUTPUT_DIR, "task1_output.csv")
submission_df.to_csv(submission_csv_path, index=False)

print("Submission file saved to:", submission_csv_path)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abiwaqasyasir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Submission file saved to: /Users/abiwaqasyasir/Desktop/UWaterloo_Academics/TERM_3/MSCI_641/MSCI_Project/task1_output.csv
