In [1]:
!pip install catboost

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
import lightgbm as lgb
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')
nltk.download('punkt')



Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Load data

from google.colab import drive
drive.mount('/content/drive')

data_path = "/content/drive/My Drive/DSC 258R/"
df_train = pd.read_csv(data_path + "train.csv")
df_test = pd.read_csv(data_path + "test.csv")

Mounted at /content/drive


In [3]:
# Preprocess text data
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
    words = word_tokenize(text.lower())
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df_train['text'] = df_train['review'].apply(preprocess_text)
df_test['text'] = df_test['review'].apply(preprocess_text)

In [4]:
# Define feature columns
text_features = 'text'
numerical_features = df_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df_train.select_dtypes(include=['object']).columns.tolist()
categorical_features.remove('review')
categorical_features.remove('text')
categorical_features.remove('label')


In [5]:
# Define preprocessor
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('svd', TruncatedSVD(n_components=300, random_state=42))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('text', text_transformer, 'text')
    ])



In [16]:
# Define individual models
lgbm = lgb.LGBMClassifier(n_estimators=200, random_state=42)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
xgb = XGBClassifier(n_estimators=200, random_state=42)
catboost = CatBoostClassifier(n_estimators=200, random_state=42, verbose=0)
lr = LogisticRegression(max_iter=1000, random_state=42)

In [70]:
# Define Voting Classifier
ensemble = VotingClassifier(estimators=[
    ('xgb', xgb),
    ('catboost', catboost),
    ('lgbm', lgbm)
], voting='soft')

In [71]:
# Define model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', ensemble)
])

In [72]:
# Split data
X = df_train.drop(columns=['label'])
y = df_train['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [73]:
# Train model
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79421
[LightGBM] [Info] Number of data points in the train set: 10515, number of used features: 1250
[LightGBM] [Info] Start training from score -2.241261
[LightGBM] [Info] Start training from score -1.576694
[LightGBM] [Info] Start training from score -3.622203
[LightGBM] [Info] Start training from score -3.307315
[LightGBM] [Info] Start training from score -2.038722
[LightGBM] [Info] Start training from score -1.897913
[LightGBM] [Info] Start training from score -2.496673
[LightGBM] [Info] Start training from score -2.890657
[LightGBM] [Info] Start training from score -1.776189
[LightGBM] [Info] Start training from score -3.312523


In [74]:
# Validate model
y_pred = model.predict(X_val)
print(f'Validation F1 Score: {f1_score(y_val, y_pred, average="weighted")}')

Validation F1 Score: 0.7921033547351295


In [75]:
# Predict on test set
X_test = df_test
y_test_pred = model.predict(X_test)

In [76]:
# Save predictions
output = pd.DataFrame({'Id': df_test['id'], 'Predicted': y_test_pred})
output.to_csv(data_path + "predicted.csv", index=False)