# Import and Setup

In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
from tqdm import tqdm

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from imblearn.over_sampling import RandomOverSampler

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jirayuwat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
TRAIN_PATH = r"train_for_student.json"
TEST_PATH = r"test_for_student.json"

TRAIN_PATH = os.path.join(os.path.pardir, "data", TRAIN_PATH)
TEST_PATH = os.path.join(os.path.pardir, "data", TEST_PATH)

LABEL_LIST = ['CE','ENV','BME','PE','METAL','ME','EE','CPE','OPTIC','NANO','CHE','MATENG','AGRI','EDU','IE','SAFETY','MATH','MATSCI']

# Preprocess

In [3]:
# load df
train_data = json.load(open(TRAIN_PATH))

# make df
train_df = pd.DataFrame(train_data).transpose()

# create labels
train_df['label_id'] = 0
for idx, label in enumerate(LABEL_LIST):
    train_df[label] = train_df['Classes'].apply(lambda x: 1 if label in x else 0)
    train_df['label_id'] += 2**idx * train_df[label]
train_df.drop(columns=['Classes']+LABEL_LIST, inplace=True)

# upsampling
sampler = RandomOverSampler(sampling_strategy='auto', random_state=42,)
train_df_resampled, _ = sampler.fit_resample(train_df, train_df['label_id'])
train_df = train_df_resampled
# train_df = pd.concat([train_df]*4, ignore_index=True)

# create X, y
X = train_df.drop(columns=['label_id'])
y = train_df['label_id']

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42, stratify = y_train)

print(f"""
X_train: {X_train.shape}
X_val: {X_val.shape}
X_test: {X_test.shape}

y_train: {y_train.shape}
y_val: {y_val.shape}
y_test: {y_test.shape}
""".strip())

X_train: (5270, 2)
X_val: (1318, 2)
X_test: (1648, 2)

y_train: (5270,)
y_val: (1318,)
y_test: (1648,)


In [4]:
y_train[0]

3072

In [4]:
def text_preprocessing(text):

    # remove copyright, year, publisher
    text = re.sub(r'©', '', text)
    text = re.sub(r'\bCopyright\b', '', text)
    text = re.sub(r'\b\d{4}\b', '', text)
    text = re.sub(r'\bElsevier\b', '', text)
    text = re.sub(r'\bSpringer (International Publishing)\b', '', text)
    text = re.sub(r'\bIEEE\.\b', '', text)

    # remove dup spaces
    text = re.sub(r'\s+', ' ', text)

    return text

processor = TfidfVectorizer(ngram_range=(1, 3),
                         binary=True,
                         smooth_idf=False,
                         lowercase=True,
                         preprocessor=text_preprocessing,
                         stop_words='english'
                         )

In [5]:
X_train_tfidf = processor.fit_transform(X_train['Abstract'])
X_val_tfidf = processor.transform(X_val['Abstract'])
X_test_tfidf = processor.transform(X_test['Abstract'])

print(f"""
X_train_tfidf: {X_train_tfidf.shape}
X_val_tfidf: {X_val_tfidf.shape}
X_test_tfidf: {X_test_tfidf.shape}
""".strip())

X_train_tfidf: (5270, 96386)
X_val_tfidf: (1318, 96386)
X_test_tfidf: (1648, 96386)


# Model

In [6]:
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [7]:
print(classification_report(y_val, model.predict(X_val_tfidf)))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         5
          16       1.00      1.00      1.00         5
          34       1.00      1.00      1.00         5
          36       1.00      1.00      1.00         5
          41       1.00      1.00      1.00         4
          42       1.00      1.00      1.00         5
          48       1.00      1.00      1.00         5
          58       1.00      1.00      1.00         5
          64       1.00      0.60      0.75         5
          66       1.00      1.00      1.00         4
          68       1.00      1.00      1.00         4
          72       0.83      1.00      0.91         5
          74       1.00      1.00      1.00         4
          96       1.00      1.00      1.00         5
         128       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# accuracy
y_train_pred = model.predict(X_train_tfidf)
y_val_pred = model.predict(X_val_tfidf)
y_test_pred = model.predict(X_test_tfidf)

print(f"""
Train Accuracy: {accuracy_score(y_train, y_train_pred)}
Val Accuracy: {accuracy_score(y_val, y_val_pred)}
Test Accuracy: {accuracy_score(y_test, y_test_pred)}
""".strip())

Train Accuracy: 0.9996204933586338
Val Accuracy: 0.9916540212443096
Test Accuracy: 0.9945388349514563


# Submission

In [9]:
# load df
test_data = json.load(open(TEST_PATH))

# make df
test_df = pd.DataFrame(test_data).transpose()

# tf-idf
test_tfidf = processor.transform(test_df['Abstract'])

In [10]:
test_pred = model.predict(test_tfidf)

In [11]:
submission_df = pd.DataFrame(columns= LABEL_LIST,
                             index= [k for k in test_data.keys()])

for idx, pred in enumerate(test_pred):
    submission_df.loc[test_df.index[idx]] = [int(x) for x in list(f"{pred:018b}")][::-1]

submission_df.index.name = "id"
submission_df

Unnamed: 0_level_0,CE,ENV,BME,PE,METAL,ME,EE,CPE,OPTIC,NANO,CHE,MATENG,AGRI,EDU,IE,SAFETY,MATH,MATSCI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
001eval,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0
002eval,0,0,0,0,1,1,0,1,0,0,1,1,0,0,0,0,1,1
003eval,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
004eval,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
005eval,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147eval,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1
148eval,0,1,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0
149eval,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1
150eval,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1


In [12]:
submission_df.to_csv("submission.csv")

In [13]:
!kaggle competitions submit -c 2110446-data-science-2023-02 -f submission.csv -m "NB and TF-IDF"

100%|██████████████████████████████████████| 6.57k/6.57k [00:01<00:00, 4.15kB/s]
Successfully submitted to 2110446 Data Science and Data Engineering Tools