Competition: https://www.kaggle.com/competitions/nlp-lab-dm23/overview

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from pandas.core.frame import DataFrame
import re
import numpy as np
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training CSV from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/ML_Lab2/train.csv')

In [None]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [None]:
# One dict per row. Later cells edit these dicts, so `df` itself is not changed by them.
df_dict = df.to_dict('records') # list of records (object dictionary)

The sample is balanced

In [None]:
# Count the rows per label to check the class balance.
df['Class Index'].value_counts()

3    30000
4    30000
2    30000
1    30000
Name: Class Index, dtype: int64

Append the title to the description

In [None]:
# Extend every record's description with its title (description first, then title).
# NOTE: the dicts are modified in place, so running this cell a second time
# appends the title once more.
for record in df_dict:
    record['Description'] = record['Description'] + ' ' + record['Title']

## Preprocessing

In [None]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import inflect
from tqdm import tqdm

# Fetch the NLTK resources requested by this notebook: the stop-word lists,
# the punkt tokenizer data, WordNet and the averaged-perceptron POS tagger.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
_TRASH_STOP_WORDS = None  # English stop-word set, built on first use


def _get_trash_stop_words():
    """Return the English stop-word set, reading the NLTK list only once."""
    global _TRASH_STOP_WORDS
    if _TRASH_STOP_WORDS is None:
        _TRASH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _TRASH_STOP_WORDS


def trash_removing(sentence):
    """Tokenize ``sentence`` and keep only lowercase alphabetic non-stop-words.

    Args:
        sentence: raw text.

    Returns:
        list of lowercased tokens. A token is kept when it is purely
        alphabetic (``str.isalpha``) and its lowercase form is not an English
        stop word. Punctuation and number tokens are therefore dropped too.
    """
    stop_words = _get_trash_stop_words()

    # Backslashes are replaced by spaces before tokenizing so that they
    # separate words instead of staying attached to them.
    word_tokens = word_tokenize(sentence.replace("\\", " ").strip())

    filtered_sentence = []
    for token in word_tokens:
        # isalpha() already rejects every punctuation token, so no separate
        # punctuation test is needed.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            filtered_sentence.append(lowered)

    return filtered_sentence

In [None]:
# Clean the combined text of every training record; the result keeps the
# order of `df_dict`, one token list per record.
filtered_text = [trash_removing(record['Description']) for record in tqdm(df_dict)]

100%|██████████| 120000/120000 [01:02<00:00, 1933.37it/s]


In [None]:
# Shared inflect engine; replace_nums() uses it to spell out digit tokens.
q = inflect.engine()

def lowercase_text(text):
  """Return ``text`` converted to lowercase."""
  lowered = text.lower()
  return lowered

def remove_nums(text):
  """Delete every run of digits from ``text``; all other characters are kept."""
  digit_run = re.compile(r'\d+')
  return digit_run.sub('', text)

def replace_nums(text):
  """Spell out the digit-only tokens of ``text``.

  The text is split on whitespace; tokens for which ``str.isdigit`` is true
  are converted to words with the module-level inflect engine ``q``, all
  other tokens are kept unchanged. The tokens are then joined with single
  spaces, so runs of whitespace in the input are collapsed.
  """
  return ' '.join(
      q.number_to_words(token) if token.isdigit() else token
      for token in text.split()
  )

def remove_punct(text):
  """Strip every character listed in ``string.punctuation`` from ``text``."""
  # A mapping of character -> None tells str.translate to delete that character.
  drop_table = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(drop_table)

_ENGLISH_STOP_WORDS = None  # English stop-word set, built on first use


def remove_stopwords(text):
  """Tokenize ``text`` and drop English stop words.

  Args:
    text: string to tokenize. Tokens are compared to the stop-word list
      exactly as they appear, so callers should lowercase the text first
      if case-insensitive matching is wanted.

  Returns:
    list of the remaining tokens, in their original order.
  """
  global _ENGLISH_STOP_WORDS
  if _ENGLISH_STOP_WORDS is None:
    # Build the set once instead of on every call.
    _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
  return [word for word in word_tokenize(text) if word not in _ENGLISH_STOP_WORDS]

def full_preprocessing_pipeline(df: DataFrame):
  """Lowercase, strip digits and punctuation, then tokenize and drop stop words.

  Args:
    df: frame whose third column (by position) holds the text to clean.
      NOTE(review): which named column that is depends on how the CSV was
      loaded; confirm it is the intended text column.

  Returns:
    list of token lists, one per row of ``df``, in row order.
  """
  # .iloc makes the positional lookup explicit; using an integer key on a
  # label-indexed row (``row[2]``) is deprecated in recent pandas versions.
  texts = df.iloc[:, 2]
  return [
      remove_stopwords(remove_punct(remove_nums(lowercase_text(text))))
      for text in tqdm(texts, total=len(df))
  ]

In [None]:
# Run the alternative cleaning pipeline on the raw frame.
# NOTE(review): `preprocessed_texts` is not read by any later cell visible here; confirm it is still needed.
preprocessed_texts = full_preprocessing_pipeline(df)

120000it [01:10, 1697.39it/s]


## Stemming vs Lemmatization

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Single Porter stemmer instance used by stem_words().
stem1 = PorterStemmer()

### Stemming

In [None]:
def stem_words(texts):
  """Stem every token with the module-level Porter stemmer ``stem1``.

  ``texts`` is an iterable of token lists. The result is a list of lists with
  the same structure in which each token is replaced by its stem.
  """
  return [[stem1.stem(token) for token in tokens] for tokens in tqdm(texts)]

### Lemmatizing

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own (a one-element list is passed to
    ``nltk.pos_tag``), and the first letter of the resulting tag selects
    adjective (J), verb (V) or adverb (R). Every other tag, including N,
    maps to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

# Single WordNet lemmatizer instance used by lemmatize_words().
lemmatizer = WordNetLemmatizer()


def lemmatize_words(texts):
  """Lemmatize every token of every text with the WordNet lemmatizer.

  Args:
    texts: iterable of token lists.

  Returns:
    list of lists with the same structure, each token replaced by the lemma
    obtained with the part of speech from ``get_wordnet_pos``.
  """
  # get_wordnet_pos() tags each word in isolation, so the lemma depends only
  # on the word itself. Each distinct word is tagged and lemmatized once and
  # the result is reused for all later occurrences.
  lemma_by_word = {}
  lemmatized_text = []
  for text in tqdm(texts):
    lemmas = []
    for w in text:
      if w not in lemma_by_word:
        lemma_by_word[w] = lemmatizer.lemmatize(w, get_wordnet_pos(w))
      lemmas.append(lemma_by_word[w])
    lemmatized_text.append(lemmas)
  return lemmatized_text

In [None]:
# Stemmed variant of the cleaned training tokens.
stemmed_text = stem_words(filtered_text)

100%|██████████| 120000/120000 [00:56<00:00, 2118.68it/s]


In [None]:
# Lemmatized variant of the cleaned training tokens.
lemmatized_text = lemmatize_words(filtered_text)

100%|██████████| 120000/120000 [06:18<00:00, 317.22it/s]


### CountVectorizer

In [None]:
def cv_vectorize(texts, return_vectorizer=False):
    """Build bag-of-words count features from tokenized texts.

    Args:
        texts: iterable of token lists; each list is joined with single
            spaces before vectorizing.
        return_vectorizer: when True, also return the fitted
            ``CountVectorizer`` so the same vocabulary can be applied to
            other texts with ``transform``.

    Returns:
        The matrix produced by ``fit_transform``, or the tuple
        ``(matrix, vectorizer)`` when ``return_vectorizer`` is True.
    """
    documents = [" ".join(tokens) for tokens in texts]
    vectorizer = CountVectorizer()
    data = vectorizer.fit_transform(documents)
    if return_vectorizer:
        return data, vectorizer
    return data


### TF-IDF

In [None]:
def tfidf_vectorize(texts, return_vectorizer=False):
    """Build TF-IDF features from tokenized texts.

    Args:
        texts: iterable of token lists; each list is joined with single
            spaces before vectorizing.
        return_vectorizer: when True, also return the fitted
            ``TfidfVectorizer`` so new texts (e.g. the test set) can be
            mapped into the same feature space with ``transform``.

    Returns:
        The matrix produced by ``fit_transform``, or the tuple
        ``(matrix, vectorizer)`` when ``return_vectorizer`` is True.
    """
    documents = [" ".join(tokens) for tokens in texts]
    vectorizer = TfidfVectorizer()
    data = vectorizer.fit_transform(documents)
    if return_vectorizer:
        return data, vectorizer
    return data

### Fasttext

In [None]:
def fasttext_vectorize(texts):
    """Train a skip-gram FastText model on ``texts`` and embed every text.

    Args:
        texts: sequence of token lists. It is iterated more than once (for
            training and for embedding), so it must not be a one-shot iterator.

    Returns:
        np.ndarray with exactly one row per input text: the mean of the word
        vectors of its tokens. A text for which no word vector is available
        (for example an empty token list) gets an all-zero row, so the rows
        stay aligned with the label array.
    """
    embedding_dim = 100  # dimensionality of the word vectors
    window_size = 5  # context window size
    min_word_count = 1  # minimum word frequency for a word to be used by the model
    epochs = 10  # number of training epochs

    fasttext_model = FastText(sentences=texts, vector_size=embedding_dim, window=window_size, min_count=min_word_count, sg=1, epochs=epochs)

    # Vector lookup table of the trained model.
    word_vectors = fasttext_model.wv

    X = []

    for tokens in texts:
        # Collect the vector of every token the model can represent.
        vectors = [word_vectors[word] for word in tokens if word in word_vectors]
        if vectors:
            # Average the word vectors to get a single vector for the text.
            X.append(np.mean(vectors, axis=0))
        else:
            # Previously such texts were skipped, which shifted every later
            # row relative to its label. float32 is used so the zero row does
            # not upcast the whole array (gensim word vectors are float32).
            X.append(np.zeros(embedding_dim, dtype=np.float32))

    return np.array(X)

### Word2Vec

In [None]:
def vectorize(model, sentence):
    """Average the word vectors of ``sentence`` under ``model``.

    Args:
        model: object exposing ``wv``, which must support ``word in wv``,
            ``wv[word]`` and ``wv.vector_size`` (e.g. a trained gensim
            Word2Vec model).
        sentence: iterable of tokens.

    Returns:
        1-D array: the mean of the vectors of the tokens found in
        ``model.wv``, or a zero vector of length ``model.wv.vector_size``
        when none of the tokens is known.
    """
    known_vecs = [model.wv[word] for word in sentence if word in model.wv]
    if not known_vecs:
        # Use the model's own dimensionality instead of assuming 100, so the
        # fallback row always has the same width as the averaged rows.
        return np.zeros(model.wv.vector_size)
    return np.array(known_vecs).mean(axis=0)

def word2vec_vectorize(sentences):
    """Train Word2Vec on ``sentences`` and return one averaged vector per sentence.

    The model is trained with 100-dimensional vectors, a window of 5 and a
    minimum word count of 5. Each sentence is then embedded with
    ``vectorize`` and the rows are stacked in input order.
    """
    w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)
    rows = []
    for tokens in sentences:
        rows.append(vectorize(w2v_model, tokens))
    return np.array(rows)

In [None]:
# Label vector in the same row order as `df`.
y = df['Class Index'].to_numpy()

## Try LogisticRegression and use it to select the best preprocessing combination

In [None]:
def logreg_pipeline(data, y, random_state=42):
    """Grid-search a logistic regression and report it on a stratified hold-out split.

    Args:
        data: feature matrix with one row per entry of ``y``.
        y: class labels.
        random_state: seed for the train/test split. With a fixed seed the
            same rows are held out on every call, so the different feature
            sets being compared are scored on identical test rows.

    Returns:
        Predictions of the best grid-search model for the 25% hold-out part.
        A classification report for those predictions is printed as a side
        effect.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, y, test_size=0.25, shuffle=True, stratify=y, random_state=random_state
    )

    lr = LogisticRegression(max_iter=1000)
    # Regularization strengths to try; the model is selected by micro-averaged F1.
    grid_values = {'C': [0.001,0.01,0.1,1,10,100,1000]}
    model_lr = GridSearchCV(lr, param_grid=grid_values, scoring='f1_micro', verbose=3)

    model_lr.fit(X_train, y_train)

    y_pred = model_lr.predict(X_test)
    print(classification_report(y_test, y_pred))

    return y_pred

### Full preprocessing, lemmatizing, CountVectorizer

In [None]:
# Bag-of-words counts over the lemmatized tokens, evaluated with the logistic-regression grid search.
data = cv_vectorize(lemmatized_text)
logreg_pipeline(data, y)

              precision    recall  f1-score   support

           1       0.91      0.90      0.90      7500
           2       0.96      0.97      0.96      7500
           3       0.87      0.87      0.87      7500
           4       0.87      0.88      0.88      7500

    accuracy                           0.90     30000
   macro avg       0.90      0.90      0.90     30000
weighted avg       0.90      0.90      0.90     30000



### Full preprocessing, stemming, CountVectorizer

In [None]:
# Bag-of-words counts over the stemmed tokens, evaluated with the logistic-regression grid search.
data = cv_vectorize(stemmed_text)
logreg_pipeline(data, y)

              precision    recall  f1-score   support

           1       0.91      0.89      0.90      7500
           2       0.95      0.97      0.96      7500
           3       0.86      0.86      0.86      7500
           4       0.87      0.87      0.87      7500

    accuracy                           0.90     30000
   macro avg       0.90      0.90      0.90     30000
weighted avg       0.90      0.90      0.90     30000



### Full preprocessing, lemmatizing, TF-IDF

In [None]:
# TF-IDF features over the lemmatized tokens, evaluated with the logistic-regression grid search.
data = tfidf_vectorize(lemmatized_text)
logreg_pipeline(data, y)

              precision    recall  f1-score   support

           1       0.93      0.89      0.91      7500
           2       0.95      0.98      0.97      7500
           3       0.88      0.88      0.88      7500
           4       0.88      0.89      0.89      7500

    accuracy                           0.91     30000
   macro avg       0.91      0.91      0.91     30000
weighted avg       0.91      0.91      0.91     30000



### Full preprocessing, stemming, TF-IDF

In [None]:
# TF-IDF features over the stemmed tokens, evaluated with the logistic-regression grid search.
data = tfidf_vectorize(stemmed_text)
logreg_pipeline(data, y)

              precision    recall  f1-score   support

           1       0.93      0.90      0.92      7500
           2       0.95      0.98      0.97      7500
           3       0.89      0.88      0.89      7500
           4       0.89      0.89      0.89      7500

    accuracy                           0.92     30000
   macro avg       0.92      0.92      0.92     30000
weighted avg       0.92      0.92      0.92     30000



### Full preprocessing, lemmatizing, Fasttext

In [None]:
# Averaged FastText vectors of the lemmatized tokens, evaluated with the logistic-regression grid search.
data = fasttext_vectorize(lemmatized_text)
logreg_pipeline(data, y)

              precision    recall  f1-score   support

           1       0.91      0.89      0.90      7500
           2       0.95      0.97      0.96      7500
           3       0.85      0.85      0.85      7500
           4       0.86      0.86      0.86      7500

    accuracy                           0.89     30000
   macro avg       0.89      0.89      0.89     30000
weighted avg       0.89      0.89      0.89     30000



### Full preprocessing, stemming, Fasttext

In [None]:
# Averaged FastText vectors of the stemmed tokens, evaluated with the logistic-regression grid search.
data = fasttext_vectorize(stemmed_text)
logreg_pipeline(data, y)

              precision    recall  f1-score   support

           1       0.91      0.89      0.90      7500
           2       0.95      0.97      0.96      7500
           3       0.85      0.86      0.85      7500
           4       0.87      0.85      0.86      7500

    accuracy                           0.89     30000
   macro avg       0.89      0.89      0.89     30000
weighted avg       0.89      0.89      0.89     30000



### Full preprocessing, lemmatizing, w2v

In [None]:
# Averaged Word2Vec vectors of the lemmatized tokens, evaluated with the logistic-regression grid search.
data = word2vec_vectorize(lemmatized_text)
logreg_pipeline(data, y)

              precision    recall  f1-score   support

           1       0.90      0.87      0.88      7500
           2       0.94      0.96      0.95      7500
           3       0.84      0.84      0.84      7500
           4       0.85      0.84      0.85      7500

    accuracy                           0.88     30000
   macro avg       0.88      0.88      0.88     30000
weighted avg       0.88      0.88      0.88     30000



### Full preprocessing, stemming, w2v

In [None]:
# Averaged Word2Vec vectors of the stemmed tokens, evaluated with the logistic-regression grid search.
data = word2vec_vectorize(stemmed_text)
logreg_pipeline(data, y)

              precision    recall  f1-score   support

           1       0.90      0.88      0.89      7500
           2       0.94      0.96      0.95      7500
           3       0.85      0.84      0.85      7500
           4       0.85      0.85      0.85      7500

    accuracy                           0.88     30000
   macro avg       0.88      0.88      0.88     30000
weighted avg       0.88      0.88      0.88     30000



## Try CatBoost

In [None]:
!pip install catboost



In [None]:
from catboost import Pool, CatBoostClassifier

In [None]:
def catboost_pipeline(data, y, random_state=42, val_size=0.1):
    """Train a CatBoost multi-class model and report it on a stratified hold-out split.

    Args:
        data: feature matrix with one row per entry of ``y``.
        y: class labels.
        random_state: seed used for both splits and passed to CatBoost as
            ``random_seed``.
        val_size: fraction of the training part that is set aside as the
            evaluation set used by ``use_best_model``.

    Returns:
        None. Prints the classification report for the 25% hold-out part and
        the best scores recorded by CatBoost during training (those are
        measured on the validation part, not on the hold-out part).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, y, test_size=0.25, shuffle=True, stratify=y, random_state=random_state
    )
    # The best iteration is chosen on a validation set carved out of the
    # training part. Choosing it on the hold-out rows would tune the model on
    # the very rows it is reported on and make the report optimistic.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=val_size, shuffle=True, stratify=y_train,
        random_state=random_state
    )

    train_dataset = Pool(data=X_fit, label=y_fit)
    eval_dataset = Pool(data=X_val, label=y_val)

    model = CatBoostClassifier(iterations=1000, depth=9,
                                l2_leaf_reg=7, learning_rate=0.1,
                                loss_function='MultiClass', custom_metric='TotalF1',
                                task_type="GPU", devices='0:1',
                                random_seed=random_state)

    model.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)

    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))
    print(model.get_best_score())


### CatBoost, lemmatizing, TF-IDF

In [None]:
# TF-IDF features over the lemmatized tokens, evaluated with the CatBoost pipeline.
data = tfidf_vectorize(lemmatized_text)
catboost_pipeline(data, y)

### CatBoost, stemming, TF-IDF

In [None]:
# TF-IDF features over the stemmed tokens, evaluated with the CatBoost pipeline.
data = tfidf_vectorize(stemmed_text)
catboost_pipeline(data, y)

### Best model

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Final model: logistic regression on TF-IDF features of the stemmed tokens.
# The features are rebuilt here so this cell does not depend on whichever
# experiment cell happened to assign `data` last.
data = tfidf_vectorize(stemmed_text)
# A fixed seed makes the reported hold-out score reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.25, shuffle=True, stratify=y, random_state=42)

lr = LogisticRegression(max_iter=1000, C=0.9)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print(f1_score(y_test, y_pred, average='micro'))

0.9126666666666666


## Test dataset

In [None]:
# Load the test CSV; its first column becomes the index.
df_test = pd.read_csv('/content/drive/MyDrive/ML_Lab2/test.csv', index_col=0)

In [None]:
# Preview the first test rows.
df_test.head()

Unnamed: 0_level_0,Title,Description
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [None]:
# One dict per test row (the index is not included in the records).
df_test_dict = df_test.to_dict('records')

In [None]:
# Same text construction as for the training records: description first, then title.
# NOTE: the dicts are modified in place, so running this cell a second time
# appends the title once more.
for record in df_test_dict:
    record['Description'] = record['Description'] + ' ' + record['Title']

In [None]:
# Clean the combined text of every test record; the result keeps the order
# of `df_test_dict`, one token list per record.
filtered_test_text = [trash_removing(record['Description']) for record in tqdm(df_test_dict)]

100%|██████████| 7600/7600 [00:03<00:00, 1983.70it/s]


In [None]:
# Lemmatized variant of the cleaned test tokens.
lemmatized_test_text = lemmatize_words(filtered_test_text)

100%|██████████| 7600/7600 [00:23<00:00, 320.47it/s]


In [None]:
# Stemmed variant of the cleaned test tokens (token lists at this point).
stemmed_test_text = stem_words(filtered_test_text)

100%|██████████| 7600/7600 [00:03<00:00, 2388.26it/s]


In [None]:
# Fit a TF-IDF vectorizer on the stemmed training texts so the test texts can
# be transformed with it. It is built the same way as inside tfidf_vectorize():
# default settings, token lists joined with single spaces.
# NOTE(review): `lr` is presumably trained on tfidf_vectorize(stemmed_text);
# this re-fit must yield the same feature columns for its predictions to be valid. Confirm.
texts = [" ".join(t) for t in stemmed_text]
vectorizer = TfidfVectorizer()
vectorizer.fit(texts)

In [None]:
# Join each token list into one space-separated string for the vectorizer.
# Entries that are already strings are left as they are, so re-running this
# cell is harmless (joining a string would put a space between every character).
stemmed_test_text = [t if isinstance(t, str) else " ".join(t) for t in stemmed_test_text]

In [None]:
# Map the test texts into the feature space of the vectorizer fitted on the training texts.
data_test = vectorizer.transform(stemmed_test_text)

In [None]:
# Predict test labels with the final logistic-regression model.
# Note that this reuses the name `y_pred`, replacing the hold-out predictions.
y_pred = lr.predict(data_test)

In [None]:
# Sanity check: there should be one prediction per test row.
y_pred.shape

(7600,)

In [None]:
# Take the IDs from the test file's index (the CSV was loaded with index_col=0)
# instead of assuming exactly 7600 sequentially numbered rows.
df_submit = pd.DataFrame({'ID': df_test.index.to_numpy(), 'Class Index': y_pred})

In [None]:
# Write the submission file to Drive without the frame's own index column.
df_submit.to_csv('/content/drive/MyDrive/ML_Lab2/submit.csv', index=False)