In [104]:
import re
import pymorphy2
import pandas as pd

import nltk
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ipuzanov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ipuzanov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ipuzanov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
train_df = pd.read_csv('./datasets/phi_che_bio/train.csv')
test_df = pd.read_csv('./datasets/phi_che_bio/test.csv')

In [3]:
train_df.head()

Unnamed: 0,Id,Comment,Topic
0,0x840,A few things. You might have negative- frequen...,Biology
1,0xbf0,Is it so hard to believe that there exist part...,Physics
2,0x1dfc,There are bees,Biology
3,0xc7e,I'm a medication technician. And that's alot o...,Biology
4,0xbba,Cesium is such a pretty metal.,Chemistry


In [4]:
test_df.head()

Unnamed: 0,Id,Comment,Topic
0,0x1aa9,Personally I have no idea what my IQ is. I’ve ...,Biology
1,0x25e,I'm skeptical. A heavier lid would be needed t...,Physics
2,0x1248,I think I have 100 cm of books on the subject....,Biology
3,0x2b9,Is chemistry hard in uni. Ive read somewhere t...,Chemistry
4,0x24af,"In addition to the other comment, you can crit...",Physics


In [5]:
morph = pymorphy2.MorphAnalyzer()

In [48]:
def get_morph_text(input_text):
    text = re.sub('[^a-zA-Z]', ' ', input_text.replace('\\n', ''))
    text = nltk.word_tokenize(text)
    clear_text = []
    for word in text:
        normal_word = morph.parse(word)[0].normal_form
        if normal_word not in stopwords.words('english'):
            clear_text.append(normal_word)

    return ' '.join(clear_text)


def get_not_morph_text(input_text):
    text = re.sub('[^a-zA-Z]', ' ', input_text.replace('\\n', ''))
    text = nltk.word_tokenize(text)
    clear_text = [word for word in text if word not in stopwords.words('english')]
    return ' '.join(clear_text)

In [18]:
train_comments = train_df['Comment'].values
test_comments = test_df['Comment'].values

train_y = train_df['Topic'].values
test_y = test_df['Topic'].values

In [19]:
train_comments.shape, train_y.shape, test_comments.shape, test_y.shape

((8695,), (8695,), (1586,), (1586,))

In [8]:
train_x = [get_morph_text(comment) for comment in train_comments]
test_x = [get_morph_text(comment) for comment in test_comments]

In [36]:
not_morph_train_x = [get_not_morph_text(comment) for comment in train_comments]
not_morph_test_x = [get_not_morph_text(comment) for comment in test_comments]

In [105]:
morph_lin = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=150))
])

not_morph_lin = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=150))
])

In [100]:
morph_lin.fit(train_x, train_y)
lin_pred_y = morph_lin.predict(test_x)

In [101]:
print(classification_report(lin_pred_y, test_y))

              precision    recall  f1-score   support

     Biology       0.85      0.83      0.84       628
   Chemistry       0.83      0.79      0.81       531
     Physics       0.80      0.87      0.84       427

    accuracy                           0.83      1586
   macro avg       0.83      0.83      0.83      1586
weighted avg       0.83      0.83      0.83      1586



In [108]:
not_morph_lin.fit(not_morph_train_x, train_y)
not_morph_pred_y = not_morph_lin.predict(not_morph_test_x)

In [109]:
print(classification_report(not_morph_pred_y, test_y))

              precision    recall  f1-score   support

     Biology       0.85      0.84      0.84       627
   Chemistry       0.84      0.79      0.81       535
     Physics       0.80      0.88      0.83       424

    accuracy                           0.83      1586
   macro avg       0.83      0.83      0.83      1586
weighted avg       0.83      0.83      0.83      1586



In [None]:
from gensim.models import word2vec

In [None]:
morph_lin_wv = Pipeline([
    ('wv', word2vec),
    ('clf', LogisticRegression(max_iter=150))
])

not_morph_lin_wv = Pipeline([
    ('wv', word2vec),
    ('clf', LogisticRegression(max_iter=150))
])

In [None]:
morph_lin_wv.fit(train_x, train_y)
wv_pred_y = morph_lin_wv.predict(test_x)

In [None]:
print(classification_report(wv_pred_y, test_y))

In [None]:
not_morph_lin_wv.fit(train_x, train_y)
not_morph_wv_pred_y = not_morph_lin_wv.predict(test_x)

In [None]:
print(classification_report(not_morph_wv_pred_y, test_y))