In [1]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
# nltk.download('wordnet')

Функция ниже использует лемматизатор, для получения формы слова, убирает лишние символы и слова


In [2]:
lemmatizer = WordNetLemmatizer()
train_path = 'aclImdb/train'
stop_words = set(stopwords.words('english'))
vectorizer = TfidfVectorizer()

def text_prep(text):
    text = re.sub("[^a-zA-Z]"," ", text)
    words = word_tokenize(text.lower())
    words = [word for word in words if not word in stop_words]
    lemma_words = ' '.join([lemmatizer.lemmatize(word) for word in words])
    return lemma_words

    
train = pd.DataFrame(columns=['review', 'class'])
for file in os.listdir(train_path + '/neg'):
    f = open(train_path + '/neg/' + file, 'r',  encoding="utf-8")
    text = f.read()
    train = train.append({'review':text_prep(text), 'class':'neg'}, ignore_index=True)
    

for file in os.listdir(train_path + '/pos'):
    f = open(train_path + '/pos/' + file, 'r',  encoding="utf-8")
    text = f.read()
    train = train.append({'review':text_prep(text), 'class':'pos'}, ignore_index=True)
    

KeyboardInterrupt: 

In [None]:
test_path = 'aclImdb/test'

test = pd.DataFrame(columns=['review', 'class'])
for file in os.listdir(test_path + '/neg'):
    f = open(test_path + '/neg/' + file, 'r',  encoding="utf-8")
    text = f.read()
    test = test.append({'review':text_prep(text), 'class':'neg'}, ignore_index=True)
    

for file in os.listdir(test_path + '/pos'):
    f = open(test_path + '/pos/' + file, 'r',  encoding="utf-8")
    text = f.read()
    test = test.append({'review':text_prep(text), 'class':'pos'}, ignore_index=True)
    

In [5]:
train.to_csv('train.csv')
test.to_csv('test.csv')

NameError: name 'train' is not defined

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
from sklearn.model_selection import train_test_split

tfidf = TfidfVectorizer()

test = test.sample(frac=1).reset_index(drop=True)
train_target = train['class'] == 'pos'
test_target = train['class'] == 'pos'

## Обучение моделей с использованием tf-idf векторизации


In [18]:
X_train, X_valid, y_train, y_valid = train_test_split(train['review'], train_target)
train_matrix = tfidf.fit_transform(X_train)
valid_matrix = tfidf.transform(X_valid)
test_matrix = tfidf.transform(test['review'])

In [53]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score, precision_score


model = SGDClassifier()
model.fit(train_matrix, y_train)
preds = model.predict(valid_matrix)
print('roc-auc ', roc_auc_score(y_valid, preds))

roc-auc  0.8970101344985837


In [54]:
model = LinearSVC()
model.fit(train_matrix, y_train)
preds = model.predict(valid_matrix)
print('roc-auc ', roc_auc_score(y_valid, preds))

roc-auc  0.8955734376356823


In [52]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(train_matrix.todense(), y_train)
preds = model.predict(valid_matrix.todense())
print(roc_auc_score(y_valid, preds))

0.6706020691711683


## Обучение моделей с использованием Bag of words

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

bag = CountVectorizer()
train_matrix = bag.fit_transform(X_train)
valid_matrix = bag.transform(X_valid)
test_matrix = bag.transform(test['review'])

In [22]:
model = SGDClassifier()
model.fit(train_matrix, y_train)
preds = model.predict(valid_matrix)
print(roc_auc_score(y_valid, preds))

0.8715992371477584


In [23]:
model = LinearSVC()
model.fit(train_matrix, y_train)
preds = model.predict(valid_matrix)
print(roc_auc_score(y_valid, preds))

0.8644982886615843




In [24]:
train_matrix = tfidf.fit_transform(X_train)
valid_matrix = tfidf.transform(X_valid)
test_matrix = tfidf.transform(test['review'])

Погрешность в оценке моделей с разной векторизацией небольшая, но в данном случае, tf-idf является более точным вариантом, так как текстов достаточно много, чтобы можно было делать поправку на весь набор текстов

# Настройка гиперпараметров для SVC

### L2

In [35]:
model = LinearSVC()
model.fit(train_matrix, y_train)
preds = model.predict(test_matrix)
print('roc-auc', roc_auc_score(test_target, preds))

roc-auc 0.50408


### L1

In [36]:
model = LinearSVC(penalty='l1', dual=False)
model.fit(train_matrix, y_train)
preds = model.predict(test_matrix)
print('roc-auc', roc_auc_score(test_target, preds))

roc-auc 0.5012


In [37]:
model = LinearSVC(penalty='l2', loss='hinge')
model.fit(train_matrix, y_train)
preds = model.predict(test_matrix)
print('roc-auc', roc_auc_score(test_target, preds))

roc-auc 0.5042


In [32]:
for c in np.arange(0.1, 1, 0.1):
    model = LinearSVC(penalty='l2', C=c, loss='hinge')
    model.fit(train_matrix, y_train)
    preds = model.predict(test_matrix)
    print('c= ', round(c, 2), ', roc-auc ', roc_auc_score(test_target, preds))

c=  0.1 , roc-auc  0.5054
c=  0.2 , roc-auc  0.50624
c=  0.3 , roc-auc  0.5048
c=  0.4 , roc-auc  0.50552
c=  0.5 , roc-auc  0.50452
c=  0.6 , roc-auc  0.50512
c=  0.7 , roc-auc  0.5042399999999999
c=  0.8 , roc-auc  0.50324
c=  0.9 , roc-auc  0.5039199999999999


In [34]:
model = LinearSVC(penalty='l2', C=0.2, loss='hinge')
model.fit(train_matrix, y_train)
preds = model.predict(test_matrix)
print('roc-auc ', roc_auc_score(test_target, preds))

roc-auc  0.50624


## Настройка гиперпараметров для SGD

### L2

In [41]:
model = SGDClassifier(penalty='l2')
model.fit(train_matrix, y_train)
preds = model.predict(test_matrix)
print('roc-auc ', roc_auc_score(test_target, preds))

roc-auc  0.50444


### L1

In [42]:
model = SGDClassifier(penalty='l1')
model.fit(train_matrix, y_train)
preds = model.predict(test_matrix)
print('roc-auc ', roc_auc_score(test_target, preds))

roc-auc  0.50492


### Elastic net

In [43]:
model = SGDClassifier(penalty='elasticnet')
model.fit(train_matrix, y_train)
preds = model.predict(test_matrix)
print('roc-auc ', roc_auc_score(test_target, preds))

roc-auc  0.50584


In [46]:
for i in np.arange(0.1, 1, 0.05):
    model = SGDClassifier(penalty='elasticnet', l1_ratio=i)
    model.fit(train_matrix, y_train)
    preds = model.predict(test_matrix)
    print('l1_ratio', round(i,2), ', roc-auc ', roc_auc_score(test_target, preds))

l1_ratio 0.1 , roc-auc  0.50588
l1_ratio 0.15 , roc-auc  0.5047200000000001
l1_ratio 0.2 , roc-auc  0.50532
l1_ratio 0.25 , roc-auc  0.50556
l1_ratio 0.3 , roc-auc  0.50456
l1_ratio 0.35 , roc-auc  0.50516
l1_ratio 0.4 , roc-auc  0.50488
l1_ratio 0.45 , roc-auc  0.5042800000000001
l1_ratio 0.5 , roc-auc  0.50572
l1_ratio 0.55 , roc-auc  0.5056799999999999
l1_ratio 0.6 , roc-auc  0.50596
l1_ratio 0.65 , roc-auc  0.50588
l1_ratio 0.7 , roc-auc  0.50564
l1_ratio 0.75 , roc-auc  0.5047200000000001
l1_ratio 0.8 , roc-auc  0.50568
l1_ratio 0.85 , roc-auc  0.50624
l1_ratio 0.9 , roc-auc  0.5065999999999999
l1_ratio 0.95 , roc-auc  0.50612


Наилучший результат при регуляризации elastic net при l1_ratio = 0.25

In [50]:
for i in np.arange(0.0001, 0.003, 0.00005):
    model = SGDClassifier(penalty='elasticnet', l1_ratio=0.25, alpha=i)
    model.fit(train_matrix, y_train)
    preds = model.predict(test_matrix)
    print('alpha', round(i,6), ', roc-auc ', roc_auc_score(test_target, preds))

alpha 0.0001 , roc-auc  0.5058400000000001
alpha 0.00015 , roc-auc  0.50648
alpha 0.0002 , roc-auc  0.50632
alpha 0.00025 , roc-auc  0.50652
alpha 0.0003 , roc-auc  0.50644
alpha 0.00035 , roc-auc  0.50628
alpha 0.0004 , roc-auc  0.5062
alpha 0.00045 , roc-auc  0.50576
alpha 0.0005 , roc-auc  0.5044799999999999
alpha 0.00055 , roc-auc  0.50464
alpha 0.0006 , roc-auc  0.50504
alpha 0.00065 , roc-auc  0.5052
alpha 0.0007 , roc-auc  0.5064000000000001
alpha 0.00075 , roc-auc  0.50624
alpha 0.0008 , roc-auc  0.5063200000000001
alpha 0.00085 , roc-auc  0.50644
alpha 0.0009 , roc-auc  0.50528
alpha 0.00095 , roc-auc  0.5059199999999999
alpha 0.001 , roc-auc  0.50704
alpha 0.00105 , roc-auc  0.5070399999999999
alpha 0.0011 , roc-auc  0.5069600000000001
alpha 0.00115 , roc-auc  0.50688
alpha 0.0012 , roc-auc  0.506
alpha 0.00125 , roc-auc  0.50624
alpha 0.0013 , roc-auc  0.50664
alpha 0.00135 , roc-auc  0.50608
alpha 0.0014 , roc-auc  0.50508
alpha 0.00145 , roc-auc  0.506
alpha 0.0015 , roc-a

оптимальный результат возникает при альфа = 0.00015