# Загрузка библиотек

In [1]:
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Подготовка данных

Объединим все отзывы (положительные и отрицательные) в единый файл. Один - для тренировочных данных, второй - для тестовых.

In [2]:
# пути к файлам с отзывами
TRAIN_POS = 'train\\pos'
TRAIN_NEG = 'train\\neg'
TEST_POS = 'test\\pos'
TEST_NEG = 'test\\neg'

In [3]:
# функция проходит по папке и сохраняет пути к файлам
def merge_reviews(path_pos, path_neg):
    ls = []
    
    for root, dirs, files in os.walk(path_pos):
        ls.extend([os.path.join(root,file) for file in files])    
    
    for root, dirs, files in os.walk(path_neg):
        ls.extend([os.path.join(root,file) for file in files])       
    
    return ls

In [4]:
# для обучения
ls_train = merge_reviews(TRAIN_POS, TRAIN_NEG)
# для теста
ls_test = merge_reviews(TEST_POS, TEST_NEG)

In [5]:
# для проверки (первые 12500 отзывов - положительные, вторые 12500 - отрицательные)

In [6]:
ls_train[:5]

['train\\pos\\0_9.txt',
 'train\\pos\\10000_8.txt',
 'train\\pos\\10001_10.txt',
 'train\\pos\\10002_7.txt',
 'train\\pos\\10003_8.txt']

In [7]:
ls_train[-5:]

['train\\neg\\9998_4.txt',
 'train\\neg\\9999_3.txt',
 'train\\neg\\999_3.txt',
 'train\\neg\\99_1.txt',
 'train\\neg\\9_1.txt']

In [8]:
len(ls_train)

25000

In [67]:
# сохраним в виде файла

# тренировочные данные
with open('train_data.txt','w') as f:
    for file in ls_train: 
        s = open(file, errors="ignore").read()
        f.write(s)
        f.write('\n')
        
# тестовые данные 
with open('test_data.txt','w') as f:
    for file in ls_test: 
        s = open(file, errors="ignore").read()
        f.write(s)
        f.write('\n')

In [9]:
# данные в виде списков

reviews_train = []
for line in open('train_data.txt', 'r'):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('test_data.txt', 'r'):
    reviews_test.append(line.strip())

In [9]:
reviews_train[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [10]:
reviews_test[0]

"I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."

# Baseline model

## Подготовка данных

Сделаем простую предварительную подготовку данных: удалим знаки препинания и сделаем все буквы строчными.

In [10]:
REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

In [11]:
# было
reviews_train[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [12]:
# стало
reviews_train_clean[0]

'bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt'

## Векторизация

Полученные данные необходимо векторизовать. Самый простой способ - это CountVectorizer. В результате него получим большуб разреженную матрицу.

In [13]:
# создаем векторизатор
cv = CountVectorizer(binary=True)
# обучаем на трейн-данных
cv.fit(reviews_train_clean)
# векторизуем трейн и тест данные
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

## Целевой признак

У нас есть две задачи:

1. Определение статуса комментария: положительный или отрицательный
2. Определени рейтинга (от 1 до 10)

### Статус комменатрия: положительный/отрицательный

Признак бинарный: 1 - положительный отзыв, 0 - отрицательные отзыв

In [14]:
# первые 12500 отзывов - положительные, вторые 12500 - отрицательные
# подходит как и для трейн и тест данных
target = [1 if i < 12500 else 0 for i in range(25000)]

# для проверки
print(target[:4])
print(target[12498:12502])
print(target[-4:])

[1, 1, 1, 1]
[1, 1, 0, 0]
[0, 0, 0, 0]


### Рейтинг (от 1 до 10)

Признак многоклассовый. Считаем рейтинги из названий файлов.

In [15]:
rating = []
for i in ls_train:
    rating.append(int(re.findall(r'_\d+', i)[0][1:]))

In [16]:
# для проверки

In [17]:
rating[:10]

[9, 8, 10, 7, 8, 8, 7, 7, 7, 7]

In [18]:
ls_train[:10]

['train\\pos\\0_9.txt',
 'train\\pos\\10000_8.txt',
 'train\\pos\\10001_10.txt',
 'train\\pos\\10002_7.txt',
 'train\\pos\\10003_8.txt',
 'train\\pos\\10004_8.txt',
 'train\\pos\\10005_7.txt',
 'train\\pos\\10006_7.txt',
 'train\\pos\\10007_7.txt',
 'train\\pos\\10008_7.txt']

# Обучение

### Cтатус комменатрия: положительный/отрицательный

Самая простая модель для обучения - Логистическая регресиия. Обучим как есть "из коробки", без подбора парметров.

In [19]:
# Разделение на трейн и валидацию
X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.75, random_state=10)

In [20]:
# модель
lr = LogisticRegression(solver = 'liblinear') # с solver='lbfgs' выходит ошибка
# обучение
lr.fit(X_train, y_train)
# точность на валидационных данных
accuracy_score(y_val, lr.predict(X_val))

LogisticRegression(solver='liblinear')

In [22]:
# обучим на всех данных
lr.fit(X, target)
# точность на тестовых данных
accuracy_score(target, lr.predict(X_test))

0.8692

### Рейтинг

In [23]:
# Разделение на трейн и валидацию
X_train, X_val, y_train, y_val = train_test_split(X, rating, train_size = 0.75, random_state=10)

In [30]:
# модель
lr1 = LogisticRegression(solver = 'liblinear') # с solver='lbfgs' выходит ошибка
# обучение
lr1.fit(X_train, y_train)
# точность на валидационных данных
accuracy_score(y_val, lr1.predict(X_val))

0.37552

In [31]:
# обучим на всех данных
lr1.fit(X, rating)
# точность на тестовых данных
accuracy_score(rating, lr1.predict(X_test))

0.25192

# Выводы

Базовая модель для определения статуса комментария (положительный/отрицательный) дает уже хорошие результаты при самой простой подготовке данных. 

Модель для определение рейтинга работает плохо.

Будем пытаться улучшить результаты при помощи более сложных способов предобработки.