In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [9]:
# считываем данные и заполняем общий датасет
positive = pd.read_csv('positive.csv', sep=';', usecols=[3], names=['text'])
positive['label'] = ['positive'] * len(positive)
negative = pd.read_csv('negative.csv', sep=';', usecols=[3], names=['text'])
negative['label'] = ['negative'] * len(negative)

df=pd.concat([positive,negative])

In [10]:
df.tail()

Unnamed: 0,text,label
111918,Но не каждый хочет что то исправлять:( http://...,negative
111919,скучаю так :-( только @taaannyaaa вправляет мо...,negative
111920,"Вот и в школу, в говно это идти уже надо(",negative
111921,"RT @_Them__: @LisaBeroud Тауриэль, не грусти :...",negative
111922,Такси везет меня на работу. Раздумываю приплат...,negative


In [11]:
x_train, x_test, y_train, y_test = train_test_split(df.text, df.label)

In [12]:
from sklearn.linear_model import LogisticRegression # можно заменить на любимый классификатор
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [None]:
#токены с высокой частотой :

In [161]:
vec = CountVectorizer(ngram_range=(1, 1),min_df=1000)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(bow, y_train)

In [162]:
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.63      0.63      0.63     27949
    positive       0.64      0.64      0.64     28760

    accuracy                           0.63     56709
   macro avg       0.63      0.63      0.63     56709
weighted avg       0.63      0.63      0.63     56709



In [None]:
#токены с средней частотой :

In [163]:
vec = CountVectorizer(ngram_range=(1, 1),min_df=100,max_df=1000)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(bow, y_train)

In [164]:
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.57      0.70      0.62     22607
    positive       0.76      0.64      0.70     34102

    accuracy                           0.67     56709
   macro avg       0.66      0.67      0.66     56709
weighted avg       0.68      0.67      0.67     56709



In [None]:
#токены с низкой частотой :

In [165]:
vec = CountVectorizer(ngram_range=(1, 1),max_df=100)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(bow, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [166]:
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.74      0.66      0.70     30982
    positive       0.64      0.72      0.67     25727

    accuracy                           0.69     56709
   macro avg       0.69      0.69      0.69     56709
weighted avg       0.69      0.69      0.69     56709



In [None]:
# лучшие показатели были у токенов с низкой частотой , возможно из за большого их количества 

In [204]:
list(vec.vocabulary_.keys())[:10]

['банда',
 'недооценила',
 'смелость',
 'народа',
 'страны',
 'украины',
 'oladushek_way',
 'funnyhoorse',
 'herlocked',
 'alen4ik222']

In [211]:
import math

future_impotance=pd.DataFrame(list(vec.vocabulary_.keys())[:10],columns=['feature'])

In [212]:
future_impotance['importance']=pow(math.e,np.array(clf.coef_[0][:10]))

future_impotance

Unnamed: 0,feature,importance
0,банда,2.806864
1,недооценила,1.353378
2,смелость,1.122292
3,народа,1.376099
4,страны,1.417457
5,украины,1.264081
6,oladushek_way,1.246739
7,funnyhoorse,1.299135
8,herlocked,0.843358
9,alen4ik222,1.580486


In [None]:
# ### Задание 3.

# 1) сравнить count/tf-idf/hashing векторайзеры/полносвязанную сетку (построить classification_report)

# 2) подобрать оптимальный размер для hashing векторайзера 

# 3) убедиться что для сетки нет переобучения

In [213]:
vec = TfidfVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

    negative       0.74      0.77      0.75     26753
    positive       0.78      0.76      0.77     29956

    accuracy                           0.76     56709
   macro avg       0.76      0.76      0.76     56709
weighted avg       0.76      0.76      0.76     56709



In [216]:
from sklearn.feature_extraction.text import HashingVectorizer

vct = HashingVectorizer(analyzer='word', n_features=200)
vct.fit(x_train)

x_train_hash =  vct.transform(x_train)
x_test_hash =  vct.transform(x_test)

clf = LogisticRegression(random_state=42)
clf.fit(x_train_hash, y_train)
pred = clf.predict(x_test_hash)
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.60      0.61      0.60     27367
    positive       0.63      0.62      0.62     29342

    accuracy                           0.61     56709
   macro avg       0.61      0.61      0.61     56709
weighted avg       0.61      0.61      0.61     56709

