In [1]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, preprocessing, linear_model
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
import itertools

import pandas as pd
from nltk.util import ngrams

### read data

In [2]:
combine_df = pd.read_pickle("./preprocessed_df.pkl")
combine_df.head(5)

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, caus, offer, wheelc..."
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesti]"
3,4,0.0,model love you take with you all the time in y...,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time]","[model, love, take, time]","[model, love, take, time]"
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguid, societi, motiv]"


### CountVectorizer

In [3]:
def dummy(doc):
    return doc

In [4]:
count_vectorizer_st = CountVectorizer(ngram_range=(1, 1), analyzer='word', tokenizer=dummy, preprocessor=dummy, binary=False, max_df = 0.9, max_features = 1000, stop_words = 'english')

In [5]:
# CountVectorizer?

In [6]:
bag_of_words_countv = count_vectorizer_st.fit_transform(combine_df['tweet_stemmed'])
feature_names_countv = count_vectorizer_st.get_feature_names()
tweet_stemmed_countv = pd.DataFrame(bag_of_words_countv.toarray(), columns = feature_names_countv)
tweet_stemmed_countv.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
count_vectorizer_lem = CountVectorizer(ngram_range=(1, 1), analyzer='word', tokenizer=dummy, preprocessor=dummy, binary=False, max_df = 0.9, max_features = 1000, stop_words = 'english')

In [8]:
bag_of_words_countv = count_vectorizer_lem.fit_transform(combine_df['tweet_lemmatized'])
feature_names_countv = count_vectorizer_lem.get_feature_names()
tweet_lemmatized_countv = pd.DataFrame(bag_of_words_countv.toarray(), columns = feature_names_countv)
tweet_lemmatized_countv.head()

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,year,yesterday,yo,yoga,york,young,youth,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# (tweet_lemmatized_countv.columns == tweet_stemmed_countv.columns).sum()

### TfidfVectorizer

In [10]:
tfidf_vectorizer_st = TfidfVectorizer(ngram_range=(1, 1), analyzer='word', tokenizer=dummy, preprocessor=dummy, binary=False, max_df = 0.9, max_features = 1000, stop_words = 'english')

In [11]:
bag_of_words_tfidfv = tfidf_vectorizer_st.fit_transform(combine_df['tweet_stemmed'])
feature_names_tfidfv = tfidf_vectorizer_st.get_feature_names()
tweet_stemmed_tfidfv = pd.DataFrame(bag_of_words_tfidfv.toarray(), columns = feature_names_tfidfv)
tweet_stemmed_tfidfv.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
tfidf_vectorizer_lem = TfidfVectorizer(ngram_range=(1, 1), analyzer='word', tokenizer=dummy, preprocessor=dummy, binary=False, max_df = 0.9, max_features = 1000, stop_words = 'english')

In [13]:
bag_of_words_tfidfv = tfidf_vectorizer_lem.fit_transform(combine_df['tweet_lemmatized'])
feature_names_tfidfv = tfidf_vectorizer_lem.get_feature_names()
tweet_lemmatized_tfidfv = pd.DataFrame(bag_of_words_tfidfv.toarray(), columns = feature_names_tfidfv)
tweet_lemmatized_tfidfv.head()

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,year,yesterday,yo,yoga,york,young,youth,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# tweet_lemmatized_countv['abl'].sum() == tweet_lemmatized_tfidfv['abl'].sum()

### test data check

In [15]:
# Загружаем данные
data = open('corpus').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# создаем df
test_df = pd.DataFrame()
test_df['text'] = texts
test_df['label'] = labels
test_df.head(5)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2
2,Amazing!: This soundtrack is my favorite music...,__label__2
3,Excellent Soundtrack: I truly like this soundt...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After He...",__label__2


In [16]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(test_df['text'], test_df['label'])

# labelEncode целевую переменную
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [17]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words = 'english')
count_vect.fit(test_df['text'])

CountVectorizer(stop_words='english', token_pattern='\\w{1,}')

In [18]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words = 'english')
tfidf_vect.fit(test_df['text'])

TfidfVectorizer(stop_words='english', token_pattern='\\w{1,}')

In [19]:
res = {}
for vec in [count_vect, tfidf_vect, count_vectorizer_st, count_vectorizer_lem, tfidf_vectorizer_st, tfidf_vectorizer_lem]:
#     vec.fit(test_df['text'])
    xtrain_count =  vec.transform(train_x)
    xvalid_count =  vec.transform(valid_x)

    classifier = linear_model.LogisticRegression()
    classifier.fit(xtrain_count, train_y)
    predictions = classifier.predict(xvalid_count)
    res[str(vec)] = accuracy_score(valid_y, predictions)

In [20]:
pd.DataFrame.from_dict(res, orient='index', columns = ['accuracy'])

Unnamed: 0,accuracy
"CountVectorizer(stop_words='english', token_pattern='\\w{1,}')",0.8488
"TfidfVectorizer(stop_words='english', token_pattern='\\w{1,}')",0.8448
"CountVectorizer(max_df=0.9, max_features=1000,\n preprocessor=<function dummy at 0x00000262EA045558>,\n stop_words='english',\n tokenizer=<function dummy at 0x00000262EA045558>)",0.5036
"TfidfVectorizer(max_df=0.9, max_features=1000,\n preprocessor=<function dummy at 0x00000262EA045558>,\n stop_words='english',\n tokenizer=<function dummy at 0x00000262EA045558>)",0.5036


Векторайзеры, натренированные на других данных, работают одинаково плохо

### tokenizer params

In [21]:
max_feats = [100, 250, 500, 750, 1000, None]
max_dfs = [0.1, 0.5, 0.75, 1.0]
ngrams = [(1, 1), (2, 2), (1, 2)]

In [22]:
res_params = {}
for f, d, n in list(itertools.product(max_feats, max_dfs, ngrams)):
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range = n, max_df = d, max_features = f, stop_words = 'english')
    tfidf_vect.fit(test_df['text'])
    
    xtrain_count =  tfidf_vect.transform(train_x)
    xvalid_count =  tfidf_vect.transform(valid_x)

    classifier = linear_model.LogisticRegression()
    classifier.fit(xtrain_count, train_y)
    predictions = classifier.predict(xvalid_count)
    res_params['f_'+str(f)+' df_'+str(d) + ' n_' + str(n)] = accuracy_score(valid_y, predictions)

In [23]:
pd.DataFrame.from_dict(res_params, orient='index', columns = ['accuracy']).sort_values(by = 'accuracy', ascending = False)

Unnamed: 0,accuracy
"f_None df_1.0 n_(1, 1)",0.8448
"f_None df_0.75 n_(1, 1)",0.8448
"f_None df_0.5 n_(1, 1)",0.8448
"f_None df_0.1 n_(1, 2)",0.8396
"f_None df_1.0 n_(1, 2)",0.8368
...,...
"f_250 df_0.1 n_(2, 2)",0.6712
"f_100 df_1.0 n_(2, 2)",0.6400
"f_100 df_0.75 n_(2, 2)",0.6400
"f_100 df_0.5 n_(2, 2)",0.6400


Cамый высокий результат у векторайзера с дефолтными настройками

### SVD

In [24]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words = 'english')
tfidf_vect.fit(test_df['text'])

TfidfVectorizer(stop_words='english', token_pattern='\\w{1,}')

In [25]:
res_svd = {}
for n in [100, 250, 500, 750, 1000]:
    svd = TruncatedSVD(n_components=n, random_state=42, algorithm = 'arpack')

    xtrain_count =  tfidf_vect.transform(train_x)
    xvalid_count =  tfidf_vect.transform(valid_x)

    xtrain_count_svd =  svd.fit_transform(xtrain_count)
    xvalid_count_svd =  svd.transform(xvalid_count)

    classifier = linear_model.LogisticRegression()
    classifier.fit(xtrain_count_svd, train_y)
    predictions = classifier.predict(xvalid_count_svd)
    res_svd[str(n)] = accuracy_score(valid_y, predictions)

In [26]:
pd.DataFrame.from_dict(res_svd, orient='index', columns = ['accuracy_svd'])

Unnamed: 0,accuracy_svd
100,0.808
250,0.8272
500,0.8304
750,0.8352
1000,0.8356


Понижение размерности снизило точность