In [None]:
import os
from collections import defaultdict, Counter
import time
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer
import pytorch_lightning as pl
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split
def encode_labels(data, le):
    """Encode the ``Class`` column of ``data`` in place.

    Args:
        data: frame with a ``Class`` column holding the raw labels.
        le: fitted encoder exposing ``transform``.

    Returns:
        The same ``data`` object, with ``Class`` replaced by ``le.transform``
        of its previous values.
    """
    encoded = le.transform(data['Class'])
    data['Class'] = encoded
    return data

# Load the three pre-processed splits.
train = pd.read_csv('../input/prepaid/train_preprocessed.csv')
test = pd.read_csv('../input/prepaid/test_preprocessed.csv')
val = pd.read_csv('../input/prepaid/val_preprocessed.csv')

# The label -> integer mapping is learned from the training labels only.
le = LabelEncoder()
le.fit(train['Class'].values)

# encode_labels rewrites the 'Class' column in place and returns the very same
# frame, so train and val are updated even though only `d` is rebound here.
for d in (train, val):
    d = encode_labels(d, le)

In [None]:
# Install the extra packages into the notebook environment.
# lexicalrichness provides the LexicalRichness class imported in the next cell.
!pip install lexicalrichness
# textblob is not imported anywhere in the visible notebook; presumably it is
# an optional backend of lexicalrichness -- TODO confirm it is still needed.
!pip install textblob

In [None]:
from lexicalrichness import LexicalRichness

In [None]:
# LexicalRichness analyses ONE text. Handing it a whole column either fails
# or yields a single corpus-level number for every row, so the Maas index is
# computed document by document instead.
def _maas_or_zero(text):
    """Return the Maas lexical-richness index of one document.

    Falls back to 0 when the index cannot be computed for this text
    (presumably empty, too short or non-string input -- TODO confirm which
    cases actually occur in the data).
    """
    try:
        return LexicalRichness(text).Maas
    except Exception:
        return 0


train['Rich'] = [_maas_or_zero(text) for text in train['Text']]
test['Rich'] = [_maas_or_zero(text) for text in test['Text']]
val['Rich'] = [_maas_or_zero(text) for text in val['Text']]

In [None]:
# Maas lexical-richness index for every training document.
# Iterating over the column values (rather than looking rows up by label with
# train['Text'][i]) stays correct even when the index is not 0..n-1.
t = []
for text in train['Text']:
    try:
        t.append(LexicalRichness(text).Maas)
    except Exception:
        # The index could not be computed for this text; keep the row and use
        # 0 as a neutral placeholder. `Exception` (not a bare except) so that
        # interrupting the kernel still stops the loop.
        t.append(0)
train['Rich'] = t

In [None]:
# Maas lexical-richness index for every test document.
# Iterating over the column values (rather than looking rows up by label with
# test['Text'][i]) stays correct even when the index is not 0..n-1.
t = []
for text in test['Text']:
    try:
        t.append(LexicalRichness(text).Maas)
    except Exception:
        # The index could not be computed for this text; keep the row and use
        # 0 as a neutral placeholder. `Exception` (not a bare except) so that
        # interrupting the kernel still stops the loop.
        t.append(0)
test['Rich'] = t

In [None]:
# Maas lexical-richness index for every validation document.
# Iterating over the column values (rather than looking rows up by label with
# val['Text'][i]) stays correct even when the index is not 0..n-1.
t = []
for text in val['Text']:
    try:
        t.append(LexicalRichness(text).Maas)
    except Exception:
        # The index could not be computed for this text; keep the row and use
        # 0 as a neutral placeholder. `Exception` (not a bare except) so that
        # interrupting the kernel still stops the loop.
        t.append(0)
val['Rich'] = t

In [None]:
# The full training set does not fit (presumably in memory), so only 70% of it
# is used. The baseline score was computed separately on the same 70% subset
# (see the presentation).
import nltk
from nltk.corpus import stopwords

# stopwords.words() raises LookupError when the corpus has not been downloaded
# yet; fetch it once so the cell also works on a fresh environment.
try:
    stop = stopwords.words('russian')
except LookupError:
    nltk.download('stopwords')
    stop = stopwords.words('russian')

# NOTE: this overwrites `train`, so re-running the cell would split the already
# reduced frame again. `x` (the remaining 30%) is not read anywhere in the
# visible notebook.
train, x = train_test_split(train, train_size=0.7, random_state=42)

In [None]:
# Estimator and feature transformers used by the cells below.
model = LogisticRegression(C=0.01, n_jobs=40)
# TF-IDF over 1- to 3-grams, keeping at most 40000 terms; `stop` is the
# Russian stop-word list built in the previous cell.
vectorizer = TfidfVectorizer(stop_words=stop, ngram_range=(1, 3), max_features=40000)
# `scaler` standardises the text features, `scaler_length` the scalar ones.
scaler = StandardScaler()
scaler_length = StandardScaler()

In [None]:
# Copy the raw feature columns and the target out of the training frame as
# independent NumPy arrays (one array per column, in the order listed).
X_train_texts, X_train_length, X_train_rich, X_train_max, X_train_min, y_train = (
    np.array(train[column].values)
    for column in ('Text', 'Number of tokens', 'Rich', 'Max', 'Min', 'Class')
)

In [None]:
# Fit the TF-IDF vocabulary on the training texts and vectorise them.
# The texts are read from `train` rather than from X_train_texts, which this
# statement overwrites: the cell can therefore be re-run safely instead of
# trying to vectorise its own previous output.
X_train_texts = vectorizer.fit_transform(train['Text'].values)

In [None]:
# Sanity check: display the first row of the vectorised training texts.
X_train_texts[0]

In [None]:
# Dimensionality reduction for the TF-IDF matrix.
# TruncatedSVD uses a randomized solver by default, so the seed is fixed
# (same value as the train/hold-out split) to make the components, and
# therefore the final predictions, reproducible between runs.
svd = TruncatedSVD(n_components=4000, random_state=42)

In [None]:
# Fit the SVD on the training text matrix and project it onto the components.
# NOTE: overwrites X_train_texts, so this cell is meant to run exactly once
# after the vectorisation cell; re-running it would decompose its own output.
X_train_texts = svd.fit_transform(X_train_texts)

In [None]:
# Standardise the reduced text features; `scaler` keeps the fitted statistics
# so the test features can be transformed the same way later.
X_train_texts = scaler.fit_transform(X_train_texts)

In [None]:
# Standardise the token-count feature. The scaler expects a 2-D input, hence
# the (n, 1) reshape; the result stays a single column.
X_train_length = scaler_length.fit_transform(X_train_length.reshape(-1, 1))

In [None]:
# Standardise the lexical-richness feature with its own scaler.
# Re-fitting `scaler_length` here would throw away the statistics it learned
# from the token counts, leaving no scaler that can transform that feature
# for new data.
X_train_rich = X_train_rich.reshape(-1,1)
scaler_rich = StandardScaler()
X_train_rich = scaler_rich.fit_transform(X_train_rich)

In [None]:
# Back to 1-D after the (n, 1) reshape needed for scaling.
X_train_rich = X_train_rich.flatten()

In [None]:
# X_train_min comes straight from a single column, so flatten() only copies it.
X_train_min = X_train_min.flatten()

In [None]:
# X_train_max comes straight from a single column, so flatten() only copies it.
X_train_max = X_train_max.flatten()

In [None]:
# Sanity check: number of training rows in the 'Min' feature.
len(X_train_min)

In [None]:
# Sanity check: number of training rows in the token-count feature.
len(X_train_length)

In [None]:
# Different feature combinations were tried here: the variables passed to
# column_stack were changed, the same set was used for the test matrix, and
# the resulting scores were compared.
X_train = np.column_stack((X_train_texts, X_train_max, X_train_length, X_train_rich))

In [None]:
# Start from a fresh, unfitted classifier right before training
# (same hyper-parameters as the model created in the setup cell).
model = LogisticRegression(C=0.01, n_jobs=40)

In [None]:
# Train the classifier on the assembled training matrix and encoded labels.
model.fit(X_train, y_train)

In [None]:
# --- text features: apply the transformers fitted on the training texts ---
X_test_texts = np.array(test['Text'].values)
X_test_texts = vectorizer.transform(X_test_texts)
X_test_texts = svd.transform(X_test_texts)
X_test_texts = scaler.transform(X_test_texts)

# --- scalar features ---
# Like the text features above, these must be standardised with the mean/std
# of the TRAINING data. Calling fit_transform on the test columns would
# rescale them with test-set statistics, so the model would receive features
# on a different scale than the one it was trained on.
# The scalers are fitted here from the raw `train` columns so that this cell
# neither depends on nor overwrites the state of `scaler_length`.
train_length_scaler = StandardScaler().fit(
    np.array(train['Number of tokens'].values).reshape(-1, 1)
)
train_rich_scaler = StandardScaler().fit(
    np.array(train['Rich'].values).reshape(-1, 1)
)

X_test_length = np.array(test['Number of tokens'])
X_test_length = X_test_length.reshape(-1,1)
X_test_length = train_length_scaler.transform(X_test_length)
X_test_length = X_test_length.flatten()

X_test_max = np.array(test['Max'].values)
X_test_min = np.array(test['Min'].values)

X_test_rich = np.array(test['Rich'])
X_test_rich = X_test_rich.reshape(-1,1)
X_test_rich = train_rich_scaler.transform(X_test_rich)
X_test_rich = X_test_rich.flatten()

In [None]:
# X_test must be built from the same features as X_train, in the same order.
X_test = np.column_stack((X_test_texts, X_test_max, X_test_length, X_test_rich))

In [None]:
# Sanity check: number of rows in the test matrix.
len(X_test)

In [None]:
# Predicted classes for the test set, still as LabelEncoder integers.
preds = model.predict(X_test)

In [None]:
# Persist the predictions, mapped back to the original class labels.
np.save('test_preds_tfidf.npy', le.inverse_transform(preds))

In [None]:
# Attach the predicted (decoded) labels to the test frame.
test['Class'] = le.inverse_transform(preds)

In [None]:
# Write the Id / predicted Class pairs to a CSV file without the index column.
test[['Id', 'Class']].to_csv('test_topic.csv', index=False)