In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize as nltk_word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import torch

In [2]:
df = pd.read_csv('prc_1.csv')
df = df.rename(columns={'text_type':'target'})
df.target = df.target.astype('category')
df.target = df.target.cat.codes

In [3]:
df.head()

Unnamed: 0,target,text
0,1,naturally irresistible your corporate identity...
1,1,the stock trading gunslinger fanny is merrill ...
2,1,unbelievable new homes made easy im wanting to...
3,1,4 color printing special request additional in...
4,1,do not have money get software cds from here s...


TF-IDF + Naive Bias

In [9]:
stop_words = set(stopwords.words('english'))

In [3]:
def to_lower(text):
    return text.lower()
def remove_punctuations(text):
    return re.sub(r'[^\w\s]', '', text)
def to_tokenize(text):
    return nltk_word_tokenize(text)
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

In [4]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def lemmatize(tokens):
    return [morph.parse(word)[0].normal_form for word in tokens]

In [5]:
def text_processing(text,do_lemmatize = False):
    text = to_lower(text)
    text = remove_punctuations(text)
    tokens = to_tokenize(text)
    tokens = remove_stopwords(tokens)
    if do_lemmatize == 0:
        tokens = lemmatize(tokens)
    return ' '.join(tokens)

In [12]:
text = list(df.text)
processed_text = []

for t in tqdm(text):
    processed_text.append(text_processing(t))    

100%|██████████| 20348/20348 [00:06<00:00, 3143.05it/s]


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_text)

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


NameError: name 'processed_text' is not defined

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [16]:
X_train,X_test,y_train,y_test = train_test_split(tfidf_df,df.target,test_size=0.3)
nb = MultinomialNB()
nb.fit(X_train,y_train)

In [153]:
lr = LogisticRegression(penalty='l2')
#lr.fit(X_train,y_train)

In [19]:
roc_auc_score(y_test,nb.predict(X_test)),accuracy_score(y_test,nb.predict(X_test))

(0.7953024832476084, 0.8705978705978706)

In [20]:
roc_auc_score(y_test,lr.predict(X_test)),accuracy_score(y_test,lr.predict(X_test))

(0.8701227214243595, 0.9146601146601147)

FastText

In [4]:
import fasttext

In [7]:
def text_processing(text,lemmatize = False):
    text = to_lower(text)
    text = remove_punctuations(text)
    tokens = to_tokenize(text)
    tokens = remove_stopwords(tokens)
    if lemmatize == True:
        tokens = lemmatize(tokens)
    return ' '.join(tokens)

In [11]:
text = list(df.text)
processed_text = []

for t in tqdm(text):
    processed_text.append(text_processing(t))

100%|██████████| 20348/20348 [00:06<00:00, 3089.52it/s]


In [16]:
with open("data.txt", "w") as file:
    file.write(str(processed_text))

In [17]:
model = fasttext.train_unsupervised(
    input='data.txt',
    model="skipgram",
    dim=300,
    epoch=10,
    lr=0.05,
    minn=3,
    maxn=6,
    minCount=5,
    thread=4) 

Read 0M words
Number of words:  14270
Number of labels: 0
Progress: 100.0% words/sec/thread:   56410 lr:  0.000000 avg.loss:  1.788019 ETA:   0h 0m 0s


In [22]:
model.save_model("fasttext_model.bin")

In [None]:
model.get_nearest_neighbors("shit")
model.get_word_vector("shit")

[(0.8091334104537964, "'shit"),
 (0.7118743062019348, "shit',"),
 (0.709918737411499, 'shite'),
 (0.549150824546814, "'hit"),
 (0.5461899638175964, 'shine'),
 (0.5415017008781433, "'fucking"),
 (0.5332852602005005, 'hit'),
 (0.5323843359947205, "'hmm"),
 (0.530677080154419, 'shift'),
 (0.5305278301239014, 'shouldnt')]

In [8]:
from catboost import CatBoostClassifier as cbc

In [32]:
model = cbc()
fs = fasttext.load_model("fasttext_model.bin")

In [None]:
fs.get_word_vector(processed_text[0])

300

In [None]:
#всего 75 слов, нужно получить по каждому слову вектор размерности 300

In [48]:
temp = [to_tokenize(doc) for doc in processed_text]

In [117]:
corpus = []
for doc in tqdm(temp):
    doc_vectors = []
    for word in doc:
        word_vec = fs.get_word_vector(word)
        doc_vectors.append(word_vec)
    corpus.append(np.mean(doc_vectors,axis=0))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 20348/20348 [00:03<00:00, 5291.91it/s] 


In [118]:
err_index = []
for i in range(len(corpus)):
    if np.isnan(corpus[i]).any():
        err_index.append(i)

In [147]:
X = []
for i in range(len(corpus)):
    if i in err_index:
        continue
    X.append(corpus[i])

In [137]:
y = df.target.drop(err_index)

In [149]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,stratify=y)

In [150]:
model.fit(X_train,y_train)

Learning rate set to 0.032012
0:	learn: 0.6649399	total: 71.6ms	remaining: 1m 11s
1:	learn: 0.6407232	total: 81.9ms	remaining: 40.8s
2:	learn: 0.6161068	total: 93ms	remaining: 30.9s
3:	learn: 0.5933140	total: 103ms	remaining: 25.7s
4:	learn: 0.5707447	total: 113ms	remaining: 22.6s
5:	learn: 0.5517391	total: 124ms	remaining: 20.5s
6:	learn: 0.5339789	total: 134ms	remaining: 19.1s
7:	learn: 0.5189596	total: 145ms	remaining: 17.9s
8:	learn: 0.5040701	total: 155ms	remaining: 17.1s
9:	learn: 0.4894159	total: 165ms	remaining: 16.4s
10:	learn: 0.4760790	total: 176ms	remaining: 15.8s
11:	learn: 0.4621985	total: 186ms	remaining: 15.3s
12:	learn: 0.4498069	total: 196ms	remaining: 14.9s
13:	learn: 0.4398625	total: 206ms	remaining: 14.5s
14:	learn: 0.4302468	total: 217ms	remaining: 14.3s
15:	learn: 0.4211322	total: 229ms	remaining: 14.1s
16:	learn: 0.4122040	total: 240ms	remaining: 13.9s
17:	learn: 0.4034320	total: 250ms	remaining: 13.6s
18:	learn: 0.3944718	total: 260ms	remaining: 13.4s
19:	learn

<catboost.core.CatBoostClassifier at 0x16be4dff0>

In [151]:
roc_auc_score(y_test,model.predict(X_test)),accuracy_score(y_test,model.predict(X_test))

(0.9477332759977711, 0.95719908166612)

In [154]:
lr.fit(X_train,y_train)

In [155]:
roc_auc_score(y_test,lr.predict(X_test)),accuracy_score(y_test,lr.predict(X_test))

(0.890159719572282, 0.9096425057395867)

BERT

In [9]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer

In [10]:
checkpoint = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=len(set(df.target.to_list())))
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [12]:
X_train,X_test,y_train,y_test = train_test_split(df.text.to_list(),df.target.to_list(),test_size=0.3,stratify=df.target)

In [13]:
train_encodings = tokenizer(
    X_train,
    padding=True, 
    return_tensors="pt",
    truncation=True,
    max_length = 512)

In [14]:
test_encodings = tokenizer(
    X_test,
    padding=True, 
    return_tensors="pt",
    truncation=True,
    max_length = 512)

In [15]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [16]:
train_dataset = TextDataset(train_encodings, y_train)
test_dataset   = TextDataset(test_encodings, y_test)


In [19]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    learning_rate=1e-3,   # ВАЖНО: выше, чем обычно
    num_train_epochs=5,   # можно больше эпох
    per_device_train_batch_size=16,
    eval_strategy="epoch"
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [23]:
for param in model.bert.parameters():
    param.requires_grad = False

In [25]:
trainer.train()

  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


KeyboardInterrupt: 