In [1]:
import json
import nltk
import numpy as np
import pandas as pd

from html import unescape
from functools import partial
from string import punctuation

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import SGDClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

import torch
from tqdm import tqdm

__load previous data__

In [2]:
apostrophe_dict = json.load(open('../hw1/data/apostrophe.json', 'r', encoding='utf-8'))
short_word_dict = json.load(open('../hw1/data/short_words.json', 'r', encoding='utf-8'))
emoticon_dict = json.load(open('../hw1/data/emotions.json', 'r', encoding='utf-8'))

data = pd.read_csv('../hw1/data/train_tweets.csv')
data['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

__used functions__

In [3]:
# взято из hw1
def remap_func(text, dictionary):
    return ' '.join([dictionary.get(w, w) for w in text.split()])

In [4]:
# собрано из hw1
def data_clean(data):
    stopwords = nltk.corpus.stopwords.words('english')
    lemm = nltk.stem.wordnet.WordNetLemmatizer()
    data = data.copy()
    data = data.apply(unescape)
    data = data.str.replace(r'@\w*', '', regex=True)
    data = data.str.lower()
    data = data.apply(partial(remap_func, dictionary=apostrophe_dict))
    data = data.apply(partial(remap_func, dictionary=short_word_dict))
    data = data.apply(partial(remap_func, dictionary=emoticon_dict))
    data = data.str.replace(f'[{punctuation}0-9]', ' ', regex=True)
    data = data.str.replace(r'\b\S\b', '', regex=True)
    data = data.apply(nltk.tokenize.word_tokenize)
    data = data.apply(lambda tokens: [w for w in tokens if w not in stopwords])
    data = data.apply(lambda tokens: [lemm.lemmatize(w) for w in tokens])
    return data

In [5]:
# prepare train data
data['lemma'] = data_clean(data['tweet'])
data.head()

Unnamed: 0,id,label,tweet,lemma
0,1,0,@user when a father is dysfunctional and is s...,"[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0,@user @user thanks for #lyft credit i can't us...,"[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0,bihday your majesty,"[bihday, majesty]"
3,4,0,#model i love u take with u all the time in ...,"[model, love, take, time, urð±, ,..."
4,5,0,factsguide: society now #motivation,"[factsguide, society, motivation]"


__vectorizers__

In [20]:
#
corpus = data['lemma'].str.join(' ')


class DataKeeper:
    def __init__(self, corpus, **models):
        self.corpus = corpus
        self.models = models
        self.fields = []
        self.fitted = []
    
    def prepare(self):
        for name, model in self.models.items():
            model.fit(self.corpus)
            matrix = model.transform(self.corpus)
            mtx_name, mdl_name = f'matrix_{name}', f'model_{name}'
            setattr(self, mtx_name, matrix)
            setattr(self, mdl_name, model)
            self.fields.append(mtx_name)
            self.fitted.append(mdl_name)

In [21]:
# подготовка матриц
keeper = DataKeeper(corpus,
    cnvec_high=CountVectorizer(stop_words='english', max_features=1000, min_df=1, max_df=0.9),
    tfidf_high=TfidfVectorizer(stop_words='english', max_features=1000, min_df=1, max_df=0.9),
    cnvec_mid=CountVectorizer(stop_words='english', max_features=1000, min_df=1e-3, max_df=0.7),
    tfidf_mid=TfidfVectorizer(stop_words='english', max_features=1000, min_df=1e-3, max_df=0.7),
    cnvec_low=CountVectorizer(stop_words='english', max_features=1000, min_df=1e-5, max_df=0.3),
    tfidf_low=TfidfVectorizer(stop_words='english', max_features=1000, min_df=1e-5, max_df=0.3),
    hashing=HashingVectorizer(n_features=1000),
)
keeper.prepare()

In [15]:
# обучение и сравнение классификаторов
reports = {}
for fname in keeper.fields:
    matrix = getattr(keeper, fname)
    precision, recall, f1 = [], [], []
    # train/valid split    
    skf = StratifiedKFold(3, shuffle=True, random_state=11)
    for train, valid in skf.split(matrix, data['label']):
        # model = SGDClassifier(learning_rate='adaptive', eta0=0.01, class_weight='balanced', random_state=19)
        model = SGDClassifier(learning_rate='adaptive', eta0=0.01, random_state=19)
        # model = LGBMClassifier(class_weight='balanced', random_state=19)
        # model = LGBMClassifier(random_state=19)
        model.fit(matrix[train].astype(np.float32), data.loc[train, 'label'])
        predicts = model.predict(matrix[valid].astype(np.float32))
        # metrics & report
        precision.append(precision_score(data.loc[valid, 'label'], predicts))
        recall.append(recall_score(data.loc[valid, 'label'], predicts))
        f1.append(f1_score(data.loc[valid, 'label'], predicts))
        reports[fname] = [np.mean(precision), np.mean(recall), np.mean(f1)]

pd.DataFrame(reports, index=['precision', 'recall', 'f1']).T

Unnamed: 0,precision,recall,f1
matrix_cnvec_high,0.822866,0.325597,0.46622
matrix_tfidf_high,0.847976,0.273854,0.413852
matrix_cnvec_mid,0.821968,0.325597,0.466094
matrix_tfidf_mid,0.849493,0.274746,0.415084
matrix_cnvec_low,0.822866,0.325597,0.46622
matrix_tfidf_low,0.847976,0.273854,0.413852
matrix_hashing,0.947595,0.075821,0.140269


__feature importance__

In [16]:
# take one model
matrix = keeper.matrix_tfidf_mid
train, valid = train_test_split(np.arange(matrix.shape[0]), test_size=0.2, stratify=data['label'], shuffle=True, random_state=11)

# model = LGBMClassifier(random_state=19)
model = SGDClassifier(learning_rate='adaptive', eta0=0.01, random_state=19)

model.fit(matrix[train], data.loc[train, 'label'])
predicts = model.predict(matrix[valid])
print(classification_report(data.loc[valid, 'label'], predicts))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5945
           1       0.86      0.26      0.40       448

    accuracy                           0.95      6393
   macro avg       0.91      0.63      0.68      6393
weighted avg       0.94      0.95      0.93      6393



In [17]:
# read more: https://towardsdatascience.com/boruta-explained-the-way-i-wish-someone-explained-it-to-me-4489d70e154a
class SGDFeatureSelector:
    def __init__(self, model, n_estimators=100, random_state=None):
        self.model = model
        self.n_estimators = n_estimators
        self.randomizer = np.random.default_rng(random_state)
        self.hits = None
    
    def fit(self, X, y=None):
        self.hits = np.zeros(X.shape[1])        
        for n in tqdm(range(self.n_estimators), total=self.n_estimators, desc='Fitting'):
            # extend X with shadows - вот тут хорошо бы не переходить к dense-матрице, но реализация требует времени
            shadows = np.apply_along_axis(np.random.permutation, 1, X.toarray())
            X_extended = np.hstack([X.toarray(), shadows])
            self.model.fit(X_extended, y)
            # get importances
            importances = self.model.coef_.flatten()
            threshold = importances[importances.size // 2:].max()
            self.hits += importances[:importances.size // 2] > threshold

In [18]:
# обучение
fs = SGDFeatureSelector(model, n_estimators=10, random_state=17)
fs.fit(matrix[train], data.loc[train, 'label'])

Fitting: 100%|██████████| 10/10 [00:48<00:00,  4.84s/it]


In [22]:
idx = fs.hits.argsort()[:50]
keeper.model_tfidf_mid.get_feature_names_out()[idx]

array(['able', 'pop', 'positive', 'positivity', 'post', 'power', 'pray',
       'prayer', 'prayfororlando', 'praying', 'pre', 'present', 'pretty',
       'poor', 'previous', 'pride', 'probably', 'problem', 'product',
       'project', 'proud', 'public', 'pulse', 'punjab', 'puppy', 'pussy',
       'queen', 'price', 'question', 'pool', 'political', 'peace',
       'people', 'perfect', 'person', 'pet', 'phone', 'photo',
       'photography', 'photooftheday', 'pic', 'picoftheday', 'picture',
       'politician', 'piece', 'pizza', 'place', 'plan', 'planning',
       'play'], dtype=object)

__NN approach__

In [23]:
class TweetDataset():
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
    def __len__(self):
        return self.X.shape[0]

In [24]:
class Cell(torch.nn.Module):
    def __init__(self, inp, out, *, drop=0):
        super().__init__()
        self.linear = torch.nn.Linear(inp, out)
        self.bn = torch.nn.BatchNorm1d(out)
        self.dp = torch.nn.Dropout(drop) if drop else None
    
    def forward(self, x):
        x = self.linear(x)
        x = torch.relu(x)
        x = self.bn(x)
        if self.dp is not None:
            x = self.dp(x)
        return x


class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.cell1 = Cell(1000, 512, drop=0.2)
        self.cell2 = Cell(512, 128, drop=0.2)
        self.cell3 = Cell(128, 1, drop=0.2)
        
    def forward(self, x):
        x = self.cell1(x)
        x = self.cell2(x)
        x = self.cell3(x)
        return torch.sigmoid(x)

In [25]:
EPOCHS = 5
matrix = keeper.matrix_tfidf_mid

# init network
device = 'cpu'      # на gpu не работает
net = Net().to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
criterion = torch.nn.BCELoss()

# make data loader
tweetset = TweetDataset(matrix[train].toarray(), data.loc[train, 'label'].reset_index(drop=True))
loader = torch.utils.data.DataLoader(tweetset, batch_size=64, shuffle=True)

# train
net.train()
for ep in range(EPOCHS):
    sum_loss, items = 0.0, 0
    pbar = tqdm(enumerate(loader), total=len(loader), desc=f'Epoch {ep + 1}/{EPOCHS}')
    for i, batch in pbar:
        inputs, labels = batch[0].to(device).float(), batch[1].to(device).float()
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs.flatten(), labels)
        loss.backward()
        optimizer.step()

        sum_loss += loss.item()
        items += len(labels)
        pbar.set_postfix({'cumulative loss per item': sum_loss / items})
print('\nDone.')


Epoch 1/5: 100%|██████████| 400/400 [00:03<00:00, 113.99it/s, cumulative loss per item=0.00938]
Epoch 2/5: 100%|██████████| 400/400 [00:03<00:00, 113.14it/s, cumulative loss per item=0.00718]
Epoch 3/5: 100%|██████████| 400/400 [00:03<00:00, 116.79it/s, cumulative loss per item=0.00606]
Epoch 4/5: 100%|██████████| 400/400 [00:03<00:00, 115.66it/s, cumulative loss per item=0.00526]
Epoch 5/5: 100%|██████████| 400/400 [00:03<00:00, 115.03it/s, cumulative loss per item=0.00476]


Done.





In [28]:
net.eval()
predicts = net(torch.FloatTensor(matrix[valid].toarray())).detach().numpy() > 0.5
print(classification_report(data.loc[valid, 'label'], predicts))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      5945
           1       0.72      0.40      0.52       448

    accuracy                           0.95      6393
   macro avg       0.84      0.70      0.74      6393
weighted avg       0.94      0.95      0.94      6393



In [None]:
#