In [1]:
import numpy as np
import pandas as pd
# import sys

In [2]:
# Load the competition training set; each row is one discourse element of an essay.
train = pd.read_csv('../input/feedback-prize-effectiveness/train.csv')

In [3]:
# Peek at the schema: ids, raw discourse text, discourse type and the effectiveness label.
train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [4]:
# Integer class ids, ordered from worst to best effectiveness.
label2id = {
    'Ineffective': 0,
    'Adequate': 1,
    'Effective': 2,
}

# Encode the string label of every row; an unknown label raises KeyError.
train['target'] = [label2id[label] for label in train['discourse_effectiveness'].tolist()]
train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,target
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,1
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,1
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,1
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,1


In [5]:
# Only the class is needed here. Each checkpoint is instantiated inside the
# encoding loop further down, which rebinds `model` before it is ever read,
# so eagerly loading 'all-MiniLM-L6-v2' in this cell was wasted download time
# and memory.
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Precomputed cross-validation splits: one row per essay_id, one column per
# (number of folds, seed) combination.
folds = pd.read_csv('../input/feedback-folds/df_folds.csv')
folds.head()

Unnamed: 0,essay_id,fold_k_5_seed_42,fold_k_5_seed_2020,fold_k_8_seed_42,fold_k_8_seed_2020,fold_k_10_seed_42,fold_k_10_seed_2020
0,00066EA9880D,2,3,0,6,4,8
1,000E6DE9E817,2,1,5,3,4,9
2,0016926B079C,3,3,2,2,0,2
3,00203C45FC55,3,2,7,5,9,5
4,0029F4D19C3F,3,2,1,6,3,2


In [7]:
# Lookup table essay_id -> fold index, using the 5-fold / seed-42 split.
essay2fold = dict(zip(folds['essay_id'].tolist(), folds['fold_k_5_seed_42'].tolist()))

In [8]:
# Every discourse element inherits the fold of its essay, so all rows of one
# essay land in the same fold. A missing essay_id raises KeyError.
train['fold'] = [essay2fold[essay] for essay in train['essay_id'].tolist()]

In [9]:
# Debug switch: uncomment to run the rest of the notebook on a 100-row subsample.
# train = train.sample(n=100, random_state=42)

In [10]:
# Renumber rows 0..n-1, discarding the old index. This only changes anything
# when the subsampling line in the previous cell is enabled.
train = train.reset_index(drop=True)

In [11]:
# Sentence-transformer checkpoints to embed with; the order here fixes the
# column order of the concatenated embedding matrix built later.
models = ['all-mpnet-base-v2', 'all-MiniLM-L6-v2', 'all-distilroberta-v1']

In [12]:
from tqdm.auto import tqdm

# Texts to embed: the discourse text of every row of `train`.
sentences = train['discourse_text'].tolist()

# One embedding matrix per checkpoint, in the same order as `models`.
all_embeddings = []

for checkpoint in tqdm(models):
    # Load the checkpoint, then encode all texts with it via model.encode().
    model = SentenceTransformer(checkpoint)
    embeddings = model.encode(sentences)
    all_embeddings.append(embeddings)
    # Report which checkpoint finished and the shape of its embedding matrix.
    print(checkpoint, embeddings.shape, sep='\n')

  0%|                                                                          | 0/3 [00:00<?, ?it/s]
Downloading: 100%|██████████████████████████████████████████████| 1.18k/1.18k [00:00<00:00, 1.47MB/s][A

Downloading: 100%|███████████████████████████████████████████████████| 190/190 [00:00<00:00, 256kB/s][A

Downloading: 100%|██████████████████████████████████████████████| 10.6k/10.6k [00:00<00:00, 8.70MB/s][A

Downloading: 100%|███████████████████████████████████████████████████| 571/571 [00:00<00:00, 588kB/s][A

Downloading: 100%|███████████████████████████████████████████████████| 116/116 [00:00<00:00, 101kB/s][A

Downloading: 100%|██████████████████████████████████████████████| 39.3k/39.3k [00:00<00:00, 1.40MB/s][A

Downloading:   0%|                                                        | 0.00/438M [00:00<?, ?B/s][A
Downloading:   1%|▋                                              | 6.38M/438M [00:00<00:06, 63.8MB/s][A
Downloading:   4%|█▊                                

all-mpnet-base-v2
(36765, 768)


 67%|████████████████████████████████████████████                      | 2/3 [00:59<00:26, 26.44s/it]

all-MiniLM-L6-v2
(36765, 384)



Downloading: 100%|███████████████████████████████████████████████████| 737/737 [00:00<00:00, 608kB/s][A

Downloading: 100%|███████████████████████████████████████████████████| 190/190 [00:00<00:00, 214kB/s][A

Downloading: 100%|██████████████████████████████████████████████| 10.3k/10.3k [00:00<00:00, 10.7MB/s][A

Downloading: 100%|███████████████████████████████████████████████████| 653/653 [00:00<00:00, 925kB/s][A

Downloading: 100%|███████████████████████████████████████████████████| 116/116 [00:00<00:00, 106kB/s][A

Downloading: 100%|███████████████████████████████████████████████| 15.7k/15.7k [00:00<00:00, 625kB/s][A

Downloading:   0%|                                                        | 0.00/456k [00:00<?, ?B/s][A
Downloading: 100%|████████████████████████████████████████████████| 456k/456k [00:00<00:00, 3.47MB/s][A

Downloading:   0%|                                                        | 0.00/329M [00:00<?, ?B/s][A
Downloading:   0%|▏                            

all-distilroberta-v1
(36765, 768)





In [13]:
# Stack the per-checkpoint matrices side by side: rows stay aligned with
# `train`, columns are the concatenated embedding dimensions.
embeddings = np.concatenate(all_embeddings, axis=-1)
embeddings.shape

(36765, 1920)

In [14]:
import gc

# Number of target classes (Ineffective / Adequate / Effective).
num_labels = 3


def fit_gpu_svc(TRAIN, kfoldcol='fold'):
    """Return out-of-fold class probabilities from an RBF-kernel cuML SVC.

    This definition had been left commented out even though the notebook
    calls `fit_gpu_svc` further down, so a fresh "Restart & Run All" failed
    with NameError. It is restored here with the original hyper-parameters.

    Parameters
    ----------
    TRAIN : array
        Feature matrix indexed positionally by boolean masks; its rows must
        be in the same order as the rows of the global `train` DataFrame.
    kfoldcol : str, default 'fold'
        Column of `train` holding the fold index of each row. Folds are
        assumed to be numbered 0..max.

    Returns
    -------
    numpy.ndarray of shape (len(train), num_labels)
        For every row, the probabilities predicted by the model that was
        trained on all folds except the row's own fold.
    """
    # Imported lazily so that merely defining the function does not require
    # a RAPIDS/GPU runtime; the dependency is only needed when it is called.
    from cuml.svm import SVC
    from tqdm.auto import tqdm

    # Plain numpy views so the boolean masks index TRAIN positionally.
    fold_ids = train[kfoldcol].to_numpy()
    targets = train['target'].to_numpy()

    ypredtrain_ = np.zeros((train.shape[0], num_labels))

    for fold in tqdm(range(fold_ids.max() + 1)):
        ind_train = fold_ids != fold
        ind_valid = fold_ids == fold

        model = SVC(C=16.0, kernel='rbf', degree=3, max_iter=4000, output_type='numpy', probability=True)
        model.fit(TRAIN[ind_train], targets[ind_train])

        ypredtrain_[ind_valid] = model.predict_proba(TRAIN[ind_valid])

        # Drop the fitted model before the next fold to release its memory.
        del model
        gc.collect()

    return ypredtrain_

In [18]:
from sklearn.preprocessing import OneHotEncoder

# Single-column 2-D array of discourse types, the layout the encoder expects.
X = train['discourse_type'].values.reshape(-1, 1)

# Fit the one-hot encoder; categories unseen at fit time are encoded as all
# zeros at transform time instead of raising.
enc = OneHotEncoder(handle_unknown='ignore').fit(X)
enc.categories_

[array(['Claim', 'Concluding Statement', 'Counterclaim', 'Evidence',
        'Lead', 'Position', 'Rebuttal'], dtype=object)]

In [19]:
# Dense one-hot matrix of the discourse type. Despite the name, these are
# input features, not the prediction targets.
labels = enc.transform(train[['discourse_type']].values).toarray()

In [20]:
# Final design matrix: one-hot discourse type columns first, then the
# concatenated sentence embeddings.
features = np.concatenate((labels, embeddings), axis=1)
features.shape, labels.shape, embeddings.shape

((36765, 1927), (36765, 7), (36765, 1920))

In [21]:
import pickle

# Serialize the combined feature matrix to features.pkl in the working directory.
with open('features.pkl', mode='wb') as fout:
    pickle.dump(features, fout)

In [55]:
from sklearn.metrics import log_loss

# Out-of-fold evaluation: fit one SVC per fold on the combined features and
# score the stitched-together held-out probabilities with multi-class log loss.
TRAIN = features
ypredtrain = fit_gpu_svc(TRAIN, kfoldcol='fold')

oof_log_loss = log_loss(train['target'], ypredtrain)
print(oof_log_loss)

  0%|          | 0/5 [00:00<?, ?it/s]

0.7944317400865349


In [66]:
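# Is this better than a naive baseline? Yes: predicting the class priors for every row (below) gives a log loss of about 0.974, versus about 0.794 for the SVC.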

In [61]:
# Relative frequency of each effectiveness label, i.e. the empirical class priors.
train['discourse_effectiveness'].value_counts() / train.shape[0]

Adequate       0.570570
Effective      0.253665
Ineffective    0.175765
Name: discourse_effectiveness, dtype: float64

In [62]:
# Naive baseline: predict the empirical class priors for every row.
# The priors are computed from `train` (ordered by target id: 0=Ineffective,
# 1=Adequate, 2=Effective) rather than pasted as rounded constants from the
# value_counts output, so the baseline stays correct if the data or a
# subsample changes. `minlength` keeps three columns even if a class is absent.
class_priors = np.bincount(train['target'].to_numpy(), minlength=len(label2id)) / len(train)

# Repeat the prior vector once per row -> shape (len(train), n_classes).
baseline = np.tile(class_priors, (len(train), 1))

In [63]:
# Sanity check: expect one row per training example and one column per class.
baseline.shape

(36765, 3)

In [65]:
# Log loss of the constant class-prior predictions, for comparison with the SVC score.
baseline_log_loss = log_loss(train['target'], baseline)
print(baseline_log_loss)

0.9737069926259393


In [None]:
# NOTE(review): unexecuted scratch cell. It rebinds `model` (a SentenceTransformer
# instance in earlier cells) to a plain checkpoint-name string and nothing visible
# here uses it afterwards. Presumably the start of a follow-up experiment; confirm
# or delete.
model = 'sentence-transformers/all-mpnet-base-v2'