# Linking Humor and Offense Across Different Age Groups

# Notka odnośnie rozpoznawania zdań niezabawnych
dane wejściowe:

    Is the intention of this text to be humorous? (0 or 1)
    [If it is intended to be humorous] How humorous do you find it? (1-5)
=> dla zdań nieśmiesznych nie mamy podanej wartości humor_rating

np.

```
id	                                                            text	is_humor	humor_rating	humor_controversy	offense_rating
7936	My girlfriend dumped me on a fishing trip. She left me reeling.	1	2.25	0.0	0.0
4609	"Even though I've tried, spaces between us, hold all our secrets, leaving us speechless.' - Spaces.	0    0.0
2940	Ok. Who needs a hug? Anyone? I'm giving some away rn for free!	0  0.0

```

wynik dla zdania określonego jako niezabawne:
```
id                                                 text 	pred 	humor_rating 	is_humor
3     There are people out their happier with less t...     0.000607     0.002430     False
4     One zebra says to the other, "I'm going to che...     0.996539     3.986156     True
```



### Przygotowanie środowiska

In [None]:
!pip install transformers
!pip install tensorflow==2.3.1

In [None]:
import os
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import GroupKFold


from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow import keras


from scipy.stats import spearmanr
from math import floor, ceil

import transformers

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Show the installed TensorFlow version; the notebook pins 2.3.1 above.
print(tf.__version__) # >=2.3.1

### Tokenizer

`MAX_SEQUENCE_LENGTH 200`

In [None]:
from transformers import BertTokenizer, BertModel

# NOTE(review): neither constant is referenced anywhere else in the visible
# notebook (tokenization uses MAX_SEQUENCE_LENGTH and model.fit hard-codes
# batch_size=16) — confirm whether they are still needed.
MAX_SIZE = 200
BATCH_SIZE = 500

In [None]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that create_model loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Run configuration shared by the cells below.
HAS_ANS = False  # when True, a different set of input/output columns is selected
training_sample_count = 1000  # upper bound on training rows used per fold
training_epochs = 2  # epochs passed to model.fit
test_count = 1000  # number of leading test rows that are kept
running_folds = 1  # only folds with index < running_folds are trained
MAX_SEQUENCE_LENGTH = 200  # padded token length and model input shape

### Pobranie datasetu

In [None]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then build the PyDrive client from the
# application-default credentials. `drive` is used by download_and_save.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

from pathlib import Path
import os

def download_and_save(file_name, file_id):
    """Fetch a Google Drive file by id and write its bytes to ``file_name``.

    Uses the module-level ``drive`` PyDrive client.

    Args:
        file_name: destination path, opened in binary write mode.
        file_id: Drive file id, passed under the ``'id'`` metadata key.
    """
    remote = drive.CreateFile({'id': file_id})
    # Pull the payload; it is read back from ``remote.content`` below.
    remote.FetchContent()

    with open(file_name, 'wb') as sink:
        sink.write(remote.content.read())

    print(f'Saved {file_name}')
  
# Dictionary keys used by the download cells to describe a Drive file.
FILE_NAME, FILE_ID = 'file_name', 'file_id'

In [None]:
# Download the training CSV from Google Drive into /content.
# https://drive.google.com/file/d/1xMJBvCPvQD8oWjNBAFEgNhQlLvJ7iuEx/view?usp=sharing
file = {FILE_NAME: 'humor_train.csv', FILE_ID: '1xMJBvCPvQD8oWjNBAFEgNhQlLvJ7iuEx'}
dataset_path = Path('/content')  # destination directory
file_path = dataset_path / file[FILE_NAME]
download_and_save(file_path, file[FILE_ID])

In [None]:
# Download the test CSV from Google Drive into /content.
# https://drive.google.com/file/d/18SF4OLtmNvL__BfriC5OJsspQtPJqIVH/view?usp=sharing
file = {FILE_NAME: 'humor_test.csv', FILE_ID: '18SF4OLtmNvL__BfriC5OJsspQtPJqIVH'}
dataset_path = Path('/content')  # destination directory
file_path = dataset_path / file[FILE_NAME]
download_and_save(file_path, file[FILE_ID])

In [None]:
ls

### Formatowanie danych

In [None]:
# Load the training CSV and keep only the text and its binary humor label.
# The relative path assumes the working directory is the download directory.
training_set = pd.read_csv('humor_train.csv')[['text', 'is_humor']]

In [None]:
# Take a real copy: plain assignment would only alias `training_set`, so any
# later in-place change to one name would silently show up under the other.
training_set_copy = training_set.copy()

# Load the test CSV and keep only the text column.
testing_set = pd.read_csv("humor_test.csv")
testing_set = testing_set[['text']]

print("training_set_copy:")
print(training_set_copy.head())
print("\ntesting_set:")
print(testing_set.head())

# Limit the test set to its first `test_count` rows.
testing_set = testing_set[:test_count]

In [None]:
# Choose the model's input and target columns by position in training_set_copy.
if HAS_ANS:
    output_categories = training_set_copy.columns[11:].tolist()
    input_categories = training_set_copy.columns[[1, 2, 5]].tolist()
else:
    output_categories = training_set_copy.columns[[1]].tolist()
    input_categories = training_set_copy.columns[[0]].tolist()

# Number of target columns.
TARGET_COUNT = len(output_categories)

print(HAS_ANS)
print('input categories:', input_categories)
print('output TARGET_COUNT:', TARGET_COUNT)
print('output categories:', output_categories)

#### Przygotowanie warstw

In [None]:
def _convert_to_transformer_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Encode ``title`` and an empty string into fixed-length model inputs.

    Args:
        title: text passed to ``tokenizer.encode_plus`` as the first sequence.
        question: accepted but not used by the body.
        answer: accepted but not used by the body.
        tokenizer: object exposing ``encode_plus`` and ``pad_token_id``.
        max_sequence_length: ``max_length`` given to the tokenizer and the
            length every list is padded up to.

    Returns:
        A list of six lists: ids, mask and segment ids for ``title``,
        followed by the same three for the empty string ``''``.
    """

    def encode_padded(text):
        # Single-sequence encoding: the second sequence slot stays None.
        encoded = tokenizer.encode_plus(
            text, None,
            add_special_tokens=True,
            max_length=max_sequence_length,
            truncation_strategy='longest_first')

        token_ids = encoded["input_ids"]
        segment_ids = encoded["token_type_ids"]

        # A non-positive pad_count produces empty pad lists, i.e. no padding.
        pad_count = max_sequence_length - len(token_ids)
        mask = [1] * len(token_ids) + [0] * pad_count
        padded_segments = segment_ids + [0] * pad_count
        padded_ids = token_ids + [tokenizer.pad_token_id] * pad_count

        return [padded_ids, mask, padded_segments]

    return encode_padded(title) + encode_padded('')



def compute_input_arrays(training_set, columns, tokenizer, max_sequence_length):
    """Tokenize the ``text`` field of every row into six int32 arrays.

    Args:
        training_set: DataFrame; only ``training_set[columns]`` is iterated.
        columns: column selection; each row is read through its ``text``
            attribute, so the selection must contain a ``text`` column.
        tokenizer: forwarded to ``_convert_to_transformer_inputs``.
        max_sequence_length: forwarded to ``_convert_to_transformer_inputs``.

    Returns:
        A list of six ``np.int32`` arrays, in the order produced by
        ``_convert_to_transformer_inputs``.
    """
    feature_columns = [[] for _ in range(6)]

    for _, row in tqdm(training_set[columns].iterrows()):
        text = row.text
        # The same text is supplied for the title, question and answer slots.
        encoded = _convert_to_transformer_inputs(
            text, text, text, tokenizer, max_sequence_length)
        for column, feature in zip(feature_columns, encoded):
            column.append(feature)

    return [np.asarray(column, dtype=np.int32) for column in feature_columns]

def compute_output_arrays(training_set, columns):
    """Return ``training_set[columns]`` converted to a NumPy array."""
    targets = training_set[columns]
    return np.asarray(targets)

In [None]:
# Targets for the training rows.
outputs = compute_output_arrays(
    training_set=training_set_copy,
    columns=output_categories,
)
# Tokenized training texts.
inputs = compute_input_arrays(
    training_set=training_set_copy,
    columns=input_categories,
    tokenizer=tokenizer,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
)
# Tokenized test texts, using the same column selection.
test_inputs = compute_input_arrays(
    training_set=testing_set,
    columns=input_categories,
    tokenizer=tokenizer,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
)

### model



In [None]:
# Display the TensorFlow version; the environment cell pins 2.3.1.
tf.version.VERSION #2.3.1



```
#  https://keras.io/api/layers
```



In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertModel, BertConfig
from transformers import TFAutoModel,TFBertModel

def create_model(input_shape):
    """Build a classifier: 'bert-base-uncased' main layer, dropout, one sigmoid unit.

    Args:
        input_shape: forwarded as ``shape`` to both ``keras.Input`` layers.

    Returns:
        A ``keras.Model`` with two int32 inputs and a single sigmoid output.
    """
    bert = TFBertModel.from_pretrained('bert-base-uncased')
    encoder = bert.layers[0]

    token_ids = keras.Input(shape=input_shape, dtype='int32')
    # Presumably consumed as the attention mask (second positional input of
    # the layer) — confirm against the installed transformers version.
    mask = keras.Input(shape=input_shape, dtype='int32')

    # First element of the layer output, sliced at position 0 of axis 1.
    sequence_output = encoder([token_ids, mask])[0]
    first_position = sequence_output[:, 0, :]

    dropped = layers.Dropout(0.3)(first_position)
    probability = layers.Dense(1, activation="sigmoid")(dropped)

    return keras.Model(inputs=[token_ids, mask], outputs=probability)

## Trening, walidacja, testowanie


In [None]:
def print_evaluation_metrics(y_true, y_pred, label='', is_regression=True, label2=''):
    """Score predictions as a regression or as a binary classification.

    Args:
        y_true: ground-truth values.
        y_pred: predictions; when ``is_regression`` is False these must be
            hard 0/1 labels.
        label: unused; kept so existing call sites keep working.
        is_regression: True returns the mean squared error, False prints
            accuracy/precision/recall/F1 and returns the accuracy.
        label2: unused; kept so existing call sites keep working.

    Returns:
        ``mean_squared_error(y_true, y_pred)`` in regression mode, otherwise
        ``accuracy_score(y_true, y_pred)``.
    """
    # `import sklearn` alone does not guarantee the submodule is loaded.
    import sklearn.metrics

    if is_regression:
        return sklearn.metrics.mean_squared_error(y_true, y_pred)

    def ratio(numerator, denominator):
        # Report 0.0 instead of dividing by zero when a class never occurs.
        return numerator / denominator if denominator else 0.0

    # Pin the label order so the matrix is always 2x2, even when only one
    # class is present in y_true / y_pred.
    matrix = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=[0, 1])
    (tn, fp), (fn, tp) = matrix

    accuracy = ratio(tp + tn, tp + fp + fn + tn)
    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    f1 = ratio(2 * (recall * precision), recall + precision)
    print('Acc', accuracy, 'Prec', precision, 'Rec', recall, 'F1', f1)
    return sklearn.metrics.accuracy_score(y_true, y_pred)

# Quick manual check of both modes: regression (returns the MSE) and
# classification (prints the metrics and returns the accuracy).
print_evaluation_metrics([1,0], [0.9,0.1], '', True)
print_evaluation_metrics([1,0], [1,1], '', False)

### trening

In [None]:
from transformers import TFAutoModel, BertModel

# Best (lowest) validation score seen so far. The score comes from
# print_evaluation_metrics with its default is_regression=True, i.e. it is a
# mean squared error, so lower is better.
min_acc = 1000000
min_test = []
valid_preds = []
test_preds = []
best_model = False
LR= 2e-5 

# create_model declares exactly two inputs, while compute_input_arrays returns
# six arrays whose first two are the ids and mask of the encoded text. Feed
# the model only those two.
model_inputs = inputs[:2]

# Grouping by the text itself keeps identical texts in the same fold.
gkf = GroupKFold(n_splits=5).split(X=training_set_copy.text, groups=training_set_copy.text)

for fold, (train_idx, valid_idx) in enumerate(gkf):
    # Only the first `running_folds` folds are trained.
    if fold >= running_folds:
        break

    # Cap the training rows of this fold at `training_sample_count`.
    train_inputs = [features[train_idx][:training_sample_count] for features in model_inputs]
    train_outputs = outputs[train_idx][:training_sample_count]

    valid_inputs = [features[valid_idx] for features in model_inputs]
    valid_outputs = outputs[valid_idx]

    print(np.array(train_inputs).shape, np.array(train_outputs).shape)

    # Start every fold from a fresh Keras session and a freshly loaded model.
    K.clear_session()
    model = create_model(MAX_SEQUENCE_LENGTH)
    optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    for xx in range(1):
        print(xx)
        model.fit(train_inputs, train_outputs, epochs=training_epochs, batch_size=16, verbose=1)
        valid_preds.append(model.predict(valid_inputs))

        # `acc` keeps its name because later cells read it, but it is an MSE.
        acc = print_evaluation_metrics(np.array(valid_outputs), np.array(valid_preds[-1]), 'on #'+str(xx+1))
        if acc < min_acc:
            print('Validation MSE: ', acc)
            min_acc = acc
            best_model = model
        print(' ')

In [None]:
# Show the architecture of the model selected during training.
best_model.summary()
# NOTE(review): `acc` is the value returned by print_evaluation_metrics with
# its default is_regression=True, i.e. a mean squared error from the last
# evaluation, not an accuracy — the printed label is misleading.
print('Accuracy:', acc)

In [None]:
# The model built by create_model declares two inputs, while test_inputs holds
# six arrays (the first two are the ids and mask of the encoded text). Pass
# only the two arrays the model expects.
min_test = best_model.predict(test_inputs[:2])
print(len(min_test))

## Wyniki

In [None]:
# Preview the first ten raw test predictions.
print(min_test[:10])

In [None]:
result = testing_set.copy()
# Flatten the predictions (the model ends in a single-unit Dense layer) so
# they are assigned as a plain one-dimensional column.
result['pred'] = np.ravel(min_test)
# Scale the probability by 4 to get the rating. Deriving it from the column
# leaves `min_test` untouched, so re-running this cell gives the same result
# instead of multiplying the predictions by 4 again.
result['humor_rating'] = result['pred'] * 4

In [None]:
# Show the test texts with their predicted probability and scaled rating.
print(result)

In [None]:
# The decision threshold is the last value of a 0.1-step grid from 0.1 to 0.80.
# NOTE(review): with a float step the length of np.arange is sensitive to
# rounding, so the last value may sit near the stop value — confirm the
# intended cut-off.
split = np.arange(0.1, 0.80, 0.1).tolist()[-1]
result['is_humor'] = (result['pred'] > split)

# Persist the predictions and display the first rows.
result.to_csv('result_10e.csv', index=False)
result.head()

```
2 e
0 	If you are Asian in the kitchen and African in... 	0.985082 	3.940329 	True
1 	Why is there only a stairway to heaven but a h... 	0.918508 	3.674032 	True
2 	I once dated a girl with a twin People asked m... 	0.998696 	3.994783 	True
3 	There are people out their happier with less t... 	0.058198 	0.232790 	False
4 	One zebra says to the other, "I'm going to che... 	0.951897 	3.807587 	True

```


```
10e
0 	If you are Asian in the kitchen and African in... 	0.999979 	3.999916 	True
1 	Why is there only a stairway to heaven but a h... 	0.993642 	3.974567 	True
2 	I once dated a girl with a twin People asked m... 	0.999971 	3.999885 	True
3 	There are people out their happier with less t... 	0.000607 	0.002430 	False
4 	One zebra says to the other, "I'm going to che... 	0.996539 	3.986156 	True
```

