# Setup
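Click the file icon on the left, then the upload-file icon in the panel, and upload `implicit_hate_train.csv`, `implicit_hate_dev.csv`, and `implicit_hate_test.csv`. Note that the loading cells below read these three files from the Google Drive directory stored in `folder`, so either place the files in that directory or point `folder` at the location you uploaded them to.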

# Data loading

In [None]:
# Mount Google Drive inside the Colab runtime so files stored on Drive are
# reachable under the mount point.
from google.colab import drive

drive_mount_point = '/content/drive/'
drive.mount(drive_mount_point)

Mounted at /content/drive/


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Drive directory that holds the three implicit-hate CSV splits.
# Keep the trailing slash: the loading cell appends the file names to this
# string directly, without inserting a path separator.
folder = '/content/drive/MyDrive/CIS530/CIS530Project/'

In [None]:
# Read the train / dev / test splits from `folder` and print a short reminder
# of the columns used for the coarse (major-class) task.
major_class_train_data = pd.read_csv(f"{folder}implicit_hate_train.csv")
major_class_dev_data = pd.read_csv(f"{folder}implicit_hate_dev.csv")
major_class_test_data = pd.read_csv(f"{folder}implicit_hate_test.csv")

for schema_line in (
    "major_class relevant keys: `post`, `class`",
    "  post: the tweet (str)",
    "  class: takes the values `explicit_hate`, `implicit_hate`, or `not_hate` (str)",
):
    print(schema_line)

major_class relevant keys: `post`, `class`
  post: the tweet (str)
  class: takes the values `explicit_hate`, `implicit_hate`, or `not_hate` (str)


In [None]:
# Fine-grained (minor-class) subsets: keep only the rows that carry an
# `implicit_class` label and renumber them from zero.
minor_class_train_data = major_class_train_data.dropna(subset=['implicit_class']).reset_index(drop=True)
minor_class_dev_data = major_class_dev_data.dropna(subset=['implicit_class']).reset_index(drop=True)
minor_class_test_data = major_class_test_data.dropna(subset=['implicit_class']).reset_index(drop=True)

# Short reminder of the columns used for the fine-grained task.
print(
    "minor_class relevant keys: `post`, `implicit_class`",
    "post: the tweet (str)",
    "implicit_class: takes the values `incitement`, `inferiority`, `irony`, `stereotypical`, `threatening`, or `white_grievance` (str)",
    sep="\n",
)

minor_class relevant keys: `post`, `implicit_class`
post: the tweet (str)
implicit_class: takes the values `incitement`, `inferiority`, `irony`, `stereotypical`, `threatening`, or `white_grievance` (str)


In [None]:
# Peek at the fine-grained label column of the training split.
major_class_train_data.loc[:, 'implicit_class']

0        stereotypical
1        stereotypical
2        stereotypical
3        stereotypical
4        stereotypical
             ...      
16431              NaN
16432              NaN
16433              NaN
16434              NaN
16435              NaN
Name: implicit_class, Length: 16436, dtype: object

In [None]:
# Preview the first five rows of the major-class training split.
major_class_train_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,post,class,implicit_class
0,8146,they do and you can check them the white on wh...,implicit_hate,stereotypical
1,11071,""" hes a very good boy "" "" mt : seriously wt...",implicit_hate,stereotypical
2,18416,"rt : "" breadth of the benefits the family wa...",implicit_hate,stereotypical
3,148,"rt : how #jews feel about #whitepeople - "" t...",implicit_hate,stereotypical
4,11302,at least 60 white south africans were murder b...,implicit_hate,stereotypical


In [None]:
# Report (rows, columns) for the train, dev and test splits, in that order.
for major_split in (major_class_train_data, major_class_dev_data, major_class_test_data):
    print(major_split.shape)

(16436, 4)
(2055, 4)
(2989, 5)


In [None]:
# Preview the first five rows of the minor-class training subset.
minor_class_train_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,post,class,implicit_class
0,8146,they do and you can check them the white on wh...,implicit_hate,stereotypical
1,11071,""" hes a very good boy "" "" mt : seriously wt...",implicit_hate,stereotypical
2,18416,"rt : "" breadth of the benefits the family wa...",implicit_hate,stereotypical
3,148,"rt : how #jews feel about #whitepeople - "" t...",implicit_hate,stereotypical
4,11302,at least 60 white south africans were murder b...,implicit_hate,stereotypical


In [None]:
# Report (rows, columns) for the minor-class train, dev and test subsets, in that order.
for minor_split in (minor_class_train_data, minor_class_dev_data, minor_class_test_data):
    print(minor_split.shape)

(4937, 4)
(618, 4)
(698, 5)


In [None]:
from sklearn.metrics import f1_score
def evaluation(prediction, label):
    """Print and return the macro-averaged F1 score of `prediction` against `label`.

    Args:
        prediction: predicted class labels, forwarded to `f1_score` as `y_pred`.
        label: gold class labels, forwarded to `f1_score` as `y_true`.

    Returns:
        The macro-averaged F1 score computed by `sklearn.metrics.f1_score`.
    """
    f1 = f1_score(label, prediction, average='macro')
    print(f1)
    # Hand the score back as well so callers can store or compare it instead
    # of only seeing it printed.
    return f1

# ALBERT Classifier
Reference notebook: https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb

In [None]:
!pip install sentencepiece
!pip install transformers==3.1.0
!pip install simpletransformers==0.48.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 32.8 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==3.1.0
  Downloading transformers-3.1.0-py3-none-any.whl (884 kB)
[K     |████████████████████████████████| 884 kB 17.2 MB/s 
Collecting tokenizers==0.8.1.rc2
  Downloading tokenizers-0.8.1rc2-cp38-cp38-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 46.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 54.1 MB/s 
Building wheels for collected packages: sacremoses
  Building w

In [None]:
# Major-class label name -> integer id.
major_label_2_idx = {
    'not_hate': 0,
    'implicit_hate': 1,
    'explicit_hate': 2,
}
# Inverse lookup (integer id -> label name), derived from the forward map.
major_idx_2_label = {idx: name for name, idx in major_label_2_idx.items()}
# Element-wise encoder for array-likes of label names; an unknown name raises KeyError.
vector_major2idx = np.vectorize(lambda label_name: major_label_2_idx[label_name])

In [None]:
# Discard test rows that have no major-class label (index labels are kept as-is).
major_class_test_data = major_class_test_data.dropna(subset=['class'])

In [None]:
# Preview the first rows of the test split rather than dumping the whole frame,
# matching the `.head()` previews used for the training data.
major_class_test_data.head()

Unnamed: 0.1,Unnamed: 0,post,class,implicit_class,class_x
0,21358,One of my good friends from high school is bla...,1,stereotypical,
1,10769,religion of peace wants to kill all the jews ...,1,stereotypical,
2,11846,we also have these people speaking out on the ...,1,stereotypical,
3,16656,. 6219 408 ? 5895 116 ? what's the actual fi...,1,stereotypical,
4,16195,no amnesty ! ! many will be able to apply for ...,1,stereotypical,
...,...,...,...,...,...
2053,12111,1850 white vs slave ... white live longer ... ...,0,,
2054,4180,is a open white nationalists website sir ...,0,,
2055,14950,another way wall street big business democrats...,0,,
2056,9474,because bannon thinks u r only american if u r...,0,,


In [None]:
def _encode_major_class(labels):
    """Return `labels` with major-class names replaced by their integer ids.

    Values that are already ids (keys of `major_idx_2_label`) pass through
    unchanged, so re-running this cell does not raise. Any other value
    (an unknown name, a missing value) still raises KeyError from the
    `major_label_2_idx` lookup.
    """
    return labels.map(
        lambda label: label if label in major_idx_2_label else major_label_2_idx[label]
    )


# Encode the `class` column of every split as integer ids.
major_class_train_data['class'] = _encode_major_class(major_class_train_data['class'])
major_class_dev_data['class'] = _encode_major_class(major_class_dev_data['class'])
major_class_test_data['class'] = _encode_major_class(major_class_test_data['class'])

KeyError: ignored

In [None]:
# Encode the test split's `class` column as integer ids. Values that are already
# ids (keys of `major_idx_2_label`) are left untouched, so this cell is a no-op
# instead of a KeyError when the column has already been encoded; unknown
# labels still raise KeyError.
major_class_test_data['class'] = major_class_test_data['class'].map(
    lambda label: label if label in major_idx_2_label else major_label_2_idx[label]
)

In [None]:
# Rebuild the minor-class subsets from the current major-class frames: keep the
# rows whose `implicit_class` is present and renumber them from zero.
# NOTE(review): presumably repeated here so the subsets reflect the changes made
# to the major-class frames since they were first built -- confirm.
minor_class_train_data = major_class_train_data.loc[
    major_class_train_data['implicit_class'].notna()
].reset_index(drop=True)
minor_class_dev_data = major_class_dev_data.loc[
    major_class_dev_data['implicit_class'].notna()
].reset_index(drop=True)
minor_class_test_data = major_class_test_data.loc[
    major_class_test_data['implicit_class'].notna()
].reset_index(drop=True)

In [None]:
from simpletransformers.classification import ClassificationModel
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()

# load the dataset
# Select the text and label columns by name (not by position, which breaks as
# soon as the CSV layout changes) and take an explicit copy so the label
# assignment below writes to an independent frame rather than to a slice of
# `major_class_train_data`.
data = major_class_train_data[['post', 'class']].copy()
data["class"] = labelencoder.fit_transform(data["class"])

# Number of distinct major classes seen in the training labels.
total_classes = len(labelencoder.classes_)

# Create a ClassificationModel
model = ClassificationModel(
    'albert',
    'albert-base-v2',
    num_labels=total_classes,
    use_cuda=True,
    args={'learning_rate':1e-5,'save_model_every_epoch': False,'num_train_epochs': 2, 'reprocess_input_data': True, 'overwrite_output_dir': True}
)

# Train the model
# `data` is a two-column frame: text first, integer label second.
model.train_model(data, output_dir=None)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

  0%|          | 0/16436 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2055 [00:00<?, ?it/s]



Running Epoch 1 of 2:   0%|          | 0/2055 [00:00<?, ?it/s]

In [None]:
# load the dataset
test_major = major_class_test_data.iloc[:,1:2]
prediction = model.predict(test_major.post)

  0%|          | 0/2058 [00:00<?, ?it/s]

  0%|          | 0/258 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import f1_score
# The value is the macro-averaged F1 on the test split (not accuracy, and not
# the dev split), so the message says exactly that.
# `prediction[0]` is the first element of what `model.predict` returned.
print('macro F1 on test set:', f1_score(major_class_test_data['class'], prediction[0], average='macro'))

accuracy on dev set: 0.23708723208178065


In [None]:
labelencoder = LabelEncoder()

# load the dataset
# Take an explicit copy of the two needed columns so the label assignment below
# writes to an independent frame and does not trigger SettingWithCopyWarning.
data = minor_class_train_data[['post', 'implicit_class']].copy()
data["implicit_class"] = labelencoder.fit_transform(data["implicit_class"])

# Number of distinct fine-grained classes seen in the training labels.
total_classes = len(labelencoder.classes_)

# Create a ClassificationModel
# NOTE: this rebinds `model`, replacing the major-class model trained earlier.
model = ClassificationModel(
    'albert',
    'albert-base-v2',
    num_labels=total_classes,
    use_cuda=True,
    args={'learning_rate':1e-5,'save_model_every_epoch': False,'num_train_epochs': 2, 'reprocess_input_data': True, 'overwrite_output_dir': True}
)

# Train the model
# `data` is a two-column frame: text first, integer label second.
model.train_model(data, output_dir=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["implicit_class"] = labelencoder.fit_transform(data["implicit_class"])
Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkp

  0%|          | 0/4937 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/618 [00:00<?, ?it/s]



Running Epoch 1 of 2:   0%|          | 0/618 [00:00<?, ?it/s]

In [None]:
# load the dataset
test_minor = minor_class_test_data[['post','implicit_class']]
prediction = model.predict(test_minor.post)

  0%|          | 0/698 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

In [None]:
# Encode the gold test labels with the encoder that was fitted on the training
# labels, so the integer ids match the ids the model was trained on. Re-fitting
# on the test split (`fit_transform`) would silently renumber the classes
# whenever the test split does not contain exactly the same set of labels.
# `transform` raises ValueError on a label that was not seen during fitting.
# The dtype guard keeps the cell safe to re-run once the column is already encoded.
if not pd.api.types.is_integer_dtype(minor_class_test_data["implicit_class"]):
    minor_class_test_data["implicit_class"] = labelencoder.transform(minor_class_test_data["implicit_class"])

In [None]:
# The value is the macro-averaged F1 on the minor-class test subset (not
# accuracy, and not the dev split), so the message says exactly that.
print('macro F1 on test set:', f1_score(minor_class_test_data['implicit_class'], prediction[0], average='macro'))

accuracy on dev set: 0.2352346580672434


# BertForSequenceClassification
https://huggingface.co/docs/transformers/v4.25.1/en/model_doc/bert#transformers.BertForSequenceClassification