In [None]:
! pip install textstat transformers category_encoders

In [None]:
import os, random, textstat, spacy, re, string, itertools
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from tqdm.notebook import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import category_encoders as ce

import seaborn as sns
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and newer
# releases no longer ship the old names, so pick whichever variant is installed.
_whitegrid_style = "seaborn-v0_8-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-whitegrid"
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [None]:
# Fetch the file with this ID via the gdown command-line tool
# (presumably the dataset CSV read from ROOT_DIR below — TODO confirm).
! gdown "1o_QH7OB56BBgPFFJLYMBzkbAwKeNj9XC"

In [None]:
# Directory that holds the dataset files read by the cells below
ROOT_DIR = "/content"

# 1. Exploratory Data Analysis

## 1(a) Profile the dataset

In [None]:
# Load the dataset for exploration. "text.csv" is the same file the
# preprocessing section reads from ROOT_DIR; an empty file name would make
# read_csv try to open the directory itself.
df = pd.read_csv(os.path.join(ROOT_DIR, "text.csv"))

display(df.head())

In [None]:
# Report the (rows, columns) shape of the loaded DataFrame
print("df shape:", df.shape)

In [None]:
# Print the column summary (dtypes, non-null counts) for the loaded DataFrame
df.info()

## 1(b) Look at the target variable

In [None]:
# Score columns to profile: the same six columns the evaluation cells at the
# end of the notebook read from the data.
targets = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Two rows of panels; round the column count up so an odd number of targets
# still fits and the grid never has zero columns.
n_cols = max(1, (len(targets) + 1) // 2)
fig, axes = plt.subplots(2, n_cols, figsize=(15,6))

for ax, target in zip(axes.flat, targets):
    sns.histplot(x=target, data=df, linewidth=1.25, alpha=1, ax=ax, zorder=2)
    ax.set_title(target)
    ax.set(xlabel=None, ylabel=None)

fig.suptitle('Output Variables')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the target columns, drawn as an annotated heatmap
corr = df[targets].corr()
fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    square=True,
    annot=True,
    cmap="vlag",
    fmt=".2f",
    ax=ax,
)
plt.show()

## 1(c) Text Analysis
- look at word and document statistics, especially as they relate to the target variables
- look at common words

In [None]:
# Count whitespace-separated tokens per text and plot the distribution
fig = plt.figure(figsize=(16, 5))
df['num_words'] = df['full_text'].map(lambda text: len(text.split()))
sns.histplot(data=df, x="num_words").set_title("Number of Words")
plt.show()

In [None]:
# Mean length of the whitespace-separated tokens in each text, and its distribution
fig = plt.figure(figsize=(16, 5))
df['mean_word_len'] = df['full_text'].str.split().map(
    lambda tokens: np.mean([len(token) for token in tokens])
)
sns.histplot(data=df, x="mean_word_len").set_title("Average Word Length")
plt.show()

In [None]:
# textstat's text_standard score per text (as a float), and its distribution
fig = plt.figure(figsize=(16, 5))
df['grade_level'] = df['full_text'].map(
    lambda text: textstat.text_standard(text, float_output=True)
)
sns.histplot(data=df, x="grade_level").set_title("Grade Levels")
plt.show()

In [None]:
# textstat's flesch_reading_ease score per text, and its distribution
fig = plt.figure(figsize=(16, 5))
df['reading_ease'] = df['full_text'].map(textstat.flesch_reading_ease)
sns.histplot(data=df, x="reading_ease").set_title("Reading Ease")
plt.show()

In [None]:
# Show the first text whose `reading_ease` equals the column minimum
print(df.loc[df['reading_ease'].eq(df['reading_ease'].min()), 'full_text'].iloc[0])

In [None]:
# Show the first text whose `reading_ease` equals the column maximum
print(df.loc[df['reading_ease'].eq(df['reading_ease'].max()), 'full_text'].iloc[0])

In [None]:
# Show the first text whose `grade_level` equals the column maximum
print(df.loc[df['grade_level'].eq(df['grade_level'].max()), 'full_text'].iloc[0])

In [None]:
# Show the first text whose `grade_level` equals the column minimum
print(df.loc[df['grade_level'].eq(df['grade_level'].min()), 'full_text'].iloc[0])

In [None]:
fig = plt.figure(figsize=(15, 5))

stop_words = set(stopwords.words('english'))
# Join the texts with a space: joining on '' would fuse the last word of one
# text with the first word of the next and distort the counts.
corpus = ' '.join(df.full_text).split()

# Tally only the tokens that are stopwords and keep the 15 most frequent
dic = Counter(word for word in corpus if word in stop_words)
top_words = dic.most_common(15)
x, y = zip(*top_words)
plt.bar(x, y)
plt.title('Common Stopwords')
plt.show()

In [None]:
fig = plt.figure(figsize=(15, 5))

counter = Counter(corpus)
words = counter.most_common()

# Keep the 15 most frequent non-stopwords, the same count as the stopword
# chart (the previous `i > 15` check let a 16th word through).
top_content = list(
    itertools.islice(((word, count) for word, count in words if word not in stop_words), 15)
)
x = [word for word, _ in top_content]
y = [count for _, count in top_content]
plt.bar(x, y)
plt.title('Common Words')
plt.show()

In [None]:
special_characters = '''@#$%^*()-+_=<>'''
# Flag each text that contains at least one of the characters listed above.
# (No figure is created here: this cell only reports counts, so the empty
# figure the cell used to open was never drawn on.)
df['special_chars'] = df['full_text'].apply(lambda x: any(c in special_characters for c in x))
# Counts of texts without / with special characters
np.unique(df['special_chars'], return_counts=True)

In [None]:
# Show the first text that was flagged as containing a special character
print(df.loc[df['special_chars'], 'full_text'].iloc[0])

In [None]:
# Correlation of the targets with the derived text statistics, as an annotated heatmap
text_stats = ['grade_level', 'reading_ease', 'mean_word_len', 'num_words']
corr = df[targets + text_stats].corr()
fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    square=True,
    annot=True,
    cmap="vlag",
    fmt=".2f",
    ax=ax,
)
plt.show()

Explore possible topic labels for the texts using zero-shot classification.

In [None]:
# Zero-shot classifier. Run on the first GPU when CUDA is available and fall
# back to CPU (device=-1) otherwise, instead of assuming a GPU is present.
zsc = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:

# TODO: fill in the topic names the zero-shot classifier should choose between;
# the list is passed to `zsc` in the next cell (presumably it must be non-empty — confirm).
candidate_labels = []

In [None]:
# Classify every text against the candidate labels. The sequences argument was
# missing (a syntax error); `full_text` is the text column used throughout, and
# one result per row is what the later `df["zsc_label"]` assignment needs.
results = zsc(df['full_text'].tolist(), candidate_labels, batch_size=22)

In [None]:
# For each result keep the label whose score is highest
zsc_labels = [
    res['labels'][np.argmax(res['scores'])]
    for res in results
]

In [None]:

# Attach the chosen zero-shot label to each row (zsc_labels must have one entry per row of df)
df["zsc_label"] = zsc_labels

In [None]:
# Distribution of the zero-shot labels assigned to the texts
fig = plt.figure(figsize=(15, 5))
sns.histplot(data=df, x="zsc_label").set_title("Zero-shot Topics")
plt.show()

# 2. Import and Preprocess Data

In [None]:
def preprocess_data(df, cat_features=None, test_size=0.2, random_state=None):
    """Cast categorical columns and split the frame into train and test parts.

    Args:
        df: DataFrame to split. Columns named in ``cat_features`` are converted
            in place on this frame, as before.
        cat_features: Optional iterable of column names to cast to the
            ``category`` dtype. Each gets an extra ``"None"`` category (if not
            already present) to stand in for missing values.
        test_size: Share of rows placed in the test part (default 0.2, the
            value that used to be hard-coded).
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced; ``None`` keeps the previous unseeded split.

    Returns:
        Tuple ``(train, test)`` as returned by ``train_test_split``.
    """
    # Specify categorical variables
    if cat_features:
        for name in cat_features:
            column = df[name].astype("category")
            # Add a None category for missing values. add_categories returns a
            # new Series (its `inplace` option was removed in pandas 2.0), so
            # the result has to be assigned back.
            if "None" not in column.cat.categories:
                column = column.cat.add_categories("None")
            df[name] = column

    # create test data set
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)

    return train, test

In [None]:
# Re-read the dataset, this time indexed by the `text_id` column (this rebinds
# `df`, replacing the frame used during exploration).
df = pd.read_csv(os.path.join(ROOT_DIR, "text.csv"), index_col="text_id")
# No categorical columns are declared, so this call only performs the train/test split.
train, test = preprocess_data(df, cat_features=[])

# 3. Set up Configuration

In [None]:
# Settings shared by the tokenizer, dataset, model and training cells
config = dict(
    model='distilbert-base-uncased',
    num_classes=54,
    dropout=0.,
    batch_size=6,
    epochs=2,
    lr=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
    # Prefer the GPU when one is visible to torch
    device='cuda' if torch.cuda.is_available() else 'cpu',
    tokenizer=dict(
        max_length=512,
        truncation=True,
    ),
)

# 4. Create Datasets and Dataloaders
- set the classes for the dataset module
- define the tokenizer

In [None]:
class TextDataset:
    """Map-style dataset that tokenizes the ``full_text`` column of a DataFrame.

    Each item is a dict holding ``input_ids`` and ``attention_mask`` tensors
    padded to ``config['tokenizer']['max_length']``. Unless ``is_test`` is set,
    a ``labels`` tensor built from the ``classes`` columns of the row is added.

    Args:
        df: DataFrame with a ``full_text`` column (plus the ``classes`` columns
            when labels are needed). Its index is reset, so items are addressed
            by position.
        config: Dict with a ``tokenizer`` sub-dict providing ``max_length`` and
            ``truncation``.
        classes: Column labels holding the targets of one row.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        is_test: When true, items are returned without ``labels``.
    """

    def __init__(self, df, config, classes=None, tokenizer=None, is_test=False):
        self.df = df.reset_index(drop=True)
        self.classes = classes
        self.max_len = config['tokenizer']['max_length']
        self.truncation = config['tokenizer']['truncation']
        self.tokenizer = tokenizer
        self.is_test = is_test

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx`` as a dict of tensors."""
        sample = self.df['full_text'][idx]
        # Use the tokenizer handed to this instance; the previous code read a
        # notebook-level `tokenizer` global and ignored the constructor argument.
        tokenized = self.tokenizer.encode_plus(sample,
                                               None,
                                               add_special_tokens=True,
                                               max_length=self.max_len,
                                               truncation=self.truncation,
                                               padding='max_length'
                                              )
        inputs = {
            "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
            #"token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
        }

        if self.is_test:
            return inputs

        label = self.df.loc[idx, self.classes].to_list()

        # TODO: check this for correct output — labels are stored as integers
        # here (presumably 0/1 indicators; confirm against the label columns).
        inputs["labels"] = torch.tensor(label, dtype=torch.long)

        return inputs

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

In [None]:
# Load the pretrained tokenizer that matches the checkpoint named in config['model']
tokenizer = AutoTokenizer.from_pretrained(config['model'])

In [None]:
# NOTE(review): `y_train` is not defined in any cell visible above, so this line
# raises NameError on a fresh kernel. Define the label columns first — TODO confirm
# which columns hold the config['num_classes'] targets.
train_ds = TextDataset(train, config, classes= y_train.columns, tokenizer=tokenizer)
# The held-out split is wrapped with is_test=True, so its items carry no "labels".
test_ds = TextDataset(test, config, tokenizer=tokenizer, is_test=True)

# 5. Define the Model

In [None]:
# Load the pretrained checkpoint with a classification head sized to config["num_classes"] outputs
model = AutoModelForSequenceClassification.from_pretrained(config["model"], num_labels=config["num_classes"])

# 6. Model Training

https://huggingface.co/transformers/v3.2.0/custom_datasets.html
https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb#scrollTo=I4R39UTxNKTk

Note, if you are not using multi-label classification, you can just use `Trainer` as is (i.e. no custom loss function)

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy on the logits (multi-label setup)."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on a batch and score it with ``BCEWithLogitsLoss``.

        Args:
            model: Model called with ``input_ids`` and ``attention_mask``.
            inputs: Batch dict providing ``input_ids``, ``attention_mask`` and
                ``labels``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted because recent ``transformers``
                releases pass it to ``compute_loss``; without it the override
                fails with a TypeError there. It is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is set.
        """
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            #token_type_ids=inputs['token_type_ids']
        )
        # BCEWithLogitsLoss needs float targets, hence the cast of the labels
        loss = nn.BCEWithLogitsLoss()(outputs['logits'],
                                          inputs['labels'].float())
        return (loss, outputs) if return_outputs else loss

In [None]:
# Training hyper-parameters, all taken from `config`
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=config['epochs'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'],
    # Pass the configured learning rate; it was defined in `config` but never
    # forwarded, so the library default was silently used instead.
    learning_rate=config['lr'],
    warmup_steps=config['warmup_steps'],
    weight_decay=config['weight_decay'],
)

# NOTE(review): test_ds is built with is_test=True, so its items have no
# "labels"; CustomTrainer.compute_loss reads inputs['labels'], so running an
# evaluation pass on this eval_dataset would fail.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds
)

In [None]:
# Fine-tune the model on train_ds with the arguments configured above
trainer.train()

In [None]:
# Persist the fine-tuned model to ./trained_text_model (reloaded in the next section)
trainer.save_model("trained_text_model")

# 7. Check Predictions

In [None]:
# loading the model you previously trained
model = AutoModelForSequenceClassification.from_pretrained("./trained_text_model")

# arguments for Trainer
test_args = TrainingArguments(
    output_dir = ".",
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = config["batch_size"],   
    dataloader_drop_last = False    
)

# init trainer
trainer = Trainer(
              model = model, 
              args = test_args
              )

test_results = trainer.predict(test_ds)

In [None]:
# Class index -> score: index k stands for 1.0 + 0.5 * k, i.e. 1.0 through 5.0
mapping = {idx: 1.0 + 0.5 * idx for idx in range(9)}

# View each sample's 54 outputs as 6 groups of 9 and keep the score of the
# highest-valued position in every group
preds = [
    [mapping[best] for best in np.argmax(logits.reshape(6, 9), axis=1)]
    for logits in test_results[0]
]

In [None]:
# True values of the six score columns for the held-out rows, as a 2-D array
labels = test[['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']].values

In [None]:
# Distribution of the per-sample squared error summed over the six scores
sns.displot(np.square(np.asarray(preds) - labels).sum(axis=1))

In [None]:
targets = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
# Keep the predictions numeric: a categorical column is drawn on a categorical
# axis and would not line up with the numeric scores plotted from `test`.
preds_df = pd.DataFrame(preds, columns = targets)

# One bin per half-point score from 1.0 to 5.0 (the values in `mapping`),
# shared by both histograms so the bars are directly comparable.
# Assumes the actual scores lie on the same 1.0-5.0 scale — TODO confirm.
score_bins = np.arange(0.75, 5.5, 0.5)

fig, axes = plt.subplots(2, len(targets)//2, figsize=(15,6))

for ax, target in zip(axes.flat, targets):
    sns.histplot(x=target, data=test, bins=score_bins, alpha=0.3, ax=ax, color="blue", label="actual")
    sns.histplot(x=target, data=preds_df, bins=score_bins, alpha=0.3, ax=ax, color="orange", label="predicted")
    ax.set_title(target)
    ax.set(xlabel=None, ylabel=None)
    # Tell the two overlaid distributions apart
    ax.legend()

fig.suptitle('Output Variables')
plt.tight_layout()
plt.show()