# Install packages

In [None]:
# Install the Tesseract OCR engine (system package) and the pinned Python packages used below.
!sudo apt install tesseract-ocr
!pip install pytesseract==0.3.8
!pip install transformers==4.11.3

# Imports

In [None]:
from __future__ import print_function, division
import torch
import torchvision
from tqdm.notebook import tqdm
from os.path import exists
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import matplotlib.pyplot as plt
from torchvision import datasets, models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import time
import os
import copy
import torch.nn.functional as F
from torch.autograd import Variable
import torch.onnx
import cv2
import seaborn as sns
import pytesseract
import re
import nltk
import string

from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from collections import defaultdict
from nltk.corpus import stopwords
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
# Point pytesseract at the system Tesseract binary.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
%matplotlib inline
# Mount Google Drive; the dataset directory used below lives under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# Turn on matplotlib interactive mode.
plt.ion() 

# CUDA status

In [None]:
# Select the compute device: 'cuda' when a GPU is visible to torch, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Display the chosen device together with the raw CUDA availability flag.
device, torch.cuda.is_available()

In [None]:
# A newer torch version is not compatible with the GPUs of Colab's video cards
# https://pytorch.org/ 10.1
# Derive a "cuXYZ" tag from the installed cudart shared library (shell pipeline output is captured as a list).
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'
print("\ndevice: ", device, "\nPyTorch Version: ", torch.__version__, "\nTorchvision Version: ", torchvision.__version__, \
    "\nПроверяем, доступны ли GPU: ", torch.cuda.is_available(), "\naccelerator: ", accelerator)
# If torch cannot see a GPU, install a torch build compiled against CUDA 10.1.
# NOTE(review): an already imported torch is presumably not replaced until the runtime is restarted - confirm.
if torch.cuda.is_available() == False:
    !pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

# Check whether a GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("\ndevice: ", device, "\nPyTorch Version: ", torch.__version__, "\nTorchvision Version: ", torchvision.__version__, \
    "\nПроверяем, доступны ли GPU: ", torch.cuda.is_available(), "\naccelerator: ", accelerator)

In [None]:
# Dump the torch build configuration and the CUDA version torch was compiled with.
print(torch.__config__.show()) 
print(torch.version.cuda)
torch.cuda.is_available()

# 0) Work with data

In [None]:
# Root folder of the image dataset on the mounted Drive; get_dataset() reads its 'train' and 'test' subfolders.
data_dir="/content/gdrive/MyDrive/data/dataset"

In [None]:
# Per-split preprocessing pipelines, keyed by the dataset subfolder name.
# Both end with ToTensor + Normalize(0.5, 0.5); imshow() undoes that normalization.
data_transforms = {
    # Training: random scale/aspect crop resized to 540x540 acts as augmentation.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(540),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
    # Evaluation must be deterministic and see the whole page: a random crop here
    # made test metrics vary between runs, so resize to the same 540x540 instead.
    'test': transforms.Compose([
        transforms.Resize((540, 540)),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
}

In [None]:
def get_dataset(data_dir, data_transforms, folders=['train', 'test'], batch_size=4):
    """Build ImageFolder datasets and shuffled DataLoaders for the given splits.

    Args:
        data_dir: root directory containing one subfolder per entry of `folders`.
        data_transforms: mapping from split name to the transform applied to it.
        folders: split names to load; 'train' and 'test' must be among them
            because both loaders are returned.
        batch_size: batch size used for every loader.

    Returns:
        (train_loader, test_loader, class_names_of_train_split, sizes_by_split)
    """
    image_datasets = {}
    dataloaders = {}
    dataset_sizes = {}
    for split in folders:
        split_dataset = datasets.ImageFolder(os.path.join(data_dir, split),
                                             data_transforms[split])
        image_datasets[split] = split_dataset
        dataloaders[split] = torch.utils.data.DataLoader(
            split_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
        dataset_sizes[split] = len(split_dataset)

    classes = image_datasets['train'].classes
    return dataloaders["train"], dataloaders['test'], classes, dataset_sizes

In [None]:
# Build the image loaders and report the class names and the number of images per split.
trainloader, testloader, classes, dataset_sizes=get_dataset(data_dir,data_transforms, folders=['train', 'test'])
print('Classes: ',  classes)
print('The datasest have: ',  dataset_sizes ," images")

In [None]:
def imshow(img):
    """Show a normalized (channels, height, width) image tensor with matplotlib.

    The tensor is first mapped back with `img / 2 + 0.5`, which reverses the
    Normalize(0.5, 0.5) step of the data transforms.
    """
    unnormalized = img / 2+0.5
    height_width_channels = np.transpose(unnormalized.numpy(), (1, 2, 0))
    plt.imshow(height_width_channels)
    plt.show()
# Pull one training batch and show it as an image grid with its class names.
dataiter = iter(trainloader)
images, labels = next(dataiter)
imshow(torchvision.utils.make_grid(images))
# Use the real batch length instead of a hard-coded 4 so any batch size works.
print('|'.join('%10s' % classes[labels[j]] for j in range(len(labels))))

# 1) Image classification (googlenet)


In [None]:
def fit_epoch(model, train_loader, criterion, optimizer):
    """Train `model` for one pass over `train_loader`.

    Args:
        model: network to optimize; batches are moved to the device of its parameters.
        train_loader: iterable of (inputs, labels) batches.
        criterion: loss function called as criterion(outputs, labels).
        optimizer: optimizer stepped once per batch.

    Returns:
        (mean loss per sample, accuracy) over all processed samples.
    """
    # eval_epoch() leaves the model in eval mode; without this call every epoch
    # after the first would train with dropout/batch-norm in inference mode.
    model.train()
    # Follow the model's own device instead of relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0
    running_corrects = 0
    processed_data = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(target_device)
        labels = labels.to(target_device)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        preds = torch.argmax(outputs, 1)
        # loss is a batch mean, so weight it by the batch size before summing.
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)
        processed_data += inputs.size(0)

    train_loss = running_loss / processed_data
    train_acc = running_corrects.cpu().numpy() / processed_data
    return train_loss, train_acc

In [None]:
def eval_epoch(model, val_loader, criterion):
    """Evaluate `model` on `val_loader` without updating weights.

    Every misclassified sample is printed as "<predicted> <actual>".

    Args:
        model: network to evaluate; it is switched to eval mode.
        val_loader: iterable of (inputs, labels) batches.
        criterion: loss function called as criterion(outputs, labels).

    Returns:
        (mean loss per sample, accuracy as a 0-dim double tensor).
    """
    model.eval()
    # Follow the model's own device instead of relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0
    running_corrects = 0
    processed_size = 0

    for inputs, labels in val_loader:
        inputs = inputs.to(target_device)
        labels = labels.to(target_device)

        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            preds = torch.argmax(outputs, 1)

        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)
        # The previous index counter was reset on every batch, so only sample 0
        # of each batch was ever inspected; report all mismatches instead.
        mismatch = preds != labels
        for pred, target in zip(preds[mismatch], labels[mismatch]):
            print(pred, target)
        processed_size += inputs.size(0)

    val_loss = running_loss / processed_size
    val_acc = running_corrects.double() / processed_size
    return val_loss, val_acc

In [None]:
def train(train_loader, val_loader, model, criterion, epochs, batch_size,optimizer, scheduler, sampler = None, shuffle = True):
    """Run `epochs` rounds of fit_epoch + eval_epoch and collect the metrics.

    The scheduler is stepped once per epoch. `batch_size`, `sampler` and
    `shuffle` are accepted but not used by this function.

    Returns:
        List with one (train_loss, train_acc, val_loss, val_acc) tuple per epoch.
    """
    log_template = ("\nEpoch {ep:03d} train_loss: {t_loss:0.4f} "
                    "    val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}")
    history = []

    with tqdm(desc="epoch", total=epochs) as progress:
        for epoch_idx in range(epochs):
            t_loss, t_acc = fit_epoch(model, train_loader, criterion, optimizer)
            print("loss", t_loss)

            v_loss, v_acc = eval_epoch(model, val_loader, criterion)
            epoch_metrics = (t_loss, t_acc, v_loss, v_acc)
            history.append(epoch_metrics)

            scheduler.step()
            progress.update(1)
            message = log_template.format(ep=epoch_idx + 1, t_loss=t_loss,
                                          v_loss=v_loss, t_acc=t_acc, v_acc=v_acc)
            tqdm.write(message)

    return history

In [None]:
def predict(model, test_loader):
    """Compute class probabilities for every sample of `test_loader`.

    Args:
        model: classifier returning logits; it is switched to eval mode.
        test_loader: iterable of (inputs, labels) batches.

    Returns:
        (probs, y_true): `probs` is the softmax over the concatenated logits
        (on CPU); `y_true` is the list of per-batch label tensors.
    """
    # Setting eval mode once is enough; it does not change between batches.
    model.eval()
    # Follow the model's own device instead of relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    logits = []
    y_true = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.to(target_device)).cpu()
            logits.append(outputs)
            y_true.append(labels)

    probs = nn.functional.softmax(torch.cat(logits), dim=-1)
    return probs, y_true

In [None]:
# Load torchvision's GoogLeNet with pretrained weights as the starting point for fine-tuning.
myModel = models.googlenet(pretrained=True)

In [None]:
%%timeit
# Time a forward pass of the freshly loaded model on a single image of the sample batch.
_ = myModel(images[:1])

In [None]:
n_classes = 9
# Freeze the pretrained backbone; the replacement head created below stays trainable.
for param in myModel.parameters():
  param.requires_grad = False
# Fall back to the CPU instead of failing when the runtime has no GPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Swap the final fully connected layer for one with n_classes outputs.
numFeat = myModel.fc.in_features
myModel.fc = nn.Linear(numFeat,n_classes)
myModel = myModel.to(DEVICE)
criterizator = nn.CrossEntropyLoss()
optimizator = torch.optim.AdamW(myModel.parameters())
# Halve the learning rate every 3 scheduler steps (train() steps it once per epoch).
shedulator = torch.optim.lr_scheduler.StepLR(optimizator,3,0.5)

In [None]:
%%time

history = train(trainloader, testloader, model=myModel, criterion = criterizator, epochs=10, batch_size=40,optimizer = optimizator,scheduler = shedulator)

for param in myModel.parameters():
  param.requires_grad = True
history = train(trainloader, testloader, model=myModel, criterion = criterizator, epochs=24, batch_size=40,optimizer = optimizator,scheduler = shedulator)

In [None]:
# Class probabilities for the test loader plus the list of per-batch label tensors.
probs, y_true = predict(myModel, testloader)

In [None]:
# Flatten the per-batch labels and take the most probable class per sample.
y_test = torch.cat(y_true).numpy()
y_pred = torch.max(probs, dim=1)[1].numpy()

test_score = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
print(f'test score: {test_score}')

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Save the per-class precision/recall/F1 report as a CSV file.
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).to_csv('clf_report.csv')

In [None]:
# Save the confusion matrix as a CSV file.
pd.DataFrame(conf_mat).to_csv('conf_mat.csv')

# 2) Image to string

In [None]:
img_files = []
def append_files(img_files, data_dir, subfolder):
  """Append the path of every file found under data_dir/subfolder (recursively).

  The list passed as `img_files` is extended in place and also returned.
  """
  root = os.path.join(data_dir, subfolder)
  for dirpath, _dirnames, filenames in os.walk(root):
    img_files.extend(os.path.join(dirpath, filename) for filename in filenames)
  return img_files

# Collect every file of the train split first, then of the test split.
img_files = append_files(img_files, data_dir, 'train')
img_files = append_files(img_files, data_dir, 'test')

In [None]:
def image_to_string(img_filepath):
  """Run Tesseract OCR on an image file and return the recognized text.

  Args:
    img_filepath: path of the image to read with OpenCV.

  Returns:
    The text produced by pytesseract for the RGB-converted image.

  Raises:
    ValueError: if OpenCV cannot read the file.
  """
  img_cv = cv2.imread(img_filepath)
  # cv2.imread signals failure by returning None; report the offending path
  # instead of letting cvtColor fail with an opaque assertion error.
  if img_cv is None:
    raise ValueError(f"Could not read image file: {img_filepath}")
  # OpenCV loads images as BGR; convert to RGB before handing over to Tesseract.
  img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
  return pytesseract.image_to_string(img_rgb)

In [None]:
# Peek at the first collected file path.
img_files[0]

In [None]:
%%timeit
# Time OCR of a single image.
_ = image_to_string(img_files[1])

In [None]:
%%time

df_dict = {
    'filepath': [],
    'text': [],
}

for img in tqdm(img_files):
  text = image_to_string(img)
  df_dict['filepath'].append(img)
  df_dict['text'].append(text)

df_raw = pd.DataFrame(df_dict)

In [None]:
# Path components after data_dir: ['', <split>, <class folder>, <file name>].
relative_parts = df_raw['filepath'].str.split(data_dir).str[1].str.split('/')
df_raw['type'] = relative_parts.str[1]
# The class folder name carries a 1-based number after a space; store it 0-based.
class_number = relative_parts.str[2].str.split(' ').str[1].astype(int)
df_raw['label'] = class_number - 1

In [None]:
# Split the OCR results by dataset split and cache them next to the data.
mask = df_raw['type'] == 'train'
df_train = df_raw[mask]
df_test = df_raw[~mask]
# index=False keeps the row index out of the files; otherwise it comes back
# as a meaningless 'Unnamed: 0' column when the CSVs are read again.
df_train.to_csv(os.path.join(data_dir, 'df_train.csv'), index=False)
df_test.to_csv(os.path.join(data_dir, 'df_test.csv'), index=False)

In [None]:
# Reload the cached OCR results from data_dir; the frames get a fresh 0..n-1 index here.
df_train = pd.read_csv(os.path.join(data_dir, 'df_train.csv'))
df_test = pd.read_csv(os.path.join(data_dir, 'df_test.csv'))

In [None]:
# Keep the unprocessed OCR output, since 'text' is overwritten by the cleaning step.
df_train['raw_text'] = df_train['text']
df_test['raw_text'] = df_test['text']

# Text cleaning

In [None]:
# Download the NLTK stop word lists used by text_preprocessing().
nltk.download('stopwords')

In [None]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', ' ', text)
    text = re.sub('<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub('[^a-zA-z0-9.,!?/:;\"\'\s]', ' ', text)
    return text

def text_preprocessing(text):
    """
    Clean the text, tokenize it and drop uninformative tokens.

    Tokens are removed when they are English stop words, contain '#', or are
    shorter than three characters. The remaining tokens are joined with
    single spaces.

    Args:
        text: raw input string.

    Returns:
        The cleaned, space-joined token string.
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    # Build the stop word set once per call; previously the whole list was
    # re-read and scanned linearly for every single token.
    stop_words = set(stopwords.words('english'))
    tokenized_text = [w for w in tokenized_text if w not in stop_words and '#' not in w and len(w) > 2]
    combined_text = ' '.join(tokenized_text)
    return combined_text

In [None]:
%%time

# Applying the cleaning function to both test and training datasets
df_train['text'] = df_train['text'].apply(str).apply(lambda x: text_preprocessing(x))
df_test['text'] = df_test['text'].apply(str).apply(lambda x: text_preprocessing(x))

# 3) BERT model

In [None]:
# Name of the pretrained BERT checkpoint shared by the tokenizer and the classifier.
# NOTE(review): this is a cased checkpoint although clean_text() lowercases all text - confirm this is intended.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Number of BERT tokens per document, stored as a 'token_len' column.
token_lens = [len(tokenizer.encode(txt)) for txt in df_test.text]
df_test['token_len'] = token_lens

# token_lens ends up holding the training lengths, which the cells below plot.
token_lens = [len(tokenizer.encode(txt)) for txt in df_train.text]
df_train['token_len'] = token_lens

In [None]:
# Summary statistics of the token counts in the test split.
df_test.token_len.describe()

In [None]:
# Summary statistics of the token counts in the training split.
df_train.token_len.describe()

In [None]:
# Distribution of the training token counts. histplot replaces the deprecated
# distplot; kde + density scaling keeps the same kind of picture.
sns.histplot(token_lens, kde=True, stat='density')
plt.xlabel('Token count');

In [None]:
# Longest training document, measured in tokens.
max(token_lens)

In [None]:
# Sequence length every encoded text is padded or truncated to in TextDataset.
MAX_LEN = 512#max(token_lens)

In [None]:
class TextDataset(Dataset):
  """Dataset that serves tokenized texts and their labels from a DataFrame.

  Args:
    df: frame with a 'text' and a 'label' column.
    tokenizer: object providing `encode_plus` (e.g. a BertTokenizer).
    max_len: length every sequence is padded or truncated to.
  """

  def __init__(self, df, tokenizer, max_len):
    self.df = df
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.df)

  def __getitem__(self, item):
    """Return the raw text, token ids, attention mask and label of row `item`."""
    # DataLoader indices are positions 0..len-1, so look rows up by position.
    # Label-based .loc only worked while the frame had a default RangeIndex.
    text = str(self.df['text'].iloc[item])
    label = self.df['label'].iloc[item]
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
      truncation=True,
    )

    return {
      'text': text,
      # encode_plus returns (1, max_len) tensors; drop the leading batch axis.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'labels': torch.tensor(label, dtype=torch.long)
    }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
  """Wrap a DataFrame in a TextDataset and return a DataLoader over it.

  Args:
    df: frame with 'text' and 'label' columns.
    tokenizer: tokenizer forwarded to TextDataset.
    max_len: sequence length forwarded to TextDataset.
    batch_size: number of samples per batch.
    shuffle: reshuffle the samples every epoch; defaults to False, which keeps
      the row order of `df`.

  Returns:
    A DataLoader using two worker processes.
  """
  ds = TextDataset(
    df=df,
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=2
  )

In [None]:
BATCH_SIZE = 4
# df_train rows follow the os.walk order of the class folders, so consecutive
# rows share a class. Shuffle the training loader so each small batch mixes
# classes; it is built directly here to enable shuffle=True.
trainloader = DataLoader(
  TextDataset(df=df_train, tokenizer=tokenizer, max_len=MAX_LEN),
  batch_size=BATCH_SIZE,
  shuffle=True,
  num_workers=2,
)
# Evaluation order does not matter, so the test loader keeps the row order.
testloader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Fetch one batch to inspect the structure produced by TextDataset.
data = next(iter(trainloader))
data.keys()

In [None]:
# Shapes of the batched token ids, attention masks and labels.
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['labels'].shape)

In [None]:
class TextClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification layer."""

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the class logits computed from BERT's pooled output."""
    pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask)['pooler_output']
    regularized = self.drop(pooled)
    return self.out(regularized)

In [None]:
# Instantiate the BERT classifier for the 9 classes and move it to the selected device.
n_classes = 9
model = TextClassifier(n_classes)
model = model.to(device)

In [None]:
# Move the sample batch to the model's device for a smoke test.
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

In [None]:
%%timeit
# Time a forward pass of the BERT classifier on a single sample.
_ = model(input_ids=input_ids[:1], attention_mask=attention_mask[:1])

In [None]:
# Number of passes over the training data.
EPOCHS = 10
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# train_epoch() steps the scheduler once per batch, so the schedule spans batches * epochs.
total_steps = len(trainloader) * EPOCHS
# Linear learning-rate decay over total_steps with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Train the text classifier for one pass over `data_loader`.

  Gradients are clipped to a norm of 1.0, and both the optimizer and the
  scheduler are stepped after every batch.

  Args:
    model: module called as model(input_ids=..., attention_mask=...).
    data_loader: iterable of dicts with 'input_ids', 'attention_mask', 'labels'.
    loss_fn: loss called as loss_fn(outputs, labels).
    optimizer: optimizer to step.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler to step.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy as a double tensor, mean of the per-batch losses).
  """
  model = model.train()
  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    targets = batch["labels"].to(device)

    logits = model(input_ids=ids, attention_mask=mask)
    predicted = torch.max(logits, dim=1)[1]
    batch_loss = loss_fn(logits, targets)

    n_correct += torch.sum(predicted == targets)
    batch_losses.append(batch_loss.item())

    batch_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  mean_loss = np.mean(batch_losses)
  return accuracy, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate the text classifier on `data_loader` without gradient tracking.

  Args:
    model: module called as model(input_ids=..., attention_mask=...).
    data_loader: iterable of dicts with 'input_ids', 'attention_mask', 'labels'.
    loss_fn: loss called as loss_fn(outputs, labels).
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy as a double tensor, mean of the per-batch losses).
  """
  model = model.eval()
  batch_losses = []
  n_correct = 0

  with torch.no_grad():
    for batch in data_loader:
      ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      targets = batch["labels"].to(device)

      logits = model(input_ids=ids, attention_mask=mask)
      predicted = torch.max(logits, dim=1)[1]

      n_correct += torch.sum(predicted == targets)
      batch_losses.append(loss_fn(logits, targets).item())

  accuracy = n_correct.double() / n_examples
  mean_loss = np.mean(batch_losses)
  return accuracy, mean_loss

In [None]:
%%time
history = defaultdict(list)
best_accuracy = 0
for epoch in tqdm(list(range(EPOCHS))):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    trainloader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')
  test_acc, test_loss = eval_model(
    model,
    testloader,
    loss_fn,
    device,
    len(df_test)
  )
  print(f'Test   loss {test_loss} accuracy {test_acc}')
  print()
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['test_acc'].append(test_acc)
  history['test_loss'].append(test_loss)
  if test_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = test_acc

# Other models

In [None]:
# Class balance of the training split.
df_train['label'].value_counts().sort_index().plot.bar()

In [None]:
# Class balance of the test split.
df_test['label'].value_counts().sort_index().plot.bar()

In [None]:
# TF-IDF features over unigrams and bigrams of the training texts, as a dense array.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df_train.text).toarray()
labels = df_train.label
features.shape

In [None]:
# Distinct values of path component 7, which is the class folder for paths under the data_dir set above.
# NOTE(review): the index 7 is tied to the depth of data_dir - adjust it if the dataset is moved.
sorted(df_train['filepath'].str.split('/').str[7].unique())

In [None]:
# Number of top-scoring unigrams/bigrams to show per label.
N = 2
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); support both. The names do not depend on the label,
# so look them up once instead of on every loop iteration.
if hasattr(tfidf, 'get_feature_names_out'):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.array(tfidf.get_feature_names())

for label in range(n_classes):
  # One-vs-rest chi-squared score of every TF-IDF feature for this label.
  features_chi2 = chi2(features, labels == label)
  indices = np.argsort(features_chi2[0])
  feature_names = all_feature_names[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# label '{}':".format(label))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# Naive Bayes

In [None]:
%%time
X_train, X_test, y_train, y_test = df_train['text'], df_test['text'], df_train['label'], df_test['label']
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
y_pred = clf.predict(X_test_tfidf)
conf_mat = confusion_matrix(y_test, y_pred)
test_score = accuracy_score(y_test, y_pred)
print(f'test score: {test_score}')

fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
%%timeit
_ = clf.predict(count_vect.transform([df_test.loc[0, 'text']]))

In [None]:
# Predict the first test document; raw counts must be TF-IDF transformed first
# because that is the representation the classifier was trained on.
print(clf.predict(tfidf_transformer.transform(count_vect.transform([df_test.loc[0, 'text']]))))

In [None]:
# True label of the same test document, for comparison with the prediction.
df_test.loc[0, 'label']

# Model selection

In [None]:
# Combine both splits for cross-validated model comparison; row index labels repeat because the index is not reset.
df = pd.concat([df_train, df_test])
df_train.shape, df_test.shape, df.shape

In [None]:
%%time
# TF-IDF features over the combined train + test texts, used by the cross-validation below.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.text).toarray()
labels = df.label
features.shape

In [None]:
%%time
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
%%time
model = LinearSVC()
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
X_train = tfidf.fit_transform(df_train.text).toarray()
y_train = df_train.label
X_test = tfidf.transform(df_test.text).toarray()
y_test = df_test.label

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
test_score = accuracy_score(y_test, y_pred)
print(f'test score: {test_score}')
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=df.label.unique(), yticklabels=df.label.unique())
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
%%timeit
# Time a prediction of the linear SVM on a single test sample.
_ = model.predict(X_test[:1])

# Check data leak

In [None]:
# intersection between paths
train_files = df[df['type']=='train']
test_files = df[df['type']=='test']

# File paths that occur in both splits; an empty set means no path is shared.
set(train_files['filepath']).intersection(test_files['filepath'])

In [None]:
# intersection between images
trainloader, testloader, classes, dataset_sizes = get_dataset(data_dir, data_transforms,
                                                              folders=['train', 'test'], batch_size=1)

In [None]:
criterion = nn.MSELoss()
# One list per statistic; each train/test image pair contributes one entry to every list.
stat_names = ['rmse', 'max_diff', 'min_diff', 'min_tr', 'max_tr', 'min_ts', 'max_ts']
diff_dict = {stat_name: [] for stat_name in stat_names}

for tr_img, tr_label in tqdm(trainloader):
  # Statistics of the training image are the same for every test image.
  tr_min = torch.min(tr_img).item()
  tr_max = torch.max(tr_img).item()
  for ts_img, ts_label in testloader:
    pixel_delta = tr_img - ts_img
    diff_dict['rmse'].append(torch.sqrt(criterion(tr_img, ts_img)).item())
    diff_dict['max_diff'].append(torch.max(pixel_delta).item())
    diff_dict['min_diff'].append(torch.min(pixel_delta).item())
    diff_dict['min_tr'].append(tr_min)
    diff_dict['max_tr'].append(tr_max)
    diff_dict['min_ts'].append(torch.min(ts_img).item())
    diff_dict['max_ts'].append(torch.max(ts_img).item())

df_diff = pd.DataFrame(diff_dict)
df_diff.head()

In [None]:
# Summary of the pairwise image statistics (the 'count' row is dropped).
df_diff.describe().drop('count')

In [None]:
# Check tokens
# For every train/test document pair: token counts and sizes of the set differences.
column_names = ['len_tr', 'len_ts', 'len_sym_diff', 'len_left_diff', 'len_right_diff']
text_dict = {column_name: [] for column_name in column_names}

# Tokenize the test documents once instead of once per training document.
test_docs = [(len(tokens), set(tokens)) for tokens in df_test.text.str.split(' ')]

for tr_tokens in tqdm(df_train.text.str.split(' ')):
  tr_len = len(tr_tokens)
  tr_vocab = set(tr_tokens)
  for ts_len, ts_vocab in test_docs:
    text_dict['len_tr'].append(tr_len)
    text_dict['len_ts'].append(ts_len)
    text_dict['len_sym_diff'].append(len(ts_vocab ^ tr_vocab))
    text_dict['len_left_diff'].append(len(tr_vocab - ts_vocab))
    text_dict['len_right_diff'].append(len(ts_vocab - tr_vocab))

df_text = pd.DataFrame(text_dict)
df_text.head()

In [None]:
# Summary of the pairwise token statistics (the 'count' row is dropped).
df_text.describe().drop('count')

# Instance spec

In [None]:
# Disk usage of the instance in human-readable units.
!df -h

In [None]:
# Star import: the cells below call psutil's cpu_count() and cpu_stats() unqualified.
# NOTE(review): this places every public psutil name in the notebook namespace - consider importing only what is used.
from psutil import *

In [None]:
# Number of CPUs reported by psutil.
cpu_count()

In [None]:
# CPU statistics reported by psutil.
cpu_stats()

In [None]:
# Detailed CPU information from the kernel.
!cat /proc/cpuinfo

In [None]:
#GPU count and name
!nvidia-smi -L

In [None]:
# Show GPU activity while deep learning tasks run. For this command and 'nvidia-smi -L' above to work,
# enable a GPU via 'Runtime > change runtime type > Hardware Accelerator > GPU'.
!nvidia-smi