In [10]:
# Import the library required
!pip -q install accelerate -U
!pip -q install transformers[torch]
!pip -q install datasets


In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from transformers import pipeline

# Hugging Face models - Pipeline()

#### Using pipeline model as base

In [12]:
from transformers import pipeline

In [13]:
sentiment_model = pipeline(task="sentiment-analysis",
                         model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                           device="cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda


In [14]:
sentiment_model("Over heating issue don't by this product camera was good")

[{'label': 'neutral', 'score': 0.42103174328804016}]

In [15]:
sentiment_model("Waste of money")

[{'label': 'negative', 'score': 0.7434302568435669}]

In [16]:
sentiment_model("Nice product under 24k .... overall good")

[{'label': 'positive', 'score': 0.965560793876648}]

In [17]:
# Import review dataset to review each row
import pandas as pd
user_review_data=pd.read_csv("https://raw.githubusercontent.com/harshitgupta1998/TransformerModel-Sentiment/refs/heads/main/data/Review_Data.csv")
user_review_data=user_review_data.sample(50)
user_review_data["Review"]

Unnamed: 0,Review
1493,CONCLUSION: Very filling meals.
1838,Food is way overpriced and portions are fuckin...
383,After trying many many handsfree gadgets this ...
1315,The bus boy on the other hand was so rude.
402,Excellent product for the price.
485,A Disappointment.
1447,Sauce was tasteless.
897,Don't buy this product - It fails!.
1533,If you love authentic Mexican food and want a ...
476,"Uncomfortable In the Ear, Don't use with LG VX..."


In [18]:
# Apply the model to predict each row
user_review_data["Predicted_Sentiment"] = user_review_data["Review"].apply(lambda x: sentiment_model(x)[0]["label"])
user_review_data

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,Id,Review,Sentiment,Predicted_Sentiment
1493,1494,CONCLUSION: Very filling meals.,1,positive
1838,1839,Food is way overpriced and portions are fuckin...,0,negative
383,384,After trying many many handsfree gadgets this ...,1,positive
1315,1316,The bus boy on the other hand was so rude.,0,negative
402,403,Excellent product for the price.,1,positive
485,486,A Disappointment.,0,negative
1447,1448,Sauce was tasteless.,0,negative
897,898,Don't buy this product - It fails!.,0,negative
1533,1534,If you love authentic Mexican food and want a ...,1,positive
476,477,"Uncomfortable In the Ear, Don't use with LG VX...",0,negative


In [19]:
#Clear the cache in GPU
import torch
torch.cuda.empty_cache()

In [20]:
# Sentiment Analysis without pipeline to provide more functionality
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [21]:
import numpy as np
raw_text = "This is a great book"
encoded_input = tokenizer(raw_text, return_tensors='pt')
output = model(**encoded_input)
logits = output.logits.detach().numpy()
y_pred = np.argmax(logits)
y_pred

2

In [22]:
#Code for passing multiple examples to the above model

import numpy as np
# Prepare the input texts
texts = [
    "This is a great book",
    "The food was not tasty and it was very cold",
    "The weather is very good today",
]

# Tokenize and encode the input texts
encoded_inputs = tokenizer(texts, padding=True, return_tensors="pt")

# Pass the encoded inputs to the model
outputs = model(**encoded_inputs)

# Get the model's predictions
logits = outputs.logits.detach().cpu().numpy()
print(logits)
# Find the predicted class for each input
predictions = np.argmax(logits, axis=1)
predictions_text={0:'Neutral',1:'Negative',2:'Positive'}
# Print the predictions
for i in predictions:
  print(predictions_text[i])


[[-2.459717   -0.62854964  3.8105457 ]
 [ 3.2826881  -0.5237998  -2.8078046 ]
 [-2.7630343  -0.48863792  4.031289  ]]
Positive
Neutral
Positive


#### Finetuning HuggingFace model to apply on customer complaints


##### Bank Complaints Data

In [31]:
!wget https://github.com/venkatareddykonasani/Datasets/raw/master/Bank_Customer_Complaints/complaints_v2.zip
!unzip -o complaints_v2.zip
complaints_data = pd.read_csv("/content/complaints_v2.csv")
complaints_data.head()

--2025-01-10 12:15:52--  https://github.com/venkatareddykonasani/Datasets/raw/master/Bank_Customer_Complaints/complaints_v2.zip
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Bank_Customer_Complaints/complaints_v2.zip [following]
--2025-01-10 12:15:52--  https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Bank_Customer_Complaints/complaints_v2.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20228857 (19M) [application/zip]
Saving to: ‘complaints_v2.zip.3’


2025-01-10 12:15:53 (147 MB/s) - ‘complaints_v2.zip.3’ saved [20228857/20228857

Unnamed: 0,ID,product,text,label
0,0,credit_card,purchase order day shipping amount receive pro...,1
1,1,credit_card,forwarded message date tue subject please inve...,1
2,2,retail_banking,forwarded message cc sent friday pdt subject f...,1
3,3,credit_reporting,payment history missing credit report speciali...,0
4,4,credit_reporting,payment history missing credit report made mis...,0


In [32]:
#Use distilbert model for sentimental analysis without fine tuning
from transformers import pipeline
distilbert_model = pipeline(task="text-classification",
                            model="distilbert-base-uncased",
                            )

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cuda:0


In [33]:
sample_data=complaints_data.sample(100, random_state=42)
sample_data["text"]=sample_data["text"].apply(lambda x: " ".join(x.split()[:350]))
sample_data["bert_predicted"] = sample_data["text"].apply(lambda x: distilbert_model(x)[0]["label"])
#Default prediction is not a number LABEL_1, LABEL_0
sample_data["bert_predicted_num"]=sample_data["bert_predicted"].apply(lambda x: x[-1])
sample_data["bert_predicted_num"] = sample_data["bert_predicted_num"].astype(int)
sample_data.head()

Unnamed: 0,ID,product,text,label,bert_predicted,bert_predicted_num
156566,156566,mortgages_and_loans,penfed asking copy driver license finalizing l...,1,LABEL_1,1
1498,1498,credit_reporting,collection account removed credit report frank...,0,LABEL_1,1
134991,134991,credit_reporting,bureau falsely reporting alleged debt fdcpa se...,0,LABEL_1,1
56391,56391,mortgages_and_loans,va mortgage well fargo bank since meet conditi...,1,LABEL_1,1
9067,9067,credit_reporting,bank xxxxi credit card mine,0,LABEL_1,1


In [34]:
# Accuracy of the model without fine-tuning
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(sample_data["label"], sample_data["bert_predicted_num"])
print(cm)
accuracy=cm.diagonal().sum()/cm.sum()
print(accuracy)

[[ 0 47]
 [ 1 52]]
0.52


In [35]:
!pip -q install accelerate -U
!pip -q install transformers[torch]
!pip -q install datasets

In [36]:
##Finetuning the model with our data
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, ClassLabel, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

In [37]:
#The target variable must be named as "label" - Verify it, before proceeding
print(sample_data.columns)

Index(['ID', 'product', 'text', 'label', 'bert_predicted',
       'bert_predicted_num'],
      dtype='object')


In [38]:
Sample_data = Dataset.from_pandas(sample_data)
# Split the dataset into training and testing sets
train_test_split = Sample_data.train_test_split(test_size=0.2)  # 80% training, 20% testing
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'product', 'text', 'label', 'bert_predicted', 'bert_predicted_num', '__index_level_0__'],
        num_rows: 80
    })
    test: Dataset({
        features: ['ID', 'product', 'text', 'label', 'bert_predicted', 'bert_predicted_num', '__index_level_0__'],
        num_rows: 20
    })
})

### Load the tokenizer

In [39]:
# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.add_special_tokens({'pad_token': '[PAD]'} )

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [40]:
#Load and Train the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                            num_labels=2,
                                                            pad_token_id=tokenizer.eos_token_id) # Adjust num_labels as needed
model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (li

In [41]:
training_args = TrainingArguments(
    output_dir="./results_bert_custom",
    num_train_epochs=1,
    logging_dir="./logs_bert_custom",
    evaluation_strategy="epoch",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Start training
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 37


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.652106


TrainOutput(global_step=10, training_loss=0.7122066020965576, metrics={'train_runtime': 958.5098, 'train_samples_per_second': 0.083, 'train_steps_per_second': 0.01, 'total_flos': 10597391892480.0, 'train_loss': 0.7122066020965576, 'epoch': 1.0})

In [42]:
# Define the directory where you want to save your model and tokenizer
model_dir = "./distilbert_finetuned"

# Save the model
model.save_pretrained(model_dir)

# Save the tokenizer
tokenizer.save_pretrained(model_dir)

#Save the model with
trainer.save_model('Distilbert_CusModel_10K')

In [43]:
def make_prediction(text):
  new_complaint=text
  inputs=tokenizer(new_complaint, return_tensors="pt")
  inputs = inputs.to(torch.device("cuda:0"))
  outputs=model(**inputs)
  predictions=outputs.logits.argmax(-1)
  predictions=predictions.detach().cpu().numpy()
  return(predictions)

sample_data["finetuned_predicted"]=sample_data["text"].apply(lambda x: make_prediction(str(x))[0])
sample_data.sample(10)

Unnamed: 0,ID,product,text,label,bert_predicted,bert_predicted_num,finetuned_predicted
145701,145701,credit_reporting,transunion taken point account closed good sta...,0,LABEL_1,1,1
152012,152012,retail_banking,complaint paypal credit card deliberately igno...,1,LABEL_1,1,1
44343,44343,debt_collection,got loan totally paid nca put credit report op...,1,LABEL_1,1,1
10767,10767,credit_card,cash app since contacted customer service time...,1,LABEL_1,1,1
29759,29759,debt_collection,year old college student live home parent two ...,1,LABEL_1,1,1
95861,95861,credit_card,main problem wayfair bank payment purposefully...,1,LABEL_1,1,1
6872,6872,mortgages_and_loans,applied refinance conventional home mortgage r...,1,LABEL_1,1,1
65825,65825,credit_reporting,noticed victim identity theft year collection ...,0,LABEL_1,1,1
23286,23286,debt_collection,received letter company trying collect debt wr...,1,LABEL_1,1,1
156566,156566,mortgages_and_loans,penfed asking copy driver license finalizing l...,1,LABEL_1,1,0


In [44]:
from sklearn.metrics import confusion_matrix
# Create the confusion matrix
cm1 = confusion_matrix(sample_data["label"], sample_data["finetuned_predicted"])
print(cm1)
accuracy1=cm1.diagonal().sum()/cm1.sum()
print(accuracy1)

[[11 36]
 [ 6 47]]
0.58


### Loading a pre-built model and making prediction

In [45]:
#Code to donwloading the distilbert model
!gdown --id 1785J3ir19RaZP3ebbFvWUX88PMaBouro -O distilbert_finetuned_V1.zip
!unzip -o -j distilbert_finetuned_V1.zip -d distilbert_finetuned_V1

model_v1 = DistilBertForSequenceClassification.from_pretrained('/content/distilbert_finetuned_V1')
model_v1.to("cuda:0")

Downloading...
From (original): https://drive.google.com/uc?id=1785J3ir19RaZP3ebbFvWUX88PMaBouro
From (redirected): https://drive.google.com/uc?id=1785J3ir19RaZP3ebbFvWUX88PMaBouro&confirm=t&uuid=4361623f-f960-4a88-bf77-27aa30d6c93a
To: /content/distilbert_finetuned_V1.zip
100% 247M/247M [00:08<00:00, 27.9MB/s]
Archive:  distilbert_finetuned_V1.zip
  inflating: distilbert_finetuned_V1/config.json  
  inflating: distilbert_finetuned_V1/model.safetensors  
  inflating: distilbert_finetuned_V1/special_tokens_map.json  
  inflating: distilbert_finetuned_V1/tokenizer_config.json  
  inflating: distilbert_finetuned_V1/vocab.txt  


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (li

In [46]:
def make_prediction(text):
  new_complaint=text
  inputs=tokenizer(new_complaint, return_tensors="pt")
  inputs = inputs.to(torch.device("cuda:0"))
  outputs=model_v1(**inputs)
  predictions=outputs.logits.argmax(-1)
  predictions=predictions.detach().cpu().numpy()
  return(predictions)


In [47]:
sample_data_large=complaints_data.sample(n=1000, random_state=55)
sample_data_large["finetuned_predicted"]=sample_data_large["text"].apply(lambda x: make_prediction(str(x)[:350])[0])

In [48]:
sample_data_large["finetuned_predicted"]

Unnamed: 0,finetuned_predicted
36949,1
27628,0
138979,0
60466,0
98334,1
...,...
66079,0
122182,0
37186,1
121726,0


In [49]:
from sklearn.metrics import confusion_matrix
# Create the confusion matrix
cm1 = confusion_matrix(sample_data_large["label"], sample_data_large["finetuned_predicted"])
print(cm1)
accuracy1=cm1.diagonal().sum()/cm1.sum()
print(accuracy1)

[[478  62]
 [ 61 399]]
0.877


In [50]:
# Saving the Model on HuggingFace hub
!pip install transformers
!pip install huggingface_hub
!pip install -U ipykernel #for executing the commands


Collecting ipykernel
  Downloading ipykernel-6.29.5-py3-none-any.whl.metadata (6.3 kB)
Collecting comm>=0.1.1 (from ipykernel)
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading ipykernel-6.29.5-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading comm-0.2.2-py3-none-any.whl (7.2 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, comm, ipykernel
  Attempting uninstall: ipykernel
    Found existing installation: ipykernel 5.5.6
    Uninstalling ipykernel-5.5.6:
      Successfully uninstalled ipykernel-5.5.6
[31mERROR: pip's dependency resolver does not currently take into ac

In [51]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments


In [52]:
!gdown --id 1785J3ir19RaZP3ebbFvWUX88PMaBouro -O distilbert_finetuned_V1.zip
!unzip -o -j distilbert_finetuned_V1.zip -d distilbert_finetuned_V1

model = DistilBertForSequenceClassification.from_pretrained('/content/distilbert_finetuned_V1')


Downloading...
From (original): https://drive.google.com/uc?id=1785J3ir19RaZP3ebbFvWUX88PMaBouro
From (redirected): https://drive.google.com/uc?id=1785J3ir19RaZP3ebbFvWUX88PMaBouro&confirm=t&uuid=2865bde5-7761-4644-a68f-e4611923747d
To: /content/distilbert_finetuned_V1.zip
100% 247M/247M [00:06<00:00, 39.2MB/s]
Archive:  distilbert_finetuned_V1.zip
  inflating: distilbert_finetuned_V1/config.json  
  inflating: distilbert_finetuned_V1/model.safetensors  
  inflating: distilbert_finetuned_V1/special_tokens_map.json  
  inflating: distilbert_finetuned_V1/tokenizer_config.json  
  inflating: distilbert_finetuned_V1/vocab.txt  


In [53]:
from google.colab import userdata
import os
os.environ['HUGGINGFACEHUB_API_TOKEN']=userdata.get('HUGGINGFACEHUB_API_TOKEN')

In [54]:
from huggingface_hub import notebook_login
notebook_login()
#To get Auth token: Profile >> Settings >>Access Token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [56]:
model.push_to_hub("Bank_Complaints_distil_bert_10K")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/harshitg1003/Bank_Complaints_distil_bert_10K/commit/4bbbd18245f3bbb4ed84467904a237e2b5aa2a1f', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='4bbbd18245f3bbb4ed84467904a237e2b5aa2a1f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/harshitg1003/Bank_Complaints_distil_bert_10K', endpoint='https://huggingface.co', repo_type='model', repo_id='harshitg1003/Bank_Complaints_distil_bert_10K'), pr_revision=None, pr_num=None)

# Loading the model from HuggingFace hub

In [57]:
model=DistilBertForSequenceClassification.from_pretrained("harshitg1003/Bank_distil_bert_10K")

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [58]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [59]:
import pandas as pd
!wget https://github.com/venkatareddykonasani/Datasets/raw/master/Bank_Customer_Complaints/complaints_v2.zip
!unzip -o complaints_v2.zip
complaints_data = pd.read_csv("/content/complaints_v2.csv")
list(complaints_data["text"].head())

--2025-01-10 12:37:49--  https://github.com/venkatareddykonasani/Datasets/raw/master/Bank_Customer_Complaints/complaints_v2.zip
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Bank_Customer_Complaints/complaints_v2.zip [following]
--2025-01-10 12:37:50--  https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Bank_Customer_Complaints/complaints_v2.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20228857 (19M) [application/zip]
Saving to: ‘complaints_v2.zip.4’


2025-01-10 12:37:50 (72.7 MB/s) - ‘complaints_v2.zip.4’ saved [20228857/2022885

['purchase order day shipping amount receive product week sent followup email exact verbiage paid two day shipping received order company responded im sorry inform due unusually high order volume order shipped several week stock since early due high demand although continuing take order guaranteeing receive order place due time mask order exact shipping date right however guarantee ship soon soon delivers product u getting small shipment shipping first come first served basis appreciate patience fulfill order quickly recommend keeping order lose place line cancel distributor stock moment prefer cancel please note ask via email cancel accordance cancellation policy agreed checkout electronic inventory online requested order canceled refund issued canceled order sent verification order canceled refunded item particulate respirator refunded subtotal shipping tax total usd visa ending refund called disputed amount stated nothing needed submitted address issue recharged item removing called

In [60]:
import torch

In [61]:
complaint="""
payment history missing credit report made mistake put account forbearance without authorization knowledge matter fact automatic payment setup month monthly mortgage paid full noticed issue account marked forbearance credit report tried get new home loan another new bank contacted immediately asked fix error provide letter detail please see asks forbearance issue seemed fixed however credit report payment history missing new bank able approve new loan issue missing payment history contacted time since phone ask thing report payment history transunion fix missing data issue provide letter show account never forbearance payment history past month however waiting week countless email phone call talk multiple supervisor able get either one thing without issue fixed new bank process new loan application therefore need help immediately get fixed
"""

inputs=tokenizer(complaint, return_tensors="pt")
outputs=model(**inputs)
predictions=outputs.logits.argmax(-1)
predictions=predictions.detach().cpu().numpy()
print(predictions)

[0]


# Web App Creation

In [62]:
%%writefile requirements.txt
streamlit
numpy
pandas
torch
transformers
huggingface_hub

Writing requirements.txt


In [63]:
!pip install -r requirements.txt

Collecting streamlit (from -r requirements.txt (line 1))
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit->-r requirements.txt (line 1))
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit->-r requirements.txt (line 1))
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━

In [64]:
%%writefile app.py
import streamlit as st
import numpy as np
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('harshitg1003/Bank_Complaints_distil_bert_10K')

st.title("Bank Complaints Categorization")
st.write("Sample Complaints are given below")
Sample_Complaints = [
    {"Sentence": "Credit Report - payment history missing credit report made mistake put account forbearance without authorization "},
    {"Sentence": "Retail Related - forwarded message cc sent friday pdt subject final legal payment well fargo well fargo clearly wrong need look actually opened account see court hearing several different government agency "}
]
st.table(Sample_Complaints)
user_input = st.text_input("Enter a complaint:")
button=st.button("Classify")

d={
    0: "Credit reporting",
    1: "Mortgage and Others"
}

if user_input and button:
  inputs=tokenizer(user_input, return_tensors="pt")
  outputs=model(**inputs)
  predictions=outputs.logits.argmax(-1)
  predictions=predictions.detach().cpu().numpy()
  print(predictions)
  st.write("Prediction :" , d[predictions[0]])


Writing app.py


In [66]:
!streamlit run app.py & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

#This sometimes doesn't work on Chrome

y
y
y
y
y
yy
yy
35.204.112.108

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.204.112.108:8501[0m
[0m
[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K[1G[0JNeed to install the following packages:
localtunnel@2.0.2
Ok to proceed? (y) [20Gy

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0Kyour url is: https://gold-onions-shave.loca.lt
[34m  Stopping...[0m
