In [None]:
!pip -q install langchain
!pip -q install huggingface_hub
!pip -q install  git+https://github.com/huggingface/transformers # need to install from github
!pip -q install kor
!pip install pandas --upgrade
!pip install Pathlib

In [2]:
# Record the installed langchain version next to the results for reproducibility.
!pip show langchain

Name: langchain
Version: 0.0.324
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /opt/conda/lib/python3.10/site-packages
Requires: aiohttp, anyio, async-timeout, dataclasses-json, jsonpatch, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: kor


In [3]:
# Record the installed kor version next to the results for reproducibility.
!pip show kor

Name: kor
Version: 1.0.0
Summary: Extract information with LLMs from text
Home-page: https://www.github.com/eyurtsev/kor
Author: Eugene Yurtsev
Author-email: eyurtsev@gmail.com
License: MIT
Location: /opt/conda/lib/python3.10/site-packages
Requires: langchain, pandas, pydantic
Required-by: 


In [4]:
import warnings
warnings.filterwarnings("ignore")

import torch

import os
import textwrap
import enum
import re
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

from operator import itemgetter
from typing import List, Union, Optional

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import GenerationConfig

import langchain
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts.chat import (
    PromptTemplate,
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.prompts import StringPromptTemplate

from langchain import LLMChain
from langchain.llms import HuggingFaceHub, HuggingFacePipeline
from langchain.schema.runnable import ConfigurableField

from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number
from pydantic import BaseModel, Field, validator
from kor import extract_from_documents, from_pydantic

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Show up to 500 characters per cell so long review texts are readable in displayed frames.
pd.set_option('display.max_colwidth', 500)
# Do not overwrite a token that is already exported in the environment; the
# placeholder is only a fallback so a real credential never has to be typed here.
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', "your_Api_token")

In [5]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


## Load Dataset
**Restaurant reviews annotated with the following aspects:**

- Ambience
- Anecdotes
- Food
- Price
- Staff
- Miscellaneous

In [6]:
# Review domain under evaluation: it selects the dataset sub-folder and is also
# interpolated into the schema description given to the LLM.
review_domain = 'restaurant'

# Folder holding the dataset files for the chosen domain.
input_dir = Path(f'/kaggle/input/aspect-extraction-dataset/datasets/{review_domain}')
# Kaggle working directory (not referenced by the cells shown in this notebook).
data_dir = Path('/kaggle/working')

In [47]:
def load_dataset(filename):
    """Read a one-record-per-line text file into a single-column DataFrame.

    Parameters
    ----------
    filename : str or pathlib.Path
        Path of the UTF-8 text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with newline characters stripped. The
        column is named ``'labels'`` when the path contains ``'test_label'``
        and ``'text_org'`` otherwise.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'r', encoding='utf-8') as f:
        all_reviews = f.readlines()
    print('Total Reviews: ', len(all_reviews))

    sentences = [review.strip('\n') for review in all_reviews]

    # Label files and review files share this loader; the file name decides
    # which column name the caller gets back.
    col = 'labels' if 'test_label' in str(filename) else 'text_org'

    return pd.DataFrame({col: sentences})

In [75]:
df_label = load_dataset(input_dir/'test_label.txt')
print(df_label.shape)

# Distinct label tokens across all rows. explode()/unique() flattens the
# per-row token lists in one pass (summing the lists re-copies the accumulated
# list for every row), and sorting makes the printed order deterministic.
all_labels = sorted(df_label['labels'].str.split(' ').explode().unique())
print(all_labels)
drop_labels = ['Positive', 'Neutral']
all_labels = [label for label in all_labels if label not in drop_labels]

# Flag every row whose label string mentions one of the labels to drop.
# regex=False matches the label text literally instead of as a pattern.
drop_index = pd.Series(False, index=df_label.index)
for label in drop_labels:
    drop_index = drop_index | df_label['labels'].str.contains(label, regex=False)

print(all_labels)

# drop_index is kept (unfiltered) so the review file can be filtered the same way.
df_label = df_label[~drop_index].copy()
print(df_label.shape)

# One 0/1 indicator column per remaining aspect label.
for label in all_labels:
    df_label[label] = np.where(df_label['labels'].str.contains(label, regex=False), 1, 0)
print(df_label.shape)

df_label.head()

Total Reviews:  3328
(3328, 1)
['Positive', 'Anecdotes', 'Neutral', 'Miscellaneous', 'Price', 'Food', 'Staff', 'Ambience']
['Ambience', 'Anecdotes', 'Food', 'Miscellaneous', 'Price', 'Staff']
(3315, 1)
(3315, 7)


Unnamed: 0,labels,Ambience,Anecdotes,Food,Miscellaneous,Price,Staff
0,Food Ambience,1,0,1,0,0,0
1,Staff,0,0,0,0,0,1
2,Ambience,1,0,0,0,0,0
3,Miscellaneous,0,0,0,1,0,0
4,Miscellaneous,0,0,0,1,0,0


In [77]:
df = load_dataset(input_dir/'test.txt')
print(df.shape)

# drop_index was computed from the label file, so the filter below is only
# meaningful when reviews and labels line up one-to-one.
assert len(df) == len(drop_index), 'test.txt and test_label.txt differ in number of lines'

df = df[~drop_index].copy()
print(df.shape)

# concat(axis=1) aligns on the index; a mismatch would silently create NaN rows.
assert df.index.equals(df_label.index), 'review rows and label rows are not aligned'
df = pd.concat([df, df_label], axis=1)

df.head()

Total Reviews:  3328
(3328, 1)
(3315, 1)


Unnamed: 0,text_org,labels,Ambience,Anecdotes,Food,Miscellaneous,Price,Staff
0,Always a fun place ... the food is deeelish !,Food Ambience,1,0,1,0,0,0
1,"The staff is n't the friendliest or most competent , and I am stickler for service , but everything else about this place makes up for it .",Staff,0,0,0,0,0,1
2,"Great for groups , great for a date , great for early brunch or a nightcap .",Ambience,1,0,0,0,0,0
3,Another great place to take out-of-towners !,Miscellaneous,0,0,0,1,0,0
4,: ),Miscellaneous,0,0,0,1,0,0


In [184]:
# Share of reviews carrying each aspect (column mean of the 0/1 indicators).
df.loc[:, all_labels].mean()

Ambience         0.113424
Anecdotes        0.129412
Food             0.370136
Miscellaneous    0.276621
Price            0.089894
Staff            0.166817
dtype: float64

## Setup LLM: Mistral 7B

In [78]:
repo_id = "Open-Orca/Mistral-7B-OpenOrca"

# torch_dtype="auto" lets transformers pick the weight dtype for the checkpoint
# instead of forcing a fixed one.
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype="auto")
# A tokenizer holds no weight tensors, so torch_dtype is not passed to it.
tokenizer = AutoTokenizer.from_pretrained(repo_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/623 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [148]:
# Text-generation pipeline used as the LLM backend for the extraction chain.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # The expected answer is a short JSON object, so generation is capped at 50 new tokens.
    max_new_tokens=50,
    device=device,
    # Pad with the end-of-sequence token id.
    pad_token_id=tokenizer.eos_token_id,
    # Greedy decoding for repeatable extractions. temperature=0 is not a valid
    # sampling temperature and is ignored when sampling is off, so the
    # deterministic intent is stated explicitly instead.
    do_sample=False,
)

# Wrap the pipeline so langchain/kor can call it like any other LLM.
llm = HuggingFacePipeline(pipeline=pipe)

## Setup Aspect Extractor

In [149]:
# Closed set of aspect categories the extractor may assign to a review.
# Member order is left unchanged: it is the order in which the options are
# listed in the schema rendered into the prompt.
class Aspects(enum.Enum):
    staff = "Staff"
    ambience = "Ambience"
    anecdotes = "Anecdotes"
    food = "Food"
    price = "Price"
    miscellaneous = "Miscellaneous"
    
# Extraction schema for a single review: the list of aspect categories it mentions.
# NOTE(review): documented with comments rather than a docstring, because a
# docstring on a pydantic model may surface as the schema description — confirm
# it does not leak into the prompt before adding one.
class ReviewABSA(BaseModel):
    # The description is rendered as the field comment in the generated prompt
    # schema, and the examples give one (review text, aspect) pair per category.
    # NOTE(review): `many=True` is passed through as an extra Field argument;
    # presumably consumed by kor — confirm.
    aspect: List[Aspects] = Field(
        description=f"The key features that customers are talking about in their {review_domain} reviews.",
        examples=[
            ("The Singapore Mai Fun had NO curry flavor whatsoever.", "Food"),
            ("Go for it !", "Miscellaneous"),
            ("While the staff at this little bistro is very friendly , I have never experienced more incompetency.", "Staff"),
            ("Went on a double date with friend and his girlfriend for a few drinks and appetizers .", "Anecdotes"),
            ("When you enter , you feel like you have entered your mom 's kitchen .", "Ambience"),
            ("Pricey , but worth a try , at least once .", "Price")
        ],
        many=True,
    )

In [150]:
# Instruction scaffold for the extraction prompt, split into a system turn and a
# user turn with <|im_start|>/<|im_end|> markers (presumably the chat format
# this model expects — confirm against the model card).
# {type_description} receives the TypeScript-style schema and
# {format_instructions} the JSON output instructions, as the rendered prompt
# printed a few cells below shows. The user turn is left open here; the
# extraction loop appends '<|im_end|>' after each review text.
instruction_template = PromptTemplate(
    input_variables=["format_instructions", "type_description"],
    template=(
        "<|im_start|>system\nYour goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.\n\n"
        "{type_description}\n<|im_end|>\n<|im_start|>user\n{format_instructions}\n"
    ),
)

In [151]:
# Derive the kor schema and its validator from the pydantic model.
# Note: from here on `validator` refers to this kor validator and no longer to
# the decorator imported from pydantic.
schema, validator = from_pydantic(ReviewABSA)

# Extraction chain: local LLM + schema, JSON-encoded answers, custom prompt scaffold.
chain = create_extraction_chain(
    llm,
    schema,
    encoder_or_encoder_class="json",
    instruction_template=instruction_template,
    validator=validator,
)

In [152]:
# Render the full prompt with a placeholder review to inspect what the model will see.
print(chain.prompt.format_prompt(text="[user input]").to_string())

<|im_start|>system
Your goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.

```TypeScript

reviewabsa: { // 
 aspect: Array<"Staff" | "Ambience" | "Anecdotes" | "Food" | "Price" | "Miscellaneous"> // The key features that customers are talking about in their restaurant reviews.
}
```

<|im_end|>
<|im_start|>user
Please output the extracted information in JSON format. Do not output anything except for the extracted information. Do not add any clarifying information. Do not add any fields that are not in the schema. If the text contains attributes that do not appear in the schema, please ignore them. All output must be in JSON format and follow the schema specified above. Wrap the JSON in <json> tags.


Input: The Singapore Mai Fun had NO curry flavor whatsoever.
Output: <json>{"re

## Aspect Extraction

In [153]:
%%time

reviews = df['text_org'].values.tolist()

responses = []
aspects = []
    
for text in tqdm(reviews):
    response = chain.predict(text=text+'<|im_end|>\n')
    responses.append(response)
    aspects.append(response['data']['reviewabsa']['aspect'] if len(response['data'].keys())>0 
                   else ['Miscellaneous'])

len(reviews), len(responses), len(aspects)

100%|██████████| 3315/3315 [3:29:29<00:00,  3.79s/it]  

CPU times: user 3h 18min 51s, sys: 10min 44s, total: 3h 29min 35s
Wall time: 3h 29min 29s





(3315, 3315, 3315)

In [169]:
# One 0/1 indicator column per aspect, derived from the space-joined predictions.
indicator_cols = [f'predicted_{label}' for label in all_labels]
df_test_pred = pd.DataFrame(columns=indicator_cols)
df_test_pred['predicted_labels'] = list(map(' '.join, aspects))
for label, col in zip(all_labels, indicator_cols):
    has_label = df_test_pred['predicted_labels'].str.contains(label)
    df_test_pred[col] = np.where(has_label, 1, 0)
# Reuse the review frame's index so the two frames can be concatenated column-wise.
df_test_pred.index = df.index
print(df_test_pred.shape)

(3315, 7)


## Evaluate Results

In [170]:
# Column layout for the comparison table: text, gold/predicted label strings,
# then gold indicators followed by predicted indicators.
predicted_cols = [f'predicted_{label}' for label in all_labels]
ordered_cols = ['text_org', 'labels', 'predicted_labels', *all_labels, *predicted_cols]

df_final = pd.concat([df, df_test_pred], axis=1)
print(df_final.shape)
df_final = df_final[ordered_cols]
df_final.head()

(3315, 15)


Unnamed: 0,text_org,labels,predicted_labels,Ambience,Anecdotes,Food,Miscellaneous,Price,Staff,predicted_Ambience,predicted_Anecdotes,predicted_Food,predicted_Miscellaneous,predicted_Price,predicted_Staff
0,Always a fun place ... the food is deeelish !,Food Ambience,Food,1,0,1,0,0,0,0,0,1,0,0,0
1,"The staff is n't the friendliest or most competent , and I am stickler for service , but everything else about this place makes up for it .",Staff,Staff Miscellaneous,0,0,0,0,0,1,0,0,0,1,0,1
2,"Great for groups , great for a date , great for early brunch or a nightcap .",Ambience,Miscellaneous,1,0,0,0,0,0,0,0,0,1,0,0
3,Another great place to take out-of-towners !,Miscellaneous,Miscellaneous,0,0,0,1,0,0,0,0,0,1,0,0
4,: ),Miscellaneous,Miscellaneous,0,0,0,1,0,0,0,0,0,1,0,0


In [181]:
# Per-label binary metrics for the aspects of primary interest.
interested_labels = ['Ambience', 'Food', 'Price', 'Staff']
metric_table = [
    ('Accuracy: ', accuracy_score),
    ('F1 Score: ', f1_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
]
for label in interested_labels:
    print('Label: ', label)
    y_true = df_final[label]
    y_pred = df_final[f'predicted_{label}']
    for caption, metric in metric_table:
        print(caption, np.round(metric(y_true, y_pred),3))
    print('***************')

Label:  Ambience
Accuracy:  0.938
F1 Score:  0.676
Precision:  0.839
Recall:  0.566
***************
Label:  Food
Accuracy:  0.893
F1 Score:  0.866
Precision:  0.81
Recall:  0.93
***************
Label:  Price
Accuracy:  0.957
F1 Score:  0.765
Precision:  0.744
Recall:  0.789
***************
Label:  Staff
Accuracy:  0.942
F1 Score:  0.806
Precision:  0.907
Recall:  0.725
***************


In [182]:
# Same per-label binary metrics for the remaining, catch-all style aspects.
other_labels = ['Anecdotes', 'Miscellaneous']
scorers = {
    'Accuracy: ': accuracy_score,
    'F1 Score: ': f1_score,
    'Precision: ': precision_score,
    'Recall: ': recall_score,
}
for label in other_labels:
    print('Label: ', label)
    y_true = df_final[label]
    y_pred = df_final[f'predicted_{label}']
    for caption, scorer in scorers.items():
        print(caption, np.round(scorer(y_true, y_pred),3))
    print('***************')

Label:  Anecdotes
Accuracy:  0.854
F1 Score:  0.058
Precision:  0.176
Recall:  0.035
***************
Label:  Miscellaneous
Accuracy:  0.741
F1 Score:  0.636
Precision:  0.521
Recall:  0.816
***************
