In [1]:
import pandas as pd
import json

# question labels

In [None]:
# Candidate categories offered to the zero-shot classifier.
# `question_label_annotation` reads this module-level list, so this cell has to
# run before that function is called.
labels = [
    "Document Request",
    "Material or Product Information",
    "Specifications and Standards",
    "Installation and Construction Details",
    "Ownership or Responsibility",
    "Quantities and Progress Tracking",
    "Maintenance and Turnover",
    "Translation Request",
    "Schedule and Timeline",
    "Miscellaneous Queries",
]

# load the zero-shot classifier

In [2]:
from transformers import pipeline

# Zero-shot classification pipeline backed by the facebook/bart-large-mnli
# checkpoint. No device is requested here; the recorded run reports CPU.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Device set to use cpu


In [3]:
import logging
from typing import Optional


def question_label_annotation(
    text: str
    ) -> Optional[dict]:
    """Return the best-scoring candidate label for one question.

    Calls the module-level ``classifier`` with ``text`` and the module-level
    ``labels`` list, then picks the label with the highest score.

    Args:
        text: The question to classify.

    Returns:
        A dict ``{'label': <str>, 'score': <float>}`` for the top-scoring
        label, or ``None`` if classification failed for this input.
    """
    try:
        classification_response = classifier(text, labels)
        # max() returns the first maximal pair, matching what a stable
        # descending sort followed by [0] would select.
        best_label, best_score = max(
            zip(classification_response['labels'], classification_response['scores']),
            key=lambda pair: pair[1],
        )
        return {'label': best_label, 'score': best_score}
    except Exception as exc:
        # Best-effort: one bad row must not abort a long batch run. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate so the run can still be stopped by the user.
        logging.getLogger(__name__).warning(
            "question_label_annotation failed for %r: %r", text, exc
        )
        return None

In [4]:
# Smoke test: classify a single hand-written question and display the result.
question_label_annotation(text="show me the RFI")

{'label': 'Document Request', 'score': 0.5063319802284241}

# load the question data

In [5]:
# Relative path to the exported questions file
json_file_path = '../data/questions.json'

# JSON text is UTF-8; pass the encoding explicitly so the read does not depend
# on the platform's default locale encoding.
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Build the working DataFrame from the parsed JSON
df = pd.DataFrame(data)

In [6]:
# Display the loaded questions (long frames are shown truncated).
df

Unnamed: 0,question_text,created_at,user_id
0,What is the thickness of the drywall in the re...,2024-01-19 15:56:34.383+00,64201228-e558-4722-962e-69c831e1ea8f
1,How many filters does JCI owe in their contract?,2023-11-08 16:06:17.035+00,64201228-e558-4722-962e-69c831e1ea8f
2,what CB will the changes resulting from RFI 18...,2024-01-10 14:22:33.006+00,4f697825-9bb6-4fa6-b3ab-a46dc6d42919
3,What's the flooring type in the penthouse?,2024-01-10 20:55:19.668+00,fcad35ff-bb4b-435d-84f9-2f3b00c24f57
4,what CB will the changes resulting from RFI 18...,2024-01-16 21:36:23.569+00,4f697825-9bb6-4fa6-b3ab-a46dc6d42919
...,...,...,...
550,What are the finishes at the balconies?,2024-03-22 18:05:50.325+00,2fa78a12-f3c1-44c7-ad74-dacee0caab46
551,How many building 2 observations for framing a...,2024-03-26 17:07:22.58+00,e7cd71a5-a10c-4b25-b794-5c88f8560dc5
552,Who is responsible for caulking between counte...,2024-03-22 12:53:03.535+00,50ad58a0-da8d-49d1-9d07-8a62eac815c4
553,"For stucco, What is senerflex?",2024-03-22 18:09:16.862+00,2fa78a12-f3c1-44c7-ad74-dacee0caab46


# inference of the category

In [7]:
%%time

# Classify every question. Each entry of `category` holds the returned
# {'label', 'score'} dict, or None when classification failed for that row.
df['category'] = df['question_text'].map(question_label_annotation)

CPU times: user 2h 13min 34s, sys: 6.94 s, total: 2h 13min 41s
Wall time: 16min 43s


In [14]:
# This preview sits above the cell that originally created `df_expanded`, so a
# fresh top-to-bottom run would raise NameError here. Build the flattened frame
# in place: each `category` dict is expanded into its own columns.
df_expanded = pd.concat(
    [df.drop(columns='category'), df['category'].apply(pd.Series)],
    axis=1,
)
df_expanded

Unnamed: 0,question_text,created_at,user_id,label,score
0,What is the thickness of the drywall in the re...,2024-01-19 15:56:34.383+00,64201228-e558-4722-962e-69c831e1ea8f,Material or Product Information,0.354098
1,How many filters does JCI owe in their contract?,2023-11-08 16:06:17.035+00,64201228-e558-4722-962e-69c831e1ea8f,Ownership or Responsibility,0.416806
2,what CB will the changes resulting from RFI 18...,2024-01-10 14:22:33.006+00,4f697825-9bb6-4fa6-b3ab-a46dc6d42919,Document Request,0.360661
3,What's the flooring type in the penthouse?,2024-01-10 20:55:19.668+00,fcad35ff-bb4b-435d-84f9-2f3b00c24f57,Material or Product Information,0.535430
4,what CB will the changes resulting from RFI 18...,2024-01-16 21:36:23.569+00,4f697825-9bb6-4fa6-b3ab-a46dc6d42919,Document Request,0.371686
...,...,...,...,...,...
550,What are the finishes at the balconies?,2024-03-22 18:05:50.325+00,2fa78a12-f3c1-44c7-ad74-dacee0caab46,Material or Product Information,0.369501
551,How many building 2 observations for framing a...,2024-03-26 17:07:22.58+00,e7cd71a5-a10c-4b25-b794-5c88f8560dc5,Installation and Construction Details,0.189652
552,Who is responsible for caulking between counte...,2024-03-22 12:53:03.535+00,50ad58a0-da8d-49d1-9d07-8a62eac815c4,Ownership or Responsibility,0.649132
553,"For stucco, What is senerflex?",2024-03-22 18:09:16.862+00,2fa78a12-f3c1-44c7-ad74-dacee0caab46,Material or Product Information,0.705017


# save the results

In [8]:
# Expand each `category` dict into its own columns and place them next to the
# original columns (with the raw `category` column removed).
df_expanded = pd.concat(
    [
        df.drop(columns='category'),
        df['category'].apply(pd.Series),
    ],
    axis=1,
)

In [9]:
# Write the labelled questions as one pretty-printed JSON array of records.
df_expanded.to_json(
    '../data/question_labels.json',
    orient="records",
    lines=False,
    indent=4,
)

# questions per label

In [13]:
# Count the non-null questions per predicted label, most frequent first.
# Named aggregation produces the `question_count` column directly, so no
# separate rename step is needed.
question_counts = (
    df_expanded.groupby('label')
    .agg(question_count=('question_text', 'count'))
    .reset_index()
    .sort_values(by='question_count', ascending=False)
)

question_counts

Unnamed: 0,label,question_count
2,Material or Product Information,365
4,Ownership or Responsibility,99
0,Document Request,70
8,Translation Request,9
5,Quantities and Progress Tracking,5
7,Specifications and Standards,3
1,Installation and Construction Details,2
3,Miscellaneous Queries,1
6,Schedule and Timeline,1


# questions per user_id

In [15]:
# Count the non-null questions per user_id, most active users first.
# Named aggregation produces the `question_count` column directly, so no
# separate rename step is needed.
question_counts = (
    df_expanded.groupby('user_id')
    .agg(question_count=('question_text', 'count'))
    .reset_index()
    .sort_values(by='question_count', ascending=False)
)

question_counts

Unnamed: 0,user_id,question_count
10,6722222d-bb12-4647-83dc-83cc72c78a9f,87
11,74186580-00eb-4790-9be8-7b1cfede6c4e,74
7,4f697825-9bb6-4fa6-b3ab-a46dc6d42919,66
9,64201228-e558-4722-962e-69c831e1ea8f,60
18,b7a309f9-8e5f-45db-a9cc-e6b319f01950,58
24,eebe0f7f-456e-4775-8586-d0c7c94940b5,29
8,50ad58a0-da8d-49d1-9d07-8a62eac815c4,23
4,42703ba8-b086-46f7-81f8-431a9e3c41b1,22
26,fcad35ff-bb4b-435d-84f9-2f3b00c24f57,18
23,e7cd71a5-a10c-4b25-b794-5c88f8560dc5,14


# end