# Read file

In [None]:
import json
import re
import pandas as pd

# Load the training QA annotations.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale default.
with open("dataset/Train_QA.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# The file maps each top-level key to a list of QA dicts; flatten it so that
# every QA pair becomes one record (the top-level key itself is not used).
qa_records = [qa for qas in data.values() for qa in qas]

# Pull out the question text, the question category and the answer.
questions = [qa["Question"] for qa in qa_records]
labels = [qa["Type"] for qa in qa_records]
answer = [qa["Answer"] for qa in qa_records]

# Assemble the DataFrame: one row per QA pair.
train_set = pd.DataFrame({"Question": questions, "Type": labels, "Answer": answer})

# Inspect the class distribution and preview the first rows.
print(train_set["Type"].value_counts())
train_set.head()

Type
Reasoning-based Judging      35414
Basic Judging                17654
Basic Counting               17654
Comprehensive Analysis       10088
Object Situation Analysis     4834
Reasoning-based Counting      2522
Name: count, dtype: int64


Unnamed: 0,Question,Type,Answer
0,Are there any buildings in this scene?,Basic Judging,Yes
1,What is the area of buildings?,Basic Counting,0%-10%
2,Are there any roads in this scene?,Basic Judging,Yes
3,What is the area of roads?,Basic Counting,0%-10%
4,Is there any water in this scene?,Basic Judging,Yes


In [9]:
import json
import re
import pandas as pd

# Load the validation QA annotations.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale default.
with open("dataset/Val_QA.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# The file maps each top-level key to a list of QA dicts; flatten it so that
# every QA pair becomes one record (the top-level key itself is not used).
qa_records = [qa for qas in data.values() for qa in qas]

# Pull out the question text, the question category and the answer.
questions = [qa["Question"] for qa in qa_records]
labels = [qa["Type"] for qa in qa_records]
answer = [qa["Answer"] for qa in qa_records]

# Assemble the DataFrame: one row per QA pair.
val_set = pd.DataFrame({"Question": questions, "Type": labels, "Answer": answer})

# Inspect the class distribution and preview the first rows.
print(val_set["Type"].value_counts())
val_set.head()

Type
Reasoning-based Judging      22468
Basic Judging                11683
Basic Counting               11683
Comprehensive Analysis        6676
Object Situation Analysis     3023
Reasoning-based Counting      1669
Name: count, dtype: int64


Unnamed: 0,Question,Type,Answer
0,Are there any buildings in this scene?,Basic Judging,Yes
1,What is the area of buildings?,Basic Counting,0%-10%
2,Are there any roads in this scene?,Basic Judging,Yes
3,What is the area of roads?,Basic Counting,0%-10%
4,Is there any water in this scene?,Basic Judging,Yes


In [None]:
# Clean data

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

# Fetch the NLTK data packages requested by this notebook, in the same order
# as before: the stop-word corpus first, then the "punkt_tab" package.
for resource in ("stopwords", "punkt_tab"):
    nltk.download(resource)

# Text-cleaning helpers.

# Cache for the English stop-word set; filled on the first clean_text() call.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Previously stopwords.words("english") was evaluated for every token
        # and searched as a list; loading it a single time into a set keeps
        # the result identical while making membership tests O(1).
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def clean_text(text):
    """Normalize a question string before vectorization.

    Steps, in order: lower-case the text, delete runs of digits, delete ASCII
    punctuation, tokenize with ``word_tokenize``, drop English stop words and
    join the surviving tokens with single spaces.

    Args:
        text: The raw question string.

    Returns:
        The cleaned, space-separated token string (empty if nothing survives).
    """
    text = text.lower()  # lower-case
    text = re.sub(r'\d+', '', text)  # strip digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
    stop_words = _english_stop_words()
    # Tokenize, then keep only tokens that are not stop words.
    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

# Clean every training question and store the result in a new column
# alongside the raw text.
cleaned_questions = train_set["Question"].map(clean_text)
train_set["Cleaned_Question"] = cleaned_questions

# Preview the first ten rows to compare raw and cleaned questions.
train_set.head(10)

[nltk_data] Downloading package stopwords to /home/liw324/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/liw324/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Question,Type,Answer,Cleaned_Question
0,Are there any buildings in this scene?,Basic Judging,Yes,buildings scene
1,What is the area of buildings?,Basic Counting,0%-10%,area buildings
2,Are there any roads in this scene?,Basic Judging,Yes,roads scene
3,What is the area of roads?,Basic Counting,0%-10%,area roads
4,Is there any water in this scene?,Basic Judging,Yes,water scene
5,What is the area of water?,Basic Counting,0%-10%,area water
6,Is there any barren in this scene?,Basic Judging,Yes,barren scene
7,What is the area of barren?,Basic Counting,0%-10%,area barren
8,Is there any forest in this scene?,Basic Judging,Yes,forest scene
9,What is the area of the forest?,Basic Counting,30%-40%,area forest


In [None]:
# Vectorize text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder

# Turn the cleaned questions into TF-IDF features (max_features=500).
cleaned_questions = train_set["Cleaned_Question"]
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(cleaned_questions)

# Encode the question-type strings as integer class ids.
encoder = LabelEncoder()
question_types = train_set["Type"]
y = encoder.fit_transform(question_types)

# Show which integer id each class name was assigned.
class_names = encoder.classes_
label_mapping = {
    name: code for name, code in zip(class_names, encoder.transform(class_names))
}
print("类别映射:", label_mapping)

类别映射: {'Basic Counting': 0, 'Basic Judging': 1, 'Comprehensive Analysis': 2, 'Object Situation Analysis': 3, 'Reasoning-based Counting': 4, 'Reasoning-based Judging': 5}


In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the vectorized questions for testing; the seed is fixed so
# the split is repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Train a linear-kernel SVM (fit() returns the fitted estimator).
svm = SVC(kernel="linear", C=1.0).fit(X_train, y_train)

# Predict the held-out portion.
y_pred = svm.predict(X_test)

# Report overall accuracy and the per-class metrics.
# NOTE(review): X comes from a TF-IDF vectorizer fitted on all of train_set
# before this split, and the split is random over individual questions, so
# identical question texts can presumably appear on both sides. That may
# explain a perfect score; consider evaluating on val_set. TODO confirm.
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM 分类准确率: {accuracy:.4f}")
report = classification_report(y_test, y_pred, target_names=encoder.classes_)
print(report)

SVM 分类准确率: 1.0000
                           precision    recall  f1-score   support

           Basic Counting       1.00      1.00      1.00      3625
            Basic Judging       1.00      1.00      1.00      3509
   Comprehensive Analysis       1.00      1.00      1.00      2021
Object Situation Analysis       1.00      1.00      1.00       942
 Reasoning-based Counting       1.00      1.00      1.00       477
  Reasoning-based Judging       1.00      1.00      1.00      7060

                 accuracy                           1.00     17634
                macro avg       1.00      1.00      1.00     17634
             weighted avg       1.00      1.00      1.00     17634

