!python -m spacy download en_core_web_md

In [2]:
import spacy
import os
import pandas as pd
import calendar

In [3]:
# 加载语言模型
nlp = spacy.load("en_core_web_md")

# 文件夹路径
folder_path = "/Users/irisgong/Desktop/ISE540 Text Analytics/project/data/label/label_story"

retrieved_documents = {}


for filename in os.listdir(folder_path):
    if filename.endswith(".story"):
        file_path = os.path.join(folder_path, filename)

        # 读取文件内容
        with open(file_path, "r", encoding="utf-8") as file:
            document = file.read()
            
            people = set()
            organizations = set()
            locations = set()
            dates = set()
            doc = nlp(document)
            
            for ent in doc.ents:
                if ent.label_ == "PERSON":
                    people.add(ent.text)
                elif ent.label_ == "ORG":
                    organizations.add(ent.text)
                elif ent.label_ == "LOC":
                    locations.add(ent.text)
                elif ent.label_ == "GPE":
                    locations.add(ent.text)
                elif ent.label_ == "DATE":
                    dates.add(ent.text)
            
            # 创建当前文件的字典项
            result = {
                'People': people,
                'Organizations': organizations,
                'Locations': locations,
                'Dates': dates
            }
            
            # 将当前文件的结果存储到总的字典中
            retrieved_documents[filename] = result
            
            
retrieved_documents = dict(sorted(retrieved_documents.items(), key=lambda item: item[0]))


# 要删除的日期标签
unwanted_dates = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",}

for filename, result in retrieved_documents.items():
    result['Dates'] -= unwanted_dates
    

# 要删除的词组
unwanted_phrases = {"year", "month", "week", "day", "old", "annual", "season", "daily", "decade", "decades", "this", "next", "last", "more"}

for filename, result in retrieved_documents.items():
    result['Dates'] = {date for date in result['Dates'] if not any(word in date.lower() for word in unwanted_phrases)}
    
    
# 删除只含有一个月份单词的日期标签
months_to_remove = [month.lower() for month in calendar.month_name[1:]]

for filename, result in retrieved_documents.items():
    result['Dates'] = {date for date in result['Dates'] if not (date.lower() in months_to_remove and len(date.split()) == 1)}

In [4]:
# Ground truth

excel_path = "/Users/irisgong/Desktop/ISE540 Text Analytics/project/data/label/label.xlsx"
df = pd.read_excel(excel_path)

ground_truth = {}

for index, row in df.iterrows():

    file_id = row['id']
    
    # 如果逗号后没有空格，则加上空格
    people_raw = str(row['person']) if pd.notna(row['person']) else ""
    people_raw = ', '.join([p.strip() for p in people_raw.split(',')])
    people = set([p.strip() for p in people_raw.split(', ') if p.strip()])
    
    organizations_raw = str(row['organization']) if pd.notna(row['organization']) else ""
    organizations_raw = ', '.join([o.strip() for o in organizations_raw.split(',')])
    organizations = set([o.strip() for o in organizations_raw.split(', ') if o.strip()])
    
    locations_raw = str(row['location']) if pd.notna(row['location']) else ""
    locations_raw = ', '.join([l.strip() for l in locations_raw.split(',')])
    locations = set([l.strip() for l in locations_raw.split(', ') if l.strip()])

    dates_raw = str(row['date']) if pd.notna(row['date']) else ""
    dates_raw = ', '.join([d.strip() for d in dates_raw.split(',')])
    dates = set([d.strip() for d in dates_raw.split(', ') if d.strip()])


    result_1 = {
        'People': people,
        'Organizations': organizations,
        'Locations': locations,
        'Dates': dates
    }

    ground_truth[file_id] = result_1


ground_truth = dict(sorted(ground_truth.items()))


# 要删除的日期标签
unwanted_dates = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",}

for filename, result in ground_truth.items():
    result['Dates'] -= unwanted_dates
    

# 要删除的词组
unwanted_phrases = {"year", "month", "week", "day", "old", "annual", "season", "daily", "decade", "decades", "this", "next", "last", "more"}

for filename, result in ground_truth.items():
    result['Dates'] = {date for date in result['Dates'] if not any(word in date.lower() for word in unwanted_phrases)}
    
    
# 删除只含有一个月份单词的日期标签
months_to_remove = [month.lower() for month in calendar.month_name[1:]]

for filename, result in ground_truth.items():
    result['Dates'] = {date for date in result['Dates'] if not (date.lower() in months_to_remove and len(date.split()) == 1)}

In [5]:
retrieved_documents

{'0000800d9058217f6509d7e63ad475e2de0da611.story': {'People': {'Charlie Pugsley',
   'Evelyn Raywood',
   'Jake',
   'Jean Booth',
   'Kim Taylor',
   'Laura Baty',
   'Philip\n Le Shirley',
   'Richard Spillett',
   'Taylor',
   'Yasin\n Patel',
   'Yasin Patel'},
  'Organizations': {'Ford',
   'Prestige Vaping',
   'the Royal Society for the Prevention of Accidents',
   'the Wythenshawe Hospital'},
  'Locations': {'Chesterfield',
   'Derbyshire',
   'East London',
   'Fire\n ',
   'Leicester',
   'London',
   'Manchester',
   'North Yorkshire',
   'Richmond'},
  'Dates': {'11', '18', '26', '54'}},
 '0001d4ce3598e37f20a47fe609736f72e5d73467.story': {'People': {'Cameron Atfield',
   'Charles\n Vacca',
   'Charles \nVaca',
   'Charles Vacca',
   'Jim McCabe',
   'Liz\n Matthews',
   'McCabe\n ',
   'Mike Murphy',
   'Nikki Bateman',
   'Sam\n Scarmardo',
   'Sam Scarmardo',
   'Scarmardo',
   'Vaca',
   'Vacca'},
  'Organizations': {'Army',
   'Burgers',
   'Burgers Adventure',
   'ESPN

In [6]:
# 导出为 Excel 文件
df = pd.DataFrame(retrieved_documents)
df_transposed = df.transpose()

excel_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/retrieved.xlsx'
df_transposed.to_excel(excel_path, index=False)

# 打印成功消息
print(f'DataFrame已成功导出到 {excel_path}')

DataFrame已成功导出到 /Users/irisgong/Desktop/ISE540 Text Analytics/project/retrieved.xlsx


In [7]:
ground_truth

{'0000800d9058217f6509d7e63ad475e2de0da611.story': {'People': {'Barmaid Laura Baty',
   'Carer Kim Taylor',
   'Charlie Pugsley',
   'Evelyn Raywood',
   'Fire chiefs',
   'Jake',
   'Jean Booth',
   'Kim Taylor',
   'Laura Baty',
   'London fire chiefs',
   'Philip Le Shirley',
   'Richard Spillett',
   'Yasin Patel'},
  'Organizations': {'Ford Mondeo',
   'Prestige Vaping， the Royal Society for the Prevention of Accidents'},
  'Locations': {'Chesterfield',
   'Derbyshire',
   'East London',
   'Leicester',
   'Manchester',
   'North Yorkshire',
   'Richmond',
   'Wythenshawe Hospital'},
  'Dates': set()},
 '0001d4ce3598e37f20a47fe609736f72e5d73467.story': {'People': {'Cameron Atfield',
   'Charles Vacca',
   'Jim McCabe',
   'Liz Matthews',
   'Mike Murphy',
   'Mohave County Sheriff',
   'Nikki Bateman',
   'Sam Scarmardo',
   'nine-year-old girl',
   'shooting instructor'},
  'Organizations': {'Arizona shooting range',
   'John Clayton Show',
   'Las Vegas Review Journal',
   'MSNB

In [8]:
# 导出为 Excel 文件
df2 = pd.DataFrame(ground_truth)
df2_transposed = df2.transpose()

excel_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/ground_truth.xlsx'
df2_transposed.to_excel(excel_path, index=False)

# 打印成功消息
print(f'DataFrame已成功导出到 {excel_path}')

DataFrame已成功导出到 /Users/irisgong/Desktop/ISE540 Text Analytics/project/ground_truth.xlsx


In [9]:
def calculate_precision_recall_for_file(ground_truth, retrieved_documents, file_id, label):
    ground_truth_labels = set(ground_truth[file_id].get(label, []))
    retrieved_labels = set(retrieved_documents.get(file_id, {}).get(label, []))

    true_positives = len(ground_truth_labels & retrieved_labels)
    false_positives = len(retrieved_labels - ground_truth_labels)
    false_negatives = len(ground_truth_labels - retrieved_labels)

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0

    return precision, recall


result_df = pd.DataFrame(columns=[
    'File_ID',
    'Precision_Person', 'Recall_Person',
    'Precision_Org', 'Recall_Org',
    'Precision_Loc', 'Recall_Loc',
    'Precision_Date', 'Recall_Date'
])

# 遍历每个文档
for file_id in ground_truth.keys():
    # 计算每个标签的精确度和召回率
    precision_person, recall_person = calculate_precision_recall_for_file(ground_truth, retrieved_documents, file_id, 'People')
    precision_org, recall_org = calculate_precision_recall_for_file(ground_truth, retrieved_documents, file_id, 'Organizations')
    precision_loc, recall_loc = calculate_precision_recall_for_file(ground_truth, retrieved_documents, file_id, 'Locations')
    precision_date, recall_date = calculate_precision_recall_for_file(ground_truth, retrieved_documents, file_id, 'Dates')

    # Append the results to the DataFrame
    result_df = pd.concat([result_df, pd.DataFrame({
        'File_ID': [file_id],
        'Precision_Person': [precision_person],
        'Recall_Person': [recall_person],
        'Precision_Org': [precision_org],
        'Recall_Org': [recall_org],
        'Precision_Loc': [precision_loc],
        'Recall_Loc': [recall_loc],
        'Precision_Date': [precision_date],
        'Recall_Date': [recall_date]
    })], ignore_index=True)

    

result_df

Unnamed: 0,File_ID,Precision_Person,Recall_Person,Precision_Org,Recall_Org,Precision_Loc,Recall_Loc,Precision_Date,Recall_Date
0,0000800d9058217f6509d7e63ad475e2de0da611.story,0.727273,0.615385,0.000000,0.000000,0.777778,0.875000,0.000000,0
1,0001d4ce3598e37f20a47fe609736f72e5d73467.story,0.428571,0.600000,0.090909,0.200000,0.555556,0.833333,0.000000,0
2,0002067d13d3ca304e0bc98d04dde85d4091c55e.story,0.500000,0.571429,0.400000,1.000000,0.400000,0.666667,0.666667,0.8
3,000219931d2c3aae55dc2acdc5f690d0c112ab17.story,0.538462,0.777778,0.333333,0.333333,0.600000,1.000000,0.250000,0.5
4,00022dbfa44ccdb94c1dc06938047e258076cf75.story,0.571429,1.000000,0.400000,1.000000,0.200000,1.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...
245,ff6bb1e8a47fdd6c32f14201b7aa50dd7720a9ca.story,0.700000,0.700000,0.555556,0.714286,0.400000,1.000000,0.230769,0.428571
246,ff95cbf06bc9e4b2b0c4bb7b46794e63cacaa834.story,0.500000,1.000000,0.833333,0.909091,0.222222,1.000000,1.000000,1.0
247,ffcbb4742e7df96316bda9385d8ec14078aa5b3f.story,0.777778,0.933333,0.500000,0.500000,0.000000,0.000000,0.157895,0.142857
248,ffd480659edff188a04fbd2114b0f63113669407.story,1.000000,1.000000,0.000000,0.000000,0.600000,1.000000,0.000000,0


In [11]:
# 导出为 Excel 文件
excel_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/precision&recall.xlsx'
result_df.to_excel(excel_path, index=False)

# 打印成功消息
print(f'DataFrame已成功导出到 {excel_path}')

DataFrame已成功导出到 /Users/irisgong/Desktop/ISE540 Text Analytics/project/precision&recall.xlsx
