!python -m spacy download en_core_web_md

In [1]:
import spacy
import os
import pandas as pd
import calendar
import re

In [2]:
# Run spaCy NER over every labelled ``.story`` file, keep four entity
# categories per document, then prune date mentions that are too vague.
nlp = spacy.load("en_core_web_md")

folder_path = "/Users/irisgong/Desktop/ISE540 Text Analytics/project/data/label/label_story"

retrieved_documents = {}

for filename in os.listdir(folder_path):
    # Guard clause: anything that is not a story file is ignored.
    if not filename.endswith(".story"):
        continue

    file_path = os.path.join(folder_path, filename)
    with open(file_path, "r", encoding="utf-8") as file:
        document = file.read()

    # One set per output category; sets de-duplicate repeated mentions.
    people, organizations, locations, dates = set(), set(), set(), set()
    bucket_by_label = {
        "PERSON": people,
        "ORG": organizations,
        "LOC": locations,   # LOC and GPE are both reported as locations
        "GPE": locations,
        "DATE": dates,
    }

    doc = nlp(document)
    for ent in doc.ents:
        bucket = bucket_by_label.get(ent.label_)
        if bucket is None:
            continue
        # Normalise the mention: drop possessive 's, newlines and
        # non-breaking spaces.
        text = re.sub(r"'s\b", '', ent.text)
        text = text.replace('\n', '').replace('\xa0', '')
        bucket.add(text)

    retrieved_documents[filename] = {
        'People': people,
        'Organizations': organizations,
        'Locations': locations,
        'Dates': dates
    }

# Re-key in ascending filename order.
retrieved_documents = {name: retrieved_documents[name] for name in sorted(retrieved_documents)}


# Date clean-up rules, applied to every document in a single pass:
#   1. bare weekday names are dropped (exact match),
#   2. mentions containing any of the vague phrases are dropped
#      (case-insensitive substring match),
#   3. mentions that are just a single month name are dropped.
unwanted_dates = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",}
unwanted_phrases = {"year", "month", "week", "day", "old", "annual", "season", "daily", "decade", "decades", "this", "next", "last", "more"}
months_to_remove = [month.lower() for month in calendar.month_name[1:]]

for filename, result in retrieved_documents.items():
    kept_dates = set()
    for date in result['Dates']:
        lowered = date.lower()
        if date in unwanted_dates:
            continue
        if any(word in lowered for word in unwanted_phrases):
            continue
        if lowered in months_to_remove and len(date.split()) == 1:
            continue
        kept_dates.add(date)
    result['Dates'] = kept_dates

In [4]:
# Ground truth: read the hand-labelled entities from the annotation workbook
# and apply the same date clean-up rules used for the extracted entities.

excel_path = "/Users/irisgong/Desktop/ISE540 Text Analytics/project/data/label/label.xlsx"
df = pd.read_excel(excel_path)

# Output category -> workbook column holding its comma-separated labels.
column_by_category = {
    'People': 'person',
    'Organizations': 'organization',
    'Locations': 'location',
    'Dates': 'date',
}

ground_truth = {}

for _, row in df.iterrows():
    labels = {}
    for category, column in column_by_category.items():
        cell = row[column]
        # Missing cells become an empty string, which yields an empty set.
        raw = str(cell) if pd.notna(cell) else ""
        # Split on commas (with or without a following space), trim each
        # piece and discard empty pieces.
        labels[category] = {piece.strip() for piece in raw.split(',') if piece.strip()}
    ground_truth[row['id']] = labels

# Re-key in ascending id order.
ground_truth = dict(sorted(ground_truth.items(), key=lambda item: item[0]))


# Date clean-up rules, applied to every document in a single pass:
#   1. bare weekday names are dropped (exact match),
#   2. mentions containing any of the vague phrases are dropped
#      (case-insensitive substring match),
#   3. mentions that are just a single month name are dropped.
unwanted_dates = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",}
unwanted_phrases = {"year", "month", "week", "day", "old", "annual", "season", "daily", "decade", "decades", "this", "next", "last", "more"}
months_to_remove = [month.lower() for month in calendar.month_name[1:]]

for filename, result in ground_truth.items():
    kept_dates = set()
    for date in result['Dates']:
        lowered = date.lower()
        if date in unwanted_dates:
            continue
        if any(word in lowered for word in unwanted_phrases):
            continue
        if lowered in months_to_remove and len(date.split()) == 1:
            continue
        kept_dates.add(date)
    result['Dates'] = kept_dates

In [5]:
# Export the extracted entities to an Excel file, one row per document.
# After the transpose the filenames are the row index and the entity
# categories are the columns.
df = pd.DataFrame(retrieved_documents)
df_transposed = df.transpose()

excel_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/retrieved.xlsx'
# The per-document dicts here carry no ID field, so the filename index is the
# only thing that ties a row to its document. Write it out (as 'File_ID')
# instead of discarding it with index=False.
df_transposed.to_excel(excel_path, index=True, index_label='File_ID')


print(f'DataFrame successfully exported to {excel_path}')

DataFrame已成功导出到 /Users/irisgong/Desktop/ISE540 Text Analytics/project/retrieved.xlsx


In [7]:
# Export the ground-truth labels to an Excel file, one row per document.
# After the transpose the document ids are the row index and the entity
# categories are the columns.
df2 = pd.DataFrame(ground_truth)
df2_transposed = df2.transpose()

excel_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/ground_truth.xlsx'
# The per-document dicts here carry no ID field, so the id index is the only
# thing that ties a row to its document. Write it out (as 'File_ID') instead
# of discarding it with index=False.
df2_transposed.to_excel(excel_path, index=True, index_label='File_ID')


print(f'DataFrame successfully exported to {excel_path}')

DataFrame已成功导出到 /Users/irisgong/Desktop/ISE540 Text Analytics/project/ground_truth.xlsx


In [8]:
def calculate_precision_recall_f1_for_file(ground_truth, retrieved_documents, file_id, label):
    """Score one entity category of one document by exact set matching.

    Args:
        ground_truth: Mapping of file id -> {category -> iterable of labels}.
        retrieved_documents: Mapping with the same layout holding the
            extracted labels. A file id missing here is scored as if nothing
            had been extracted.
        file_id: Key of the document to score. It must exist in
            ``ground_truth``.
        label: Category name to score (e.g. ``'People'``). A category missing
            from either side is treated as empty.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Any measure whose
        denominator is zero is reported as ``0.0``.

    Raises:
        KeyError: If ``file_id`` is not a key of ``ground_truth``.
    """
    ground_truth_labels = set(ground_truth[file_id].get(label, []))
    retrieved_labels = set(retrieved_documents.get(file_id, {}).get(label, []))

    true_positives = len(ground_truth_labels & retrieved_labels)
    false_positives = len(retrieved_labels - ground_truth_labels)
    false_negatives = len(ground_truth_labels - retrieved_labels)

    predicted_total = true_positives + false_positives
    actual_total = true_positives + false_negatives

    # The fallbacks are 0.0 (not the int 0) so all three values are always
    # floats; mixing ints and floats leaks into the dtype of any table built
    # from these results.
    precision = true_positives / predicted_total if predicted_total != 0 else 0.0
    recall = true_positives / actual_total if actual_total != 0 else 0.0
    F1_measure = (2 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0.0

    return precision, recall, F1_measure


# Build the per-file metrics table: one row per ground-truth document and
# precision / recall / F1 columns for each entity category.
result_columns = [
    'File_ID',
    'Precision_Person', 'Recall_Person', 'F1_Person',
    'Precision_Org', 'Recall_Org', 'F1_Org',
    'Precision_Loc', 'Recall_Loc', 'F1_Loc',
    'Precision_Date', 'Recall_Date', 'F1_Date'
]

# Entity category (key in the per-document dicts) -> column-name suffix.
suffix_by_label = {
    'People': 'Person',
    'Organizations': 'Org',
    'Locations': 'Loc',
    'Dates': 'Date',
}

# Collect plain dict records and build the frame once at the end. Growing a
# DataFrame with pd.concat inside the loop copies the whole frame on every
# iteration, and concatenating onto an empty frame leaves the metric columns
# with object dtype.
records = []
for file_id in ground_truth.keys():
    record = {'File_ID': file_id}
    for label, suffix in suffix_by_label.items():
        precision, recall, f1 = calculate_precision_recall_f1_for_file(
            ground_truth, retrieved_documents, file_id, label
        )
        record[f'Precision_{suffix}'] = precision
        record[f'Recall_{suffix}'] = recall
        record[f'F1_{suffix}'] = f1
    records.append(record)

# Passing columns= fixes the column order and keeps the columns present even
# when there are no records.
result_df = pd.DataFrame(records, columns=result_columns)

result_df

Unnamed: 0,File_ID,Precision_Person,Recall_Person,F1_Person,Precision_Org,Recall_Org,F1_Org,Precision_Loc,Recall_Loc,F1_Loc,Precision_Date,Recall_Date,F1_Date
0,0000800d9058217f6509d7e63ad475e2de0da611.story,0.900000,0.692308,0.782609,0.000000,0.000000,0,0.777778,0.875000,0.823529,0.000000,0,0
1,0001d4ce3598e37f20a47fe609736f72e5d73467.story,0.583333,0.700000,0.636364,0.090909,0.200000,0.125,0.555556,0.833333,0.666667,0.000000,0,0
2,0002067d13d3ca304e0bc98d04dde85d4091c55e.story,0.500000,0.571429,0.533333,0.400000,1.000000,0.571429,0.400000,0.666667,0.500000,0.666667,0.8,0.727273
3,000219931d2c3aae55dc2acdc5f690d0c112ab17.story,0.538462,0.777778,0.636364,0.333333,0.333333,0.333333,0.600000,1.000000,0.750000,0.250000,0.5,0.333333
4,00022dbfa44ccdb94c1dc06938047e258076cf75.story,0.666667,1.000000,0.800000,0.400000,1.000000,0.571429,0.200000,1.000000,0.333333,0.000000,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,ff6bb1e8a47fdd6c32f14201b7aa50dd7720a9ca.story,0.700000,0.700000,0.700000,0.555556,0.714286,0.625,0.400000,1.000000,0.571429,0.230769,0.428571,0.3
246,ff95cbf06bc9e4b2b0c4bb7b46794e63cacaa834.story,0.500000,1.000000,0.666667,0.833333,0.909091,0.869565,0.222222,1.000000,0.363636,1.000000,1.0,1.0
247,ffcbb4742e7df96316bda9385d8ec14078aa5b3f.story,0.764706,0.866667,0.812500,0.500000,0.500000,0.5,0.000000,0.000000,0.000000,0.157895,0.142857,0.15
248,ffd480659edff188a04fbd2114b0f63113669407.story,1.000000,1.000000,1.000000,0.000000,0.000000,0,0.600000,1.000000,0.750000,0.000000,0,0


In [9]:
# Write the per-file metrics table to disk. The positional row index is
# omitted from the sheet.
excel_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/spaCy_output.xlsx'
result_df.to_excel(
    excel_path,
    index=False,
)

print(f'DataFrame successfully exported to {excel_path}')

DataFrame已成功导出到 /Users/irisgong/Desktop/ISE540 Text Analytics/project/spaCy_output.xlsx


In [10]:
# Report the mean of each metric column across all evaluated files.
# Columns 1..12 are the metric columns; column 0 is skipped.
print("SpaCy")
metric_columns = result_df.columns[1:13]
for metric_name in metric_columns:
    metric_mean = result_df[metric_name].mean()
    print(metric_name, ":", metric_mean)

SpaCy
Precision_Person : 0.6397048998184143
Recall_Person : 0.8271890973639272
F1_Person : 0.7031142158508964
Precision_Org : 0.4033081500852089
Recall_Org : 0.523700027755465
F1_Org : 0.4278620331323703
Precision_Loc : 0.4807975915440015
Recall_Loc : 0.6763770567001056
F1_Loc : 0.5332335667494131
Precision_Date : 0.3504062228220123
Recall_Date : 0.47554742664742666
F1_Date : 0.38467598637583833


用上述模型跑完30w个文档，储存extracted entity到单独的文档(csv:id, entity)

In [12]:
# Extract CNN - date
# Run spaCy NER over every CNN ``.story`` file, keep only DATE entities, then
# prune date mentions that are too vague.
folder_path = "/Users/irisgong/Desktop/ISE540 Text Analytics/project/data/原数据/cnn/stories"

retrieved_documents_cnn = {}


for filename in os.listdir(folder_path):
    if filename.endswith(".story"):
        file_path = os.path.join(folder_path, filename)

        # Read the text first so the file is closed before the NLP pass.
        with open(file_path, "r", encoding="utf-8") as file:
            document = file.read()

        dates = set()
        doc = nlp(document)
        # Document id is the filename without its '.story' suffix.
        file_id = filename.split('.story')[0]

        for ent in doc.ents:
            if ent.label_ != "DATE":
                continue
            # Normalise the mention: drop possessive 's, newlines and
            # non-breaking spaces.
            text = re.sub(r"'s\b", '', ent.text)
            text = text.replace('\n', '')
            text = text.replace('\xa0', '')
            dates.add(text)

        # The comma after the 'ID' entry was missing, which made this whole
        # cell a SyntaxError.
        result = {
            'ID': file_id,
            'Dates': dates
        }

        retrieved_documents_cnn[filename] = result


retrieved_documents_cnn = dict(sorted(retrieved_documents_cnn.items(), key=lambda item: item[0]))


# Drop bare weekday names (exact match)
unwanted_dates = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",}

for filename, result in retrieved_documents_cnn.items():
    result['Dates'] -= unwanted_dates


# Drop mentions containing any vague phrase (case-insensitive substring match)
unwanted_phrases = {"year", "month", "week", "day", "old", "annual", "season", "daily", "decade", "decades", "this", "next", "last", "more"}

for filename, result in retrieved_documents_cnn.items():
    result['Dates'] = {date for date in result['Dates'] if not any(word in date.lower() for word in unwanted_phrases)}


# Drop mentions that consist of a single month name only
months_to_remove = [month.lower() for month in calendar.month_name[1:]]

for filename, result in retrieved_documents_cnn.items():
    result['Dates'] = {date for date in result['Dates'] if not (date.lower() in months_to_remove and len(date.split()) == 1)}

In [14]:
# Write the CNN date entities to an Excel file.
# The dict-of-dicts frame has one column per document, so it is flipped to get
# one row per document; the filename index is not written to the sheet.
df_cnn = pd.DataFrame(retrieved_documents_cnn)
df_cnn_transposed = df_cnn.T

excel_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/retrieved_date_cnn.xlsx'
df_cnn_transposed.to_excel(
    excel_path,
    index=False,
)


print(f'DataFrame successfully exported to {excel_path}')

DataFrame已成功导出到 /Users/irisgong/Desktop/ISE540 Text Analytics/project/retrieved_date_cnn.xlsx


In [17]:
# Extract Daily Mail - date
# Run spaCy NER over every Daily Mail ``.story`` file, keep only DATE
# entities, then prune date mentions that are too vague.
folder_path = "/Users/irisgong/Desktop/ISE540 Text Analytics/project/data/原数据/dailymail/stories"

retrieved_documents_dm = {}


for filename in os.listdir(folder_path):
    if filename.endswith(".story"):
        file_path = os.path.join(folder_path, filename)

        # Read the text first so the file is closed before the NLP pass.
        with open(file_path, "r", encoding="utf-8") as file:
            document = file.read()

        dates = set()
        doc = nlp(document)
        # Document id is the filename without its '.story' suffix.
        file_id = filename.split('.story')[0]

        for ent in doc.ents:
            if ent.label_ != "DATE":
                continue
            # Normalise the mention: drop possessive 's, newlines and
            # non-breaking spaces.
            text = re.sub(r"'s\b", '', ent.text)
            text = text.replace('\n', '')
            text = text.replace('\xa0', '')
            dates.add(text)

        # The comma after the 'ID' entry was missing, which made this whole
        # cell a SyntaxError.
        result = {
            'ID': file_id,
            'Dates': dates
        }

        retrieved_documents_dm[filename] = result


retrieved_documents_dm = dict(sorted(retrieved_documents_dm.items(), key=lambda item: item[0]))


# Drop bare weekday names (exact match)
unwanted_dates = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",}

for filename, result in retrieved_documents_dm.items():
    result['Dates'] -= unwanted_dates


# Drop mentions containing any vague phrase (case-insensitive substring match)
unwanted_phrases = {"year", "month", "week", "day", "old", "annual", "season", "daily", "decade", "decades", "this", "next", "last", "more"}

for filename, result in retrieved_documents_dm.items():
    result['Dates'] = {date for date in result['Dates'] if not any(word in date.lower() for word in unwanted_phrases)}


# Drop mentions that consist of a single month name only
months_to_remove = [month.lower() for month in calendar.month_name[1:]]

for filename, result in retrieved_documents_dm.items():
    result['Dates'] = {date for date in result['Dates'] if not (date.lower() in months_to_remove and len(date.split()) == 1)}

In [18]:
# Write the Daily Mail date entities to an Excel file.
# The dict-of-dicts frame has one column per document, so it is flipped to get
# one row per document; the filename index is not written to the sheet.
df_dm = pd.DataFrame(retrieved_documents_dm)
df_dm_transposed = df_dm.T

excel_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/retrieved_date_dm.xlsx'
df_dm_transposed.to_excel(
    excel_path,
    index=False,
)


print(f'DataFrame已成功导出到 {excel_path}')

DataFrame已成功导出到 /Users/irisgong/Desktop/ISE540 Text Analytics/project/retrieved_date_dm.xlsx


In [None]:
# Extract Daily Mail - people&org
# Run spaCy NER over every Daily Mail ``.story`` file and keep the PERSON and
# ORG entities of each document.
folder_path = "/Users/irisgong/Desktop/ISE540 Text Analytics/project/data/原数据/dailymail/stories"

retrieved_documents_dm = {}

for filename in os.listdir(folder_path):
    # Guard clause: anything that is not a story file is ignored.
    if not filename.endswith(".story"):
        continue

    file_path = os.path.join(folder_path, filename)
    with open(file_path, "r", encoding="utf-8") as file:
        document = file.read()

    # One set per output category; sets de-duplicate repeated mentions.
    people, organizations = set(), set()
    bucket_by_label = {"PERSON": people, "ORG": organizations}

    doc = nlp(document)
    # Document id is the filename without its '.story' suffix.
    file_id = filename.split('.story')[0]

    for ent in doc.ents:
        bucket = bucket_by_label.get(ent.label_)
        if bucket is None:
            continue
        # Normalise the mention: drop possessive 's, newlines and
        # non-breaking spaces.
        text = re.sub(r"'s\b", '', ent.text)
        text = text.replace('\n', '').replace('\xa0', '')
        bucket.add(text)

    retrieved_documents_dm[filename] = {
        'ID': file_id,
        'People': people,
        'Organizations': organizations
    }

# Re-key in ascending filename order.
retrieved_documents_dm = {name: retrieved_documents_dm[name] for name in sorted(retrieved_documents_dm)}

In [None]:
# Write the Daily Mail people/organization entities to an Excel file.
# The dict-of-dicts frame has one column per document, so it is flipped to get
# one row per document; the filename index is not written to the sheet.
df_dm = pd.DataFrame(retrieved_documents_dm)
df_dm_transposed = df_dm.T

excel_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/retrieved_people&org_dm.xlsx'
df_dm_transposed.to_excel(
    excel_path,
    index=False,
)


print(f'DataFrame已成功导出到 {excel_path}')

In [26]:
# Collect the document id of every CNN story file (no NLP pass here).
folder_path = '/Users/irisgong/Desktop/ISE540 Text Analytics/project/data/原数据/cnn/stories'

retrieved_documents_cnn_id = {}

for filename in os.listdir(folder_path):
    # Only story files are documents; this matches the other extraction cells
    # and keeps stray directory entries out of the result.
    if not filename.endswith('.story'):
        continue

    # Document id is the filename without its '.story' suffix. It was
    # previously computed into a variable named `id` (shadowing the builtin)
    # and never used, while the full filename was stored under 'ID'.
    file_id = filename.split('.story')[0]

    # Store the result in the main dictionary
    retrieved_documents_cnn_id[filename] = {'ID': file_id}

# Sort retrieved documents by filename
retrieved_documents_cnn_id = dict(sorted(retrieved_documents_cnn_id.items(), key=lambda item: item[0]))