In [1]:
!pip install python-docx


Collecting python-docx
  Obtaining dependency information for python-docx from https://files.pythonhosted.org/packages/3e/3d/330d9efbdb816d3f60bf2ad92f05e1708e4a1b9abe80461ac3444c83f749/python_docx-1.1.2-py3-none-any.whl.metadata
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
   ---------------------------------------- 0.0/244.3 kB ? eta -:--:--
   --------------- ------------------------ 92.2/244.3 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 244.3/244.3 kB 3.7 MB/s eta 0:00:00
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


### 在Nexis上下载的是word格式的文章。
### 文章统一的格式是：第一行标题，第二行报纸名，第三行日期。第四行开始是版权信息，一直到Body这个词，之后是新闻正文。正文结束后是Classification。最后的最后每篇文章以End of Document结束。
### 所以分割的时候提取第一行作为title，从第三行中提取符合日期格式的部分作为date。正文从Body之后开始提取，直到出现Classification为止。
### 把这些内容提取为三列（title、date、body），然后转换成csv。
### 因为下载下来的word分为好多个，所以转化成了好多个csv

一个word中新闻内容的例子：

Women Earn High Marks on a Web Test
The New York Times
July 3, 1997, Thursday, Late Edition - Final


Copyright 1997 The New York Times Company
Distribution: Business/Financial Desk
Section: Section D;; Section D; Page 6; Column 3; Business/Financial Desk; Column 3;
Length: 236 words
Body


Results of a World Wide Web navigating test, released yesterday by the MCI Communications Corporation and the Educational Testing Service, indicated that older women could find their way around the Web better than young men could. 吧啦吧啦....


Classification


Language: ENGLISH

Subject: WOMEN (91%); ACADEMIC TESTING (90%); EDUCATIONAL TESTING SERVICES (78%)

Company: VERIZON COMMUNICATIONS INC  (96%);  EDUCATIONAL TESTING SERVICE  (58%); VERIZON COMMUNICATIONS INC  (96%);  EDUCATIONAL TESTING SERVICE  (58%)

Ticker: VZC (LSE)  (96%);  VZ (NYSE)  (96%)

Industry: NAICS517210 WIRELESS TELECOMMUNICATIONS CARRIERS (EXCEPT SATELLITE)  (96%);  NAICS517110 WIRED TELECOMMUNICATIONS CARRIERS  (96%);  SIC8733 NONCOMMERCIAL RESEARCH ORGANIZATIONS  (58%); INTERNET & WWW (93%); COMPUTER NETWORKS (90%); SEARCH ENGINES (74%)

Person: VINTON CERF (73%)

Load-Date: July 3, 1997


End of Document 


In [29]:
import re
import os
import pandas as pd
from docx import Document

# read word docx
def read_word_file(file_path):
    """Return the text of every paragraph in a Word document, in document order.

    Args:
        file_path: Path of the .docx file; passed unchanged to ``Document``.

    Returns:
        list: One string per paragraph (the paragraph's ``text`` attribute).
    """
    paragraphs = Document(file_path).paragraphs
    return [paragraph.text for paragraph in paragraphs]

# extract topic and content
def extract_articles(content):
    """Split a list of paragraph strings into article records.

    Each article is read as: a title (the first line that does not look like a
    date), a date (the first line that starts with ``Month D, YYYY``), then the
    text between a ``Body`` line and a ``Classification`` line. A line equal to
    ``End of Document`` closes the article.

    Args:
        content: Iterable of paragraph strings, in document order.

    Returns:
        list[dict]: One ``{'title', 'date', 'body'}`` dict per article. ``body``
        is every body paragraph stripped and followed by a single space.

    Notes:
        * While inside the body only the two markers ``Classification`` and
          ``End of Document`` are special; any other paragraph, including one
          that starts with a date, is body text and never changes the date.
        * Only the first date-like header line is used as the article date.
        * ``End of Document`` also leaves body mode, so an article that has no
          ``Classification`` line cannot leak into the next article.
        * A final article that is not followed by ``End of Document`` is still
          returned instead of being dropped.
    """
    # A line that starts with e.g. "July 3, 1997"; only that prefix is captured.
    date_pattern = re.compile(r'^([A-Z][a-z]+\s\d{1,2},\s\d{4})', re.IGNORECASE)

    def new_article():
        return {'title': '', 'date': '', 'body': ''}

    articles = []
    current_article = new_article()
    in_body = False

    for line in content:
        text = line.strip()

        # The end marker always wins, whatever section we are currently in.
        if text == "End of Document":
            if any(current_article.values()):
                articles.append(current_article)
            current_article = new_article()
            in_body = False
            continue

        # Inside the body every paragraph is text until "Classification".
        if in_body:
            if text == "Classification":
                in_body = False
            else:
                current_article['body'] += text + ' '
            continue

        # Header section (before "Body") or trailer (after "Classification").
        date_match = date_pattern.match(line)
        if not current_article['title'] and date_match is None:
            # Blank lines leave the title empty, so the next line is tried again.
            current_article['title'] = text
        elif date_match is not None:
            if not current_article['date']:
                current_article['date'] = date_match.group(1).strip()
        elif text == "Body":
            in_body = True

    # Keep a trailing article whose "End of Document" marker is missing.
    if any(current_article.values()):
        articles.append(current_article)

    return articles

# save as csv
def save_to_csv(articles, output_file):
    """Write article records to ``output_file`` as a UTF-8 CSV without an index.

    Args:
        articles: List of dicts, one per article (keys become the columns).
        output_file: Destination path of the CSV file.

    When ``articles`` is empty the file still gets a ``title,date,body`` header
    row. Without it the file has no header and cannot be read back into the
    same title/date/body columns as the other per-document files.
    """
    if articles:
        df = pd.DataFrame(articles)
    else:
        df = pd.DataFrame(columns=['title', 'date', 'body'])
    df.to_csv(output_file, index=False, encoding='utf-8')

def main(input_dir='D:/news/', output_dir=''):
    """Convert every ``85_us_<number>.docx`` file in ``input_dir`` to a CSV.

    Args:
        input_dir: Directory that is scanned for Word files.
        output_dir: Directory that receives ``internet_us_<number>.csv``. The
            default (empty string) writes to the current working directory.

    For each matching file the paragraphs are read, split into articles and
    saved; one progress line is printed per file.
    """
    # Document names follow 85_<country>_<number>; the extension may be any case.
    file_pattern = re.compile(r'85_us_(\d+)\.DOCX', re.IGNORECASE)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for file_name in os.listdir(input_dir):
        # fullmatch: the whole name must fit, so "85_us_1.docx.bak" is skipped.
        match = file_pattern.fullmatch(file_name)
        if not match:
            continue

        number_suffix = match.group(1)
        input_file = os.path.join(input_dir, file_name)
        output_file = os.path.join(output_dir, f'internet_us_{number_suffix}.csv')

        # Read, split into articles, and save one CSV per Word document.
        content = read_word_file(input_file)
        articles = extract_articles(content)
        save_to_csv(articles, output_file)
        print(f'Processed {file_name} -> {output_file}')

# Entry-point guard: call main() only when this code runs as the top-level
# program (__name__ is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    main()


Processed 85_us_1.DOCX -> internet_us_1.csv
Processed 85_us_10.DOCX -> internet_us_10.csv
Processed 85_us_11.DOCX -> internet_us_11.csv
Processed 85_us_12.DOCX -> internet_us_12.csv
Processed 85_us_13.DOCX -> internet_us_13.csv
Processed 85_us_14.DOCX -> internet_us_14.csv
Processed 85_us_15.DOCX -> internet_us_15.csv
Processed 85_us_16.DOCX -> internet_us_16.csv
Processed 85_us_17.DOCX -> internet_us_17.csv
Processed 85_us_2.DOCX -> internet_us_2.csv
Processed 85_us_20.DOCX -> internet_us_20.csv
Processed 85_us_21.DOCX -> internet_us_21.csv
Processed 85_us_22.DOCX -> internet_us_22.csv
Processed 85_us_3.DOCX -> internet_us_3.csv
Processed 85_us_4.DOCX -> internet_us_4.csv
Processed 85_us_5.DOCX -> internet_us_5.csv
Processed 85_us_6.DOCX -> internet_us_6.csv
Processed 85_us_7.DOCX -> internet_us_7.csv
Processed 85_us_8.DOCX -> internet_us_8.csv
Processed 85_us_9.DOCX -> internet_us_9.csv


### 然后把这些csv合成了同一个

In [31]:
import pandas as pd
import glob

# combined all the csv
def merge_csv_files(file_paths):
    """Read several CSV files and stack them into one DataFrame.

    Args:
        file_paths: Iterable of CSV paths; rows are stacked in this order.

    Returns:
        pandas.DataFrame: All rows with a fresh 0..n-1 index and a leading
        ``id`` column numbered 1..n in the stacked order.

    Raises:
        ValueError: If ``file_paths`` is empty (nothing to merge).
    """
    # Materialize first so generators work and emptiness can be checked.
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError('No CSV files to merge: file_paths is empty')

    frames = [pd.read_csv(path) for path in file_paths]
    all_data = pd.concat(frames, ignore_index=True)

    # Running 1-based id as the first column.
    all_data.insert(0, 'id', range(1, len(all_data) + 1))

    return all_data

# save the combined csv
def save_merged_csv(merged_data, output_file):
    """Write ``merged_data`` to ``output_file`` as UTF-8 CSV, omitting the index."""
    csv_options = {'index': False, 'encoding': 'utf-8'}
    merged_data.to_csv(output_file, **csv_options)


def main(input_dir='C:/Users/70794/news/us/', output_file='internet_us.csv'):
    """Merge the per-document ``internet_us_<number>.csv`` files into one CSV.

    Args:
        input_dir: Directory that holds the per-document CSV files.
        output_file: Path of the merged CSV to write.

    Only files named ``internet_us_<number>.csv`` are merged. A plain ``*.csv``
    glob would also pick up other CSV files in the same directory, such as a
    previously written ``internet_us.csv``, and feed them back into the merge
    on a re-run. Paths are sorted as strings so the generated ids do not depend
    on the order in which the file system lists the directory.
    """
    # Local imports keep this cell runnable on its own.
    import os
    import re

    part_name = re.compile(r'internet_us_\d+\.csv', re.IGNORECASE)
    candidates = glob.glob(os.path.join(input_dir, '*.csv'))
    file_paths = sorted(
        path for path in candidates
        if part_name.fullmatch(os.path.basename(path))
    )

    merged_data = merge_csv_files(file_paths)

    save_merged_csv(merged_data, output_file)
    print(f"csv are saved in: {output_file}")

# Entry-point guard: call main() only when this code runs as the top-level
# program (__name__ is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    main()


csv are saved in: internet_us.csv


### 然后把date拆成了日、月、年，保存到了新的csv。新的csv在新的notebook里处理。
### 这个notebook里只有美国的处理步骤。两个报纸的文档命名只有后缀名不一样，中国的没有保存步骤。

In [33]:
import pandas as pd

# read the combined csv
def process_csv(file_path, output_file):
    """Replace the ``date`` column of a CSV with ``day``, ``year`` and ``month``.

    Args:
        file_path: CSV to read; must contain a ``date`` column holding text
            such as ``July 3, 1997``.
        output_file: Path of the CSV to write.

    The three new columns are appended after the remaining columns, in the
    order day, year, month, and hold whole numbers. Rows whose date is missing
    or does not match the pattern get empty cells. Nullable integers are used
    so one missing date does not turn the other rows' months into ``7.0``.
    """
    df = pd.read_csv(file_path)

    # Same separators (\s) as the pattern used when the date was extracted.
    # astype(str) keeps .str usable even when every date is missing.
    date_parts = df['date'].astype(str).str.extract(r'([A-Za-z]+)\s(\d{1,2}),\s(\d{4})')
    date_parts.columns = ['month_name', 'day', 'year']

    month_mapping = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
        'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
    }
    month_numbers = date_parts['month_name'].str.lower().map(month_mapping)

    def to_nullable_int(values):
        # Unparseable or missing entries become <NA> rather than forcing floats.
        return pd.to_numeric(values, errors='coerce').astype('Int64')

    result = df.drop(columns=['date']).assign(
        day=to_nullable_int(date_parts['day']),
        year=to_nullable_int(date_parts['year']),
        month=to_nullable_int(month_numbers),
    )

    result.to_csv(output_file, index=False, encoding='utf-8')
    print(f"processed csv save to: {output_file}")

# Main entry point
def main():
    """Split the date column of the merged US CSV and write the cleaned CSV."""
    process_csv(
        'C:/Users/70794/news/us/internet_us.csv',
        'internet_us_clean.csv',
    )

# Entry-point guard: call main() only when this code runs as the top-level
# program (__name__ is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    main()


processed csv save to: internet_us_clean.csv
