In [137]:
import os
import sys
import chardet


def read_text(path: str) -> str | None:
    try:
        if not os.path.exists(path):
            return None

        with open(path, "rb") as file:
            data = file.read()

        encoding = chardet.detect(data)["encoding"]
        return data.decode(encoding)
    except Exception as e:
        return None

In [138]:
import regex as re

# 4 digits with no other digits before or after
reg = r'(?<!\d)\d{4}(?!\d)'
def get_year(file_name: str) -> int|None:
    if re.search(reg, file_name) is not None:
        return int(re.search(reg, file_name).group(0))
    else:
        return None
    

def count_regexp_occ(regexp: str, text: str) -> int:
    """Count the non-overlapping matches of ``regexp`` found in ``text``."""
    occurrences = 0
    for _match in re.finditer(regexp, text):
        occurrences += 1
    return occurrences


In [139]:
# Greeting patterns. `\p{Han}` is a Unicode script property (Han characters);
# the lookarounds built on it require that the greeting is not directly
# adjacent to another Han character on the guarded side.

# "你好" with no Han character immediately before or after it.
regex你好 = r'(?<!\p{Han})你好(?!\p{Han})'
# "嗨" or "嘿" with no Han character immediately before or after it.
regex嗨 = r'(?<!\p{Han})(嗨|嘿)(?!\p{Han})'
# A time-of-day word (上午/下午/晚上/中午/早上) followed by "好", standing alone.
regex上午下午晚上中午早上好 = r'(?<!\p{Han})(上午|下午|晚上|中午|早上)好(?!\p{Han})'
# "你好" / "你最近好" followed by one of the particles 吗 么 没 嘛 啊, standing alone.
# The outer parentheses add a capture group, so findall returns tuples here.
regex你好吗 = r'((?<!\p{Han})你(最近)?好[吗么没嘛啊](?!\p{Han}))'
# "吃", optional "过", optional "饭", then "了" and one of the particles 吗 么 没 嘛 啊.
# NOTE: unlike the patterns above there is no lookbehind, so the match may be
# preceded by a Han character (e.g. it matches inside "你吃了吗").
regex吃了吗 = r'吃[过]?[饭]?了[吗么没嘛啊](?!\p{Han})'

# Order matters: walk_decade counts matches per pattern and emits the counts
# by position in this list.
all_regex = [regex你好, regex嗨, regex上午下午晚上中午早上好, regex你好吗, regex吃了吗]

In [140]:

# Smoke test: run three of the greeting patterns over a hand-written sample
# and print what each one captures.
test_text = "我是为了你好，你好,你好你好-嗨！-嘿！你吃过饭了吗？你吃了吗？吃了没？你好吗？你最近好吗？你说我好吗？"
print(result := re.findall(regex嗨, test_text))
print(result := re.findall(regex吃了吗, test_text))
print(result := re.findall(regex你好吗, test_text))

['嗨', '嘿']
['吃过饭了吗', '吃了吗', '吃了没']
[('你好吗', ''), ('你最近好吗', '最近')]


In [141]:
def walk_decade(decade: int):
    """Yield one row of greeting counts per readable file of a decade.

    Walks ``./captions/{decade}s`` recursively. Files for which ``read_text``
    returns None are skipped. Each yielded row is the tuple
    ``(file name, decade, year as str, count_0, ..., count_4)`` where the
    counts follow the order of the patterns in ``all_regex``. The year is
    passed through ``str``, so a missing year is emitted as ``"None"``.
    """
    for folder, _subdirs, names in os.walk(f"./captions/{decade}s"):
        for name in names:
            text = read_text(os.path.join(folder, name))
            if text is None:
                continue

            year = get_year(name)
            hits = [count_regexp_occ(pattern, text) for pattern in all_regex]

            yield name, decade, str(year), hits[0], hits[1], hits[2], hits[3], hits[4]

In [142]:
import pandas as pd

In [143]:
def generate_csv(decade: int):
    """Write the per-file greeting counts of one decade to ``{decade}.csv``.

    Args:
        decade: Decade to process; forwarded to ``walk_decade`` and used as
            the stem of the output file name.

    The CSV is written to the current working directory without an index
    column.
    """
    data = list(walk_decade(decade))

    # Column labels are applied positionally, so they must follow the order
    # in which walk_decade yields the counts, i.e. the order of all_regex:
    # 你好, 嗨, 上午/下午/晚上/中午/早上好, 你好吗, 吃了吗. The previous label
    # order put four of the five counts under the wrong header.
    df = pd.DataFrame(data, columns=[
        "file", "decade", "year",
        "regex你好",
        "regex嗨",
        "regex上午下午晚上中午早上好",
        "regex你好吗",
        "regex吃了吗",
    ])
    df.to_csv(f"{decade}.csv", index=False)

In [144]:
# Build one CSV per decade, from the 1960s through the 2020s, printing each
# decade as a progress marker before it is processed.
for y in (1960, 1970, 1980, 1990, 2000, 2010, 2020):
    print(y)
    generate_csv(y)

1960
1970
1980
1990
2000
2010
2020
