# Harmless user-defined helper functions for Python


In [7]:
from typing import List
import pandas as pd
def merge_df_list(df_left: pd.DataFrame, dfs_right: List[pd.DataFrame], keys: List[tuple], methods: List[str]) -> pd.DataFrame:
    """
    Merge several DataFrames horizontally, chaining the merges left to right.

    Args:
        df_left: The left-most DataFrame; the starting point of the chain.
        dfs_right: The DataFrames merged onto the running result, in order.
        keys: One pair per entry of ``dfs_right``. Element 0 is passed to
            ``DataFrame.merge`` as ``left_on`` and element 1 as ``right_on``,
            so each may be a column name or a list of column names.
        methods: One merge type per entry of ``dfs_right``, passed to
            ``DataFrame.merge`` as ``how``.

    Returns:
        The merged DataFrame. When ``dfs_right`` is empty, ``df_left`` itself
        is returned (not a copy).

    Raises:
        ValueError: If ``dfs_right``, ``keys`` and ``methods`` do not all have
            the same length.
    """
    dfs_right = list(dfs_right)
    keys = list(keys)
    methods = list(methods)
    # zip() would silently drop the surplus entries, hiding a caller mistake,
    # so the one-to-one requirement is enforced explicitly.
    if not (len(dfs_right) == len(keys) == len(methods)):
        raise ValueError(
            "dfs_right, keys and methods must have the same length, got "
            f"{len(dfs_right)}, {len(keys)} and {len(methods)}"
        )

    # Start from the left-most frame and fold every right-hand frame into it.
    df_merged = df_left
    for df, key, method in zip(dfs_right, keys, methods):
        df_merged = df_merged.merge(df, left_on=key[0], right_on=key[1], how=method)
    return df_merged

# Example usage:
if __name__ == "__main__":
    df1 = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 6]})
    df2 = pd.DataFrame({"B": [1, 2, 9], "C": [10, 11, 12]})
    df3 = pd.DataFrame({"C": [10, 3, 15], "D": [16, 17, 18]})

    # df1 is joined to df2 on "B" (inner), then the result to df3 on "C" (outer).
    df_left = df1
    df_right = [df2, df3]
    keys = [("B", "B"), ("C", "C")]
    methods = ['inner', 'outer']

    merged_df = merge_df_list(df_left, df_right, keys, methods)
    # Each input is printed under its own name (previously all three were labelled "df1").
    print(f"df1: \n {df1} \n")
    print(f"df2: \n {df2} \n")
    print(f"df3: \n {df3} \n")
    print(f"merged df: \n {merged_df} \n")


df1: 
    A  B
0  1  1
1  2  2
2  3  6 

df1: 
    B   C
0  1  10
1  2  11
2  9  12 

df1: 
     C   D
0  10  16
1   3  17
2  15  18 

merged df: 
      A    B   C     D
0  1.0  1.0  10  16.0
1  2.0  2.0  11   NaN
2  NaN  NaN   3  17.0
3  NaN  NaN  15  18.0 



### 批量扫描文件夹并获取文件路径


In [13]:
from typing import List, Tuple, Pattern
import os
import re # 需要exclude支持正则表达式

def scan_file_path(folder: str, extensions: List[str], exclude: str = "^$", recursive: bool = False) -> List[Tuple[str, str, str]]:
    """Collect the files under ``folder`` whose names end with a given extension.

    Args:
        folder: Directory to scan.
        extensions: File-name suffixes to keep, e.g. ``['.csv', '.dta']``.
        exclude: Regular expression searched against each entry's path;
            entries whose path matches are skipped. The default ``"^$"``
            only matches an empty string, so nothing is excluded.
        recursive: When True, sub-directories are scanned as well.
            Defaults to False.

    Returns:
        A list of ``(path, file name, file name without extension)`` tuples.
        Sub-directory results are inserted at the position where the
        directory entry was encountered.

    Raises:
        ValueError: If ``folder`` is not an existing directory.
    """
    if not os.path.isdir(folder):
        raise ValueError("'{}'is not existing folder".format(folder))

    # Hoisted out of the loop; re.compile also accepts the already compiled
    # pattern that recursive calls pass down.
    exclude_pattern = re.compile(exclude)
    suffixes = tuple(extensions)

    file_list = []
    with os.scandir(folder) as it:
        for entry in it:
            if entry.is_dir():
                # Directories are never reported as files, even when their
                # name happens to end with one of the wanted extensions.
                if recursive:
                    file_list.extend(scan_file_path(entry.path, extensions, exclude_pattern, recursive))
                continue
            if entry.name.endswith(suffixes) and not exclude_pattern.search(entry.path):
                file_name = os.path.splitext(entry.name)[0]
                file_list.append((entry.path, entry.name, file_name))
    return file_list


# Example usage
if __name__ == "__main__":
    extensions = ['.csv', '.dta']
    folder = "."
    # The dot is escaped so only a literal ".csv" suffix is excluded
    # (an unescaped "." would also match names such as "datacsv").
    exclude = r"\.csv$"
    path1 = scan_file_path(folder, extensions, exclude=exclude, recursive=True)
    path2 = scan_file_path(folder, extensions, recursive=True)

    print(f"{path1} \n")
    print(f"{path2} \n")

[('./assets/auto.dta', 'auto.dta', 'auto')] 

[('./assets/auto.dta', 'auto.dta', 'auto'), ('./assets/auto.csv', 'auto.csv', 'auto')] 



In [9]:
from typing import Dict, Any
def dict_to_df(_dict: Dict[Any, Any], key_name: str, value_name: str) -> pd.DataFrame:
    """
    Convert a dictionary into a single-column DataFrame indexed by its keys.

    Args:
        _dict: The dictionary to convert.
        key_name: Name given to the index, which holds the dictionary keys.
        value_name: Name of the column that holds the dictionary values.

    Returns:
        A DataFrame whose index (named ``key_name``) contains the keys and
        whose only column (``value_name``) contains the values. Call
        ``reset_index()`` on the result to turn the keys into a regular column.
    """
    # One row per key; the values fill the single named column.
    frame = pd.DataFrame.from_dict(_dict, orient='index', columns=[value_name])
    # Label the index so it carries the key name.
    labelled = frame.rename_axis(key_name)
    return labelled

if __name__ == "__main__":
    # Sample mapping used to exercise dict_to_df.
    test_dict = {'a': 1, 'b': 2, 'c': 3}

    # The keys end up in the index named 'key', the values in column 'value'.
    df = dict_to_df(test_dict, 'key', 'value')

    # Show the input followed by the converted frame for a visual check.
    print(f"{test_dict} \n", f"{df} \n", sep="\n")


{'a': 1, 'b': 2, 'c': 3} 

     value
key       
a        1
b        2
c        3 



In [15]:
import pandas as pd
from typing import Generator

def df_chunk_generator(file_path: str, chunksize: int = 1000, keep_label_info: bool = True, convert_categoricals: bool = False, convert_missing: bool = True) -> Generator:
    """
    Read a Stata file lazily, yielding its labels (optionally) and data chunks.

    Args:
        file_path: Path of the Stata file.
        chunksize: Chunk size passed to ``StataReader``.
        keep_label_info: When True, the first item yielded is a DataFrame of
            label information; the data chunks follow. Defaults to True.
        convert_categoricals: Passed through to ``StataReader``. Defaults to
            False.
        convert_missing: Passed through to ``StataReader``. Defaults to True.

    Yields:
        If ``keep_label_info`` is True, first a DataFrame with the columns
        ``column_name``, ``column_label``, ``value_label_name`` and
        ``value_label``; then every DataFrame chunk produced by the reader.
    """
    # The context manager closes the reader once the generator is exhausted
    # or closed, instead of leaving that to garbage collection.
    with pd.io.stata.StataReader(
        file_path,
        chunksize=chunksize,
        convert_categoricals=convert_categoricals,
        convert_missing=convert_missing,
    ) as reader:
        if keep_label_info:
            # Variable labels as a two-column frame (column_name, column_label).
            variable_labels = dict_to_df(reader.variable_labels(), key_name='column_name', value_name='column_label').reset_index()
            # Value labels as a two-column frame (value_label_name, value_label).
            value_labels = dict_to_df(reader.value_labels(), key_name='value_label_name', value_name='value_label').reset_index()
            # NOTE(review): the outer join matches column names against
            # value-label names, so rows only line up when a value label is
            # named exactly like its variable - confirm this is intended.
            labels = pd.merge(variable_labels, value_labels, left_on='column_name', right_on='value_label_name', how='outer')
            yield labels
        # Data chunks are yielded in both modes.
        yield from reader
        

# Usage example:
if __name__ == "__main__":
    # Two generators over the same file: with and without the label frame.
    data1 = df_chunk_generator("assets/auto.dta", chunksize=50)
    data2 = df_chunk_generator("assets/auto.dta", chunksize=50, keep_label_info=False)
    # Walk each generator, numbering the yielded frames from 1.
    for position, chunk in enumerate(data1, start=1):
        print(f"生成器data1中的df块{position}:\n {chunk}")
    for position, chunk in enumerate(data2, start=1):
        print(f"生成器data2中的df块{position}:\n {chunk}")

生成器data1中的df块1:
      column_name            column_label value_label_name value_label
0           make          Make and model              NaN         NaN
1          price                   Price              NaN         NaN
2            mpg           Mileage (mpg)              NaN         NaN
3          rep78      Repair record 1978              NaN         NaN
4       headroom          Headroom (in.)              NaN         NaN
5          trunk   Trunk space (cu. ft.)              NaN         NaN
6         weight           Weight (lbs.)              NaN         NaN
7         length            Length (in.)              NaN         NaN
8           turn       Turn circle (ft.)              NaN         NaN
9   displacement  Displacement (cu. in.)              NaN         NaN
10    gear_ratio              Gear ratio              NaN         NaN
11       foreign              Car origin              NaN         NaN
生成器data1中的df块2:
                  make  price  mpg rep78  headroom  trunk

In [11]:
# import pandas as pd
# import sqlite3
# import os

# def read_data_to_sqlite(file_path_list, sqlite_file_name,chunksize=10000,table_method="replace"):
#     # create a connection to the sqlite database
#     conn = sqlite3.connect(sqlite_file_name)

#     # create a dictionary that maps file extensions to pandas read_* functions
#     read_funcs = {
#         ".csv": pd.read_csv,
#         ".xlsx": pd.read_excel,
#         ".json": pd.read_json
#     }

#     write_methods: {
#         "replace": f"DROP TABLE {table_name}",
#         "append": 
#     }
#     # iterate over the file paths
#     for file_path in file_path_list:
#         # get the file extension
#         file_ext = os.path.splitext(file_path)[1]

#         # get the table name by removing the file extension from the file name
#         table_name = os.path.splitext(os.path.basename(file_path))[0]

#         # read the data from the file using the appropriate pandas function
#         # from the read_funcs dictionary
#         data = read_funcs[file_ext](file_path,chunksize=chunksize)
#         # iterate over the data chunks
#         for chunk in data:
#             # write the chunk to the sqlite database as a table with the given name
#             chunk.to_sql(table_name, conn, index=False, if_exists="append")

#         # write the data to the sqlite database as a table with the given name
#         data.to_sql(table_name, conn, index=False, if_exists="replace")
        
#     conn.close()
#     # read the data from the file using the appropriate pandas function
# # from the read_funcs dictionary, with chunksize set to 1000
# data = read_funcs[file_ext](file_path, chunksize=1000)

# # iterate over the data chunks
# for chunk in data:
#     # write the chunk to the sqlite database as a table with the given name
#     chunk.to_sql(table_name, conn, index=False, if_exists="append")


# # read the data from the file using the appropriate pandas function
# # from the read_funcs dictionary, with chunksize set to 1000
# data = read_funcs[file_ext](file_path, chunksize=1000)

# # create a cursor to execute SQL commands
# cursor = conn.cursor()

# # check if the table already exists in the database
# table_exists = cursor.execute(
#     f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'"
# ).fetchone() is not None

# # if the table exists and the method is "replace", drop the table
# if table_exists and method == "replace":
#     cursor.execute(f"DROP TABLE {table_name}")

# # if the table exists and the method is "rename", rename the table
# if table_exists and method == "rename":
#     # generate a unique name for the new table
#     new_table_name = f"{table_name}_{uuid.uuid4().hex}"
#     cursor.execute(f"ALTER TABLE {table_name} RENAME TO {new_table_



# def read_data_to_sqlite(file_path_list, sqlite_file_name, chunksize=None, method=None):
#     # create a connection to the sqlite database
#     conn = sqlite3.connect(sqlite_file_name)

#     # create a dictionary that maps file extensions to pandas read_* functions
#     read_funcs = {
#         ".csv": pd.read_csv,
#         ".xlsx": pd.read_excel,
#         ".json": pd.read_json
#     }

#     # iterate over the file paths
#     for file_path in file_path_list:
#         # get the file extension
#         file_ext = os.path.splitext(file_path)[1]

#         # get the table name by removing the file extension from the file name
#         table_name = os.path.splitext(os.path.basename(file_path))[0]

#         # read the data from the file using the appropriate pandas function
#         # from the read_funcs dictionary
#         if chunksize:
#             for chunk in pd.read_csv(file_path, chunksize=chunksize):
#                 if method == "replace":
#                     chunk.to_sql(table_name, conn, index=False, if_exists="replace")
#                 elif method == "append":
#                     chunk.to_sql(table_name, conn, index=False, if_exists="append")
#                 elif method == "rename":
#                     chunk.
