In [1]:
import pandas as pd
import time
import os
#ignore warnings
import warnings
warnings.filterwarnings("ignore")

print("🔄 Bắt đầu đọc dữ liệu Parquet...")
start_read_time = time.time()  # Đo thời gian đọc
parquet_folder = r"C:\Khue\TDN\data\processed"

try:
    all_files_in_folder = os.listdir(parquet_folder)
    parquet_files = [
        os.path.join(parquet_folder, f)
        for f in all_files_in_folder
        if f.startswith("sanluong_") and f.endswith(".parquet")
    ]

    if not parquet_files:
        print(f"⚠️ Không tìm thấy file Parquet trong: {parquet_folder}")
        df_all = pd.DataFrame()
    else:
        df_list = []
        print(f"🔍 Tìm thấy {len(parquet_files)} file Parquet. Bắt đầu đọc...")
        for i, f in enumerate(parquet_files):
            try:
                df_temp = pd.read_parquet(f, engine="pyarrow")
                # Chỉ đọc các cột thực sự cần thiết ngay từ đầu để tiết kiệm bộ nhớ
                required_cols_read = ["CTDL", "NMTD", "MADIEMDO", "ENDTIME", "CS"]
                if all(col in df_temp.columns for col in required_cols_read):
                    df_list.append(df_temp[required_cols_read])  # Chỉ lấy cột cần thiết
                else:
                    print(f"   ⚠️ File {os.path.basename(f)} thiếu cột, bỏ qua.")
            except Exception as e:
                print(f"❌ Lỗi đọc file {os.path.basename(f)}: {e}")

        if df_list:
            print("   Ghép các DataFrame...")
            df_all = pd.concat(df_list, ignore_index=True)
            print(f"✅ Đọc và ghép {len(df_list)} file thành công.")
            print(f"👉 Tổng số dòng: {df_all.shape[0]:,}")
            print(f"⏱️ Thời gian đọc và ghép: {time.time() - start_read_time:.2f} giây")
            # Hiển thị thông tin bộ nhớ (tùy chọn)
            # df_all.info(memory_usage='deep')
        else:
            print("❌ Không đọc được file nào thành công.")
            df_all = pd.DataFrame()

except FileNotFoundError:
    print(f"❌ Lỗi: Không tìm thấy thư mục: {parquet_folder}")
    df_all = pd.DataFrame()
except Exception as e:
    print(f"❌ Lỗi không xác định khi đọc file/thư mục: {e}")
    df_all = pd.DataFrame()

🔄 Bắt đầu đọc dữ liệu Parquet...
🔍 Tìm thấy 4 file Parquet. Bắt đầu đọc...
   Ghép các DataFrame...
✅ Đọc và ghép 4 file thành công.
👉 Tổng số dòng: 18,404,864
⏱️ Thời gian đọc và ghép: 6.71 giây


In [2]:
# Đổi tên cột ENDTIME thành TIME để dễ hiểu hơn
df_all = df_all.rename(columns={"ENDTIME": "TIME"})

# Chuyển đổi kiểu dữ liệu cho cột TIME nếu cột này tồn tại
if "TIME" in df_all.columns:
    df_all["TIME"] = pd.to_datetime(df_all["TIME"], errors="coerce", format="%Y-%m-%d %H:%M:%S")
    
    # Số lượng dòng ban đầu trước khi lọc
    initial_rows = len(df_all)
    print(f"🔍 Tổng số dòng trước khi lọc theo độ phân giải 30 phút: {initial_rows:,}")
    
    # Lọc các dòng dữ liệu có phút là 0 hoặc 30 và giây là 0
    valid_time_mask = (
        ((df_all["TIME"].dt.minute == 0) | (df_all["TIME"].dt.minute == 30)) 
        & (df_all["TIME"].dt.second == 0)
    )
    
    # Lưu những dòng không hợp lệ vào df_removed trước khi lọc
    df_removed = df_all[~valid_time_mask].copy()
    
    # Áp dụng bộ lọc vào df_all
    df_all = df_all[valid_time_mask]
    
    # Sắp xếp dữ liệu theo CTDL, NMTD và TIME
    df_all = df_all.sort_values(by=["CTDL", "NMTD", "TIME"])
    
    # Báo cáo kết quả
    removed_rows = initial_rows - len(df_all)
    print(f"⚠️ Đã loại bỏ {removed_rows:,} dòng có thời gian không theo độ phân giải 30 phút chẵn")
    print(f"✅ Còn lại {len(df_all):,} dòng dữ liệu hợp lệ")
    print(f"📊 Tỷ lệ dữ liệu hợp lệ: {(len(df_all) / initial_rows * 100):.2f}%")
    
    if not df_removed.empty:
        print(f"💾 Đã lưu {len(df_removed):,} dòng không hợp lệ vào df_removed")
        # Hiển thị phân phối các giá trị phút không hợp lệ
        invalid_minutes = df_removed["TIME"].dt.minute.value_counts().sort_index()
        if not invalid_minutes.empty:
            print("\nPhân phối giá trị phút không hợp lệ:")
            print(invalid_minutes)
            
    # Gán kết quả cho df để sử dụng trong các cell tiếp theo
    df = df_all.copy()

🔍 Tổng số dòng trước khi lọc theo độ phân giải 30 phút: 18,404,864
⚠️ Đã loại bỏ 60,706 dòng có thời gian không theo độ phân giải 30 phút chẵn
✅ Còn lại 18,344,158 dòng dữ liệu hợp lệ
📊 Tỷ lệ dữ liệu hợp lệ: 99.67%
💾 Đã lưu 60,706 dòng không hợp lệ vào df_removed

Phân phối giá trị phút không hợp lệ:
TIME
0       284
1       270
2       285
3       263
4       387
5       367
6       362
7       296
8       226
9       241
10      196
11      194
12      215
13      209
14      212
15    22933
16      207
17      174
18      186
19      199
20      205
21      204
22      184
23      207
24      230
25      249
26      299
27      382
28      373
29      359
30      386
31      387
32      305
33      294
34      371
35      381
36      440
37      275
38      239
39      221
40      226
41      198
42      203
43      207
44      147
45    22900
46      215
47      175
48      195
49      180
50      185
51      183
52      218
53      201
54      229
55      236
56      242
57      3

In [3]:
df_removed

Unnamed: 0,CTDL,NMTD,MADIEMDO,TIME,CS
4265821,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ PÁC CÁP,G2A234S000M331,2021-07-27 08:05:00,0.0
4265823,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ PÁC CÁP,G2A234S000M331,2021-07-27 08:15:00,0.0
4265828,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ PÁC CÁP,G2A234S000M331,2021-07-27 08:40:00,0.0
4265894,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ PÁC CÁP,G2A234S000M331,2021-08-28 16:35:00,0.0
4265895,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ PÁC CÁP,G2A234S000M331,2021-08-28 16:40:00,0.0
...,...,...,...,...,...
18393917,CTY ĐIỆN LỰC TUYÊN QUANG,NMTĐ Sông Lô 8B,G2A214S000M131,2024-04-14 17:52:32,0.0
18403846,CTY ĐIỆN LỰC TUYÊN QUANG,NMTĐ Sông Lô 8B,G2A214S000M132,2024-07-27 20:09:51,28.0
18403847,CTY ĐIỆN LỰC TUYÊN QUANG,NMTĐ Sông Lô 8B,G2A214S000M132,2024-07-27 20:42:37,0.0
18403853,CTY ĐIỆN LỰC TUYÊN QUANG,NMTĐ Sông Lô 8B,G2A214S000M131,2024-07-27 20:09:33,37.0


In [4]:
df_ref = pd.read_excel(r"C:\Khue\TDN\data\processed\DanhsachNM_sanluong_20250423.xlsx", sheet_name="Sheet1", usecols="A:D")
df_ref

Unnamed: 0,CTDL,NMTD,MADIEMDO,Lỗi đơn vị
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,0.0
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ PÁC CÁP,G2A234S000M331,0.0
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ TÀ LÀNG,G2A121S000M371,0.0
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ THƯỢNG ÂN,G2A122S000M371,0.0
4,CTY ĐIỆN LỰC BẮC KẠN,THÁC GIỀNG 1,G2A219S000M371,0.0
...,...,...,...,...
352,CTY ĐIỆN LỰC YÊN BÁI,NMTĐ Thác Cá 1,G2A247S000M131,1.0
353,CTY ĐIỆN LỰC YÊN BÁI,NMTĐ Thác Cá 2,G2A222S000M131,1.0
354,CTY ĐIỆN LỰC YÊN BÁI,NMTĐ THÀO SA CHẢI,G2A286S000M371,0.0
355,CTY ĐIỆN LỰC YÊN BÁI,NMTĐ TRẠM TẤU,G2A154S000M175,1.0


In [5]:
# Tôi muốn merge cột "Lỗi đơn vị" từ df_ref vào df sử dụng cột chung "MADIEMDO"
# Đảm bảo cột "MADIEMDO" trong df_ref là kiểu string
df_ref["MADIEMDO"] = df_ref["MADIEMDO"].astype(str)
# Đảm bảo cột "MADIEMDO" trong df là kiểu string
df["MADIEMDO"] = df["MADIEMDO"].astype(str)
# Merge df_ref vào df dựa trên cột "MADIEMDO"
df_final = df.merge(df_ref[["MADIEMDO", "Lỗi đơn vị"]], on="MADIEMDO", how="left")
df_final 

Unnamed: 0,CTDL,NMTD,MADIEMDO,TIME,CS,Lỗi đơn vị
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 00:30:00,951.300,0.0
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 01:00:00,1027.300,0.0
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 01:30:00,151.640,0.0
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 02:00:00,0.000,0.0
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 02:30:00,0.000,0.0
...,...,...,...,...,...,...
18344153,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 08:00:00,0.066,3.0
18344154,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 08:30:00,0.066,3.0
18344155,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 09:00:00,0.066,3.0
18344156,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 09:30:00,0.066,3.0


In [7]:
# Xóa các hàng có giá trị null ở cột 'Lỗi đơn vị'
rows_before = len(df_final)
df_final = df_final[df_final['Lỗi đơn vị'].notnull()]
rows_after = len(df_final)
print(f"Đã xóa {rows_before - rows_after:,} dòng có giá trị null ở cột 'Lỗi đơn vị'")

Đã xóa 638,056 dòng có giá trị null ở cột 'Lỗi đơn vị'


In [8]:
# Chuẩn hóa cột 'CS' dựa trên giá trị 'Lỗi đơn vị'
def normalize_cs(row):
    error = row['Lỗi đơn vị']
    cs = row['CS']
    if error == 0:
        return cs
    elif error == 1:
        return cs * 100
    elif error == 2:
        return cs * 1e4
    elif error == 3:
        return cs * 1e5
    elif error == 4:
        return cs * 1e6
    elif error == 5:
        return cs / 10
    else:
        return cs  # Giữ nguyên nếu giá trị khác

df_final['CS_chuanhoa'] = df_final.apply(normalize_cs, axis=1)

In [9]:
df_final

Unnamed: 0,CTDL,NMTD,MADIEMDO,TIME,CS,Lỗi đơn vị,CS_chuanhoa
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 00:30:00,951.300,0.0,951.30
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 01:00:00,1027.300,0.0,1027.30
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 01:30:00,151.640,0.0,151.64
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 02:00:00,0.000,0.0,0.00
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 02:30:00,0.000,0.0,0.00
...,...,...,...,...,...,...,...
18344153,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 08:00:00,0.066,3.0,6600.00
18344154,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 08:30:00,0.066,3.0,6600.00
18344155,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 09:00:00,0.066,3.0,6600.00
18344156,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 09:30:00,0.066,3.0,6600.00


In [12]:
# export df_final to parquet
output_folder = r"C:\Khue\TDN\data\interim"
output_file = os.path.join(output_folder, "sanluong_chuanhoa.parquet")
# export df_final to csv
output_file = os.path.join(output_folder, "sanluong_chuanhoa.csv")
df_final.to_parquet(output_file, index=False, engine="pyarrow")
df_final.to_csv(output_file, index=False, encoding="utf-8-sig")

In [1]:
import pandas as pd
import os

df = pd.read_parquet('C:\Khue\TDN\data\interim\sanluong_chuanhoa.parquet', engine="pyarrow")
df

Unnamed: 0,CTDL,NMTD,MADIEMDO,TIME,CS,Lỗi đơn vị,CS_chuanhoa
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 00:30:00,951.300,0.0,951.30
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 01:00:00,1027.300,0.0,1027.30
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 01:30:00,151.640,0.0,151.64
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 02:00:00,0.000,0.0,0.00
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 02:30:00,0.000,0.0,0.00
...,...,...,...,...,...,...,...
17706097,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 08:00:00,0.066,3.0,6600.00
17706098,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 08:30:00,0.066,3.0,6600.00
17706099,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 09:00:00,0.066,3.0,6600.00
17706100,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 09:30:00,0.066,3.0,6600.00


In [None]:
# Xóa cột 'CS', 'Lỗi đơn vị' khỏi df 
df = df.drop(columns=['CS', 'Lỗi đơn vị'], errors='ignore')
df

Unnamed: 0,CTDL,NMTD,MADIEMDO,TIME,CS_chuanhoa
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 00:30:00,951.30
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 01:00:00,1027.30
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 01:30:00,151.64
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 02:00:00,0.00
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01 02:30:00,0.00
...,...,...,...,...,...
17706097,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 08:00:00,6600.00
17706098,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 08:30:00,6600.00
17706099,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 09:00:00,6600.00
17706100,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-08 09:30:00,6600.00


In [7]:
# Tạo cột '00:00_next' là giá trị '00:00' của ngày tiếp theo cho từng tổ hợp
df_pivot['00:00_next'] = (
    df_pivot
    .groupby(['CTDL', 'NMTD', 'MADIEMDO'])['00:00']
    .shift(-1)
)

# Đưa cột '00:00_next' ra cuối cùng
cols = list(df_pivot.columns)
cols.append(cols.pop(cols.index('00:00_next')))
df_pivot = df_pivot[cols]

df_pivot

Giờ,CTDL,NMTD,MADIEMDO,Ngày,00:00,00:30,01:00,01:30,02:00,02:30,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00_next
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01,,951.30,1027.30,151.64,0.00,0.00,...,2540.2,2084.5,1339.3,1341.5,1204.60,992.9,1154.5,1185.30,1037.8,1027.30
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-02,1027.30,936.66,966.84,855.54,0.00,0.00,...,2719.9,2276.8,1344.0,1572.1,1374.20,1370.9,1143.2,1130.30,1033.2,994.98
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-03,994.98,961.38,952.14,971.46,993.72,811.86,...,1040.8,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.30
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-04,1006.30,815.72,957.18,0.00,0.00,0.00,...,2713.6,2347.4,1321.7,1415.0,1423.40,1209.6,1222.2,1189.40,1020.6,0.06
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-05,0.06,0.00,0.00,0.00,0.00,0.00,...,3047.5,2702.7,1774.5,1538.9,1444.40,1335.6,1221.8,518.30,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371317,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-04,6500.00,6500.00,6500.00,6600.00,6500.00,6500.00,...,6500.0,6500.0,6500.0,6500.0,6500.00,6500.0,1300.0,3900.00,6600.0,6600.00
371318,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-05,6600.00,6500.00,6600.00,6600.00,6500.00,6600.00,...,6600.0,6500.0,6600.0,6600.0,6500.00,6600.0,6500.0,6600.00,6500.0,6600.00
371319,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-06,6600.00,6500.00,5400.00,,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6600.00,6600.0,6600.0,6600.00,6600.0,6600.00
371320,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-07,6600.00,6600.00,6600.00,6600.00,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6500.00,6600.0,6600.0,6600.00,6600.0,6600.00


In [8]:
# Xóa cột '00:00' và đổi tên cột '00:00_next' thành '00:00'
df_pivot = df_pivot.drop(columns=['00:00'], errors='ignore')
df_pivot = df_pivot.rename(columns={'00:00_next': '00:00'})
df_pivot

Giờ,CTDL,NMTD,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,03:00,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01,951.30,1027.30,151.64,0.00,0.00,0.00,...,2540.2,2084.5,1339.3,1341.5,1204.60,992.9,1154.5,1185.30,1037.8,1027.30
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-02,936.66,966.84,855.54,0.00,0.00,0.00,...,2719.9,2276.8,1344.0,1572.1,1374.20,1370.9,1143.2,1130.30,1033.2,994.98
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-03,961.38,952.14,971.46,993.72,811.86,948.36,...,1040.8,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.30
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-04,815.72,957.18,0.00,0.00,0.00,0.00,...,2713.6,2347.4,1321.7,1415.0,1423.40,1209.6,1222.2,1189.40,1020.6,0.06
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-05,0.00,0.00,0.00,0.00,0.00,0.00,...,3047.5,2702.7,1774.5,1538.9,1444.40,1335.6,1221.8,518.30,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371317,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-04,6500.00,6500.00,6600.00,6500.00,6500.00,6500.00,...,6500.0,6500.0,6500.0,6500.0,6500.00,6500.0,1300.0,3900.00,6600.0,6600.00
371318,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-05,6500.00,6600.00,6600.00,6500.00,6600.00,6500.00,...,6600.0,6500.0,6600.0,6600.0,6500.00,6600.0,6500.0,6600.00,6500.0,6600.00
371319,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-06,6500.00,5400.00,,6600.00,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6600.00,6600.0,6600.0,6600.00,6600.0,6600.00
371320,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-07,6600.00,6600.00,6600.00,6600.00,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6500.00,6600.0,6600.0,6600.00,6600.0,6600.00


In [11]:
# Đảm bảo biến output_folder đã được định nghĩa
output_folder = r"C:\Khue\TDN\data\interim"

# export df_pivot to xlsx
output_file = os.path.join(output_folder, "sanluong_chuanhoa.xlsx")
df_pivot.to_excel(output_file, index=False, engine="openpyxl")

In [12]:
# export df_pivot to parquet
output_folder = r"C:\Khue\TDN\data\interim"
output_file = os.path.join(output_folder, "sanluong_chuanhoa.parquet")
df_pivot.to_parquet(output_file, index=False, engine="pyarrow")

In [3]:
import pandas as pd
import os
# import df_BSCVH from C:\Khue\TDN\data\interim\DGMS_BCSVH.xlsx
df_BSCVH = pd.read_excel(r"C:\Khue\TDN\data\interim\DGMS_BCSVH.xlsx", sheet_name="Sheet1")
df_BSCVH

Unnamed: 0,Code,Nhà máy,P.Đặt,Q.ĐK,Điện lực,Ngày,00:30,01:00,01:30,02:00,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.19,0.0,0.0,0.0,0.0,...,3.10,2.60,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0
1,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.20,0.0,0.0,0.0,0.0,...,3.20,3.20,3.200,3.100,2.600,1.4,1.0,0.00,0.0,0.0
2,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.21,0.0,0.0,0.0,0.0,...,1.39,1.54,1.810,2.350,2.110,0.6,0.0,0.00,0.0,0.0
3,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.22,0.0,0.0,0.0,0.0,...,3.20,3.20,3.055,2.715,2.275,1.9,1.6,1.42,0.0,0.0
4,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.23,0.0,0.0,0.0,0.0,...,3.20,3.20,2.400,2.100,1.600,0.0,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100511,A12.19,Đồng Sung,20.0,A1,PC_YENBAI,2025.02.25,0.0,0.0,0.0,0.0,...,16.00,0.00,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0
100512,A12.19,Đồng Sung,20.0,A1,PC_YENBAI,2025.02.26,0.0,0.0,0.0,0.0,...,15.00,15.00,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0
100513,A12.19,Đồng Sung,20.0,A1,PC_YENBAI,2025.02.27,0.0,0.0,0.0,0.0,...,20.00,0.00,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0
100514,A12.19,Đồng Sung,20.0,A1,PC_YENBAI,2025.02.28,0.0,0.0,0.0,0.0,...,21.90,0.00,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0


In [4]:
# import df_TTT from "C:\Khue\TDN\data\raw\DS_TTT_TDN.xlsx"
df_TTT = pd.read_excel(r"C:\Khue\TDN\data\raw\DS_TTT_TDN.xlsx", sheet_name="Final")
df_TTT

Unnamed: 0,ID_NM,DIEU_DO,TCTDL,DIEN_LUC,TENNHAMAY,TEN_NM,LOAI_HINH,QUYEN_DIEUKHIEN,CS_DAT,A_TB_NAM,U_DAUNOI,MIEN,TINH,HUYEN,KINHDO,VIDO,BACTHANGSONG,TD_THAMCHIEU,HIEU_LUC,DVPD
0,10997,A1,NPC,PC_BACKAN,Nặm Cắt,NAM_CAT,1,PC_BACKAN,3.2,0.000,35,Bắc,Bắc Kạn,Bạch Thông,105.789306,22.192518,SONG_HONG,TUYEN_QUANG,1,
1,10998,A1,NPC,PC_BACKAN,Pác Cáp,PAC_CAP,1,PC_BACKAN,6.0,0.000,0,,,,106.127303,22.210818,SONG_HONG,TUYEN_QUANG,1,
2,10995,A1,NPC,PC_BACKAN,Tà Làng,TA_LANG,1,PC_BACKAN,4.5,0.000,0,,,,105.721185,22.303931,SONG_HONG,TUYEN_QUANG,1,
3,11193,A1,NPC,PC_BACKAN,THÁC GIỀNG 1,THAC_GIENG 1,1,PC_BACKAN,5.5,0.000,0,,,,105.893735,22.095786,SONG_HONG,TUYEN_QUANG,1,
4,11191,A1,NPC,PC_BACKAN,Thượng Ân,THUONG_AN,1,PC_BACKAN,2.4,0.000,0,,,,106.003945,22.511839,SONG_HONG,TUYEN_QUANG,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,11054,A1,NPC,PC_YENBAI,Thào Sa Chải,THAO_SA_CHAI,1,PC_YENBAI,6.5,22.684,35,Bắc,Yên Bái,Mù Cang Chải,104.268669,21.838929,SONG_HONG,BAN_CHAT,0,
533,10830,A1,NPC,PC_YENBAI,Trạm Tấu,TRAM_TAU,1,A1,30.0,0.000,0,,,,104.442036,21.498660,SONG_HONG,BAN_CHAT,1,
534,10840,A1,NPC,PC_YENBAI,Vực Tuần,VUC_TUAN,1,PC_YENBAI,5.0,0.000,0,,,,104.569095,21.416027,SONG_HONG,BAN_CHAT,1,
535,10837,A1,NPC,PC_YENBAI,Nậm Tục,NAM_TUC,1,PC_YENBAI,3.0,0.000,0,,,,104.447998,21.608201,SONG_HONG,BAN_CHAT,1,


In [5]:
# Merge cột 'TD_THAMCHIEU' từ df_TTT vào df_BSCVH dựa trên key 'TENNHAMAY' (df_TTT) và 'Nhà máy' (df_BSCVH)
df_BSCVH = df_BSCVH.merge(
    df_TTT[['TENNHAMAY', 'TD_THAMCHIEU']],
    left_on='Nhà máy',
    right_on='TENNHAMAY',
    how='left'
)
# Xóa cột 'TENNHAMAY' dư thừa sau khi merge
df_BSCVH = df_BSCVH.drop(columns=['TENNHAMAY'])
df_BSCVH

Unnamed: 0,Code,Nhà máy,P.Đặt,Q.ĐK,Điện lực,Ngày,00:30,01:00,01:30,02:00,...,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00,TD_THAMCHIEU
0,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.19,0.0,0.0,0.0,0.0,...,2.60,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0,TUYEN_QUANG
1,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.20,0.0,0.0,0.0,0.0,...,3.20,3.200,3.100,2.600,1.4,1.0,0.00,0.0,0.0,TUYEN_QUANG
2,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.21,0.0,0.0,0.0,0.0,...,1.54,1.810,2.350,2.110,0.6,0.0,0.00,0.0,0.0,TUYEN_QUANG
3,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.22,0.0,0.0,0.0,0.0,...,3.20,3.055,2.715,2.275,1.9,1.6,1.42,0.0,0.0,TUYEN_QUANG
4,,Nặm Cắt,3.2,PC_BACKAN,PC_BACKAN,2024.04.23,0.0,0.0,0.0,0.0,...,3.20,2.400,2.100,1.600,0.0,0.0,0.00,0.0,0.0,TUYEN_QUANG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100827,A12.19,Đồng Sung,20.0,A1,PC_YENBAI,2025.02.25,0.0,0.0,0.0,0.0,...,0.00,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0,BAN_CHAT
100828,A12.19,Đồng Sung,20.0,A1,PC_YENBAI,2025.02.26,0.0,0.0,0.0,0.0,...,15.00,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0,BAN_CHAT
100829,A12.19,Đồng Sung,20.0,A1,PC_YENBAI,2025.02.27,0.0,0.0,0.0,0.0,...,0.00,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0,BAN_CHAT
100830,A12.19,Đồng Sung,20.0,A1,PC_YENBAI,2025.02.28,0.0,0.0,0.0,0.0,...,0.00,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0,BAN_CHAT


In [6]:
# Đưa cột 'TD_THAMCHIEU' lên vị trí thứ 3 trong df_BSCVH
cols = list(df_BSCVH.columns)
cols.insert(2, cols.pop(cols.index('TD_THAMCHIEU')))
df_BSCVH = df_BSCVH[cols]
df_BSCVH.head()

Unnamed: 0,Code,Nhà máy,TD_THAMCHIEU,P.Đặt,Q.ĐK,Điện lực,Ngày,00:30,01:00,01:30,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,,Nặm Cắt,TUYEN_QUANG,3.2,PC_BACKAN,PC_BACKAN,2024.04.19,0.0,0.0,0.0,...,3.1,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,Nặm Cắt,TUYEN_QUANG,3.2,PC_BACKAN,PC_BACKAN,2024.04.20,0.0,0.0,0.0,...,3.2,3.2,3.2,3.1,2.6,1.4,1.0,0.0,0.0,0.0
2,,Nặm Cắt,TUYEN_QUANG,3.2,PC_BACKAN,PC_BACKAN,2024.04.21,0.0,0.0,0.0,...,1.39,1.54,1.81,2.35,2.11,0.6,0.0,0.0,0.0,0.0
3,,Nặm Cắt,TUYEN_QUANG,3.2,PC_BACKAN,PC_BACKAN,2024.04.22,0.0,0.0,0.0,...,3.2,3.2,3.055,2.715,2.275,1.9,1.6,1.42,0.0,0.0
4,,Nặm Cắt,TUYEN_QUANG,3.2,PC_BACKAN,PC_BACKAN,2024.04.23,0.0,0.0,0.0,...,3.2,3.2,2.4,2.1,1.6,0.0,0.0,0.0,0.0,0.0


In [10]:
df_BSCVH.to_excel(r"C:\Khue\TDN\data\interim\DGMS_BCSVH_ver2.xlsx", index=False, engine="openpyxl")

In [20]:
# import df_sanluong from C:\Khue\TDN\data\interim\sanluong_chuanhoa.xlsx
df_sanluong = pd.read_excel(r"C:\Khue\TDN\data\interim\sanluong_chuanhoa.xlsx", sheet_name="Sheet1")
df_sanluong

Unnamed: 0,CTDL,NMTD,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,03:00,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-01,951.30,1027.30,151.64,0.00,0.00,0.00,...,2540.2,2084.5,1339.3,1341.5,1204.60,992.9,1154.5,1185.30,1037.8,1027.30
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-02,936.66,966.84,855.54,0.00,0.00,0.00,...,2719.9,2276.8,1344.0,1572.1,1374.20,1370.9,1143.2,1130.30,1033.2,994.98
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-03,961.38,952.14,971.46,993.72,811.86,948.36,...,1040.8,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.30
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-04,815.72,957.18,0.00,0.00,0.00,0.00,...,2713.6,2347.4,1321.7,1415.0,1423.40,1209.6,1222.2,1189.40,1020.6,0.06
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,G2A123S000M371,2021-01-05,0.00,0.00,0.00,0.00,0.00,0.00,...,3047.5,2702.7,1774.5,1538.9,1444.40,1335.6,1221.8,518.30,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371317,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-04,6500.00,6500.00,6600.00,6500.00,6500.00,6500.00,...,6500.0,6500.0,6500.0,6500.0,6500.00,6500.0,1300.0,3900.00,6600.0,6600.00
371318,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-05,6500.00,6600.00,6600.00,6500.00,6600.00,6500.00,...,6600.0,6500.0,6600.0,6600.0,6500.00,6600.0,6500.0,6600.00,6500.0,6600.00
371319,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-06,6500.00,5400.00,,6600.00,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6600.00,6600.0,6600.0,6600.00,6600.0,6600.00
371320,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,G2A283S000M331,2024-08-07,6600.00,6600.00,6600.00,6600.00,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6500.00,6600.0,6600.0,6600.00,6600.0,6600.00


In [30]:
import pandas as pd
import re

# Giả sử bạn đã có DataFrame với cột "NMTD"
# df = pd.read_csv('path_to_your_file.csv')

def standardize_plant_name(name):
    if pd.isna(name):
        return name
    
    # Chuyển đổi thành chuỗi nếu không phải
    name = str(name).strip()
    
    # Kiểm tra mẫu "NMTD/NMTĐ + tên"
    if re.match(r'(NMTD|NMTĐ)\s+', name, re.IGNORECASE):
        # Loại bỏ tiền tố "NMTD" hoặc "NMTĐ"
        plant_name = re.sub(r'^(NMTD|NMTĐ)\s+', '', name, flags=re.IGNORECASE).strip()
        
        # Chuẩn hóa tên: chữ cái đầu viết hoa, còn lại viết thường
        words = plant_name.split()
        standardized_name = ' '.join(word.capitalize() for word in words)
        
        return standardized_name
    
    # Kiểm tra trường hợp "THÁC GIỀNG 1"
    elif name.isupper() and any(x in name for x in ['THÁC', 'THAC']):
        words = name.split()
        standardized_name = ' '.join(word.capitalize() for word in words)
        return standardized_name
    
    # Trả về giá trị gốc nếu không khớp với bất kỳ mẫu nào
    return name

df_sanluong['NMTD_Standardized'] = df_sanluong['NMTD'].apply(standardize_plant_name)
# Sử dụng cols_2 để sắp xếp lại thứ tự cột, vì cols_2 đã có 'NMTD_Standardized'
df_sanluong = df_sanluong[cols_2]

In [32]:
# Đưa cột 'NMTD_Standardized' lên vị trí thứ 3 trong df_sanluong
cols_2_new = list(df_sanluong.columns)
cols_2_new.insert(2, cols_2_new.pop(cols_2_new.index('NMTD_Standardized')))
df_sanluong = df_sanluong[cols_2_new]
df_sanluong.head()

Unnamed: 0,CTDL,NMTD,NMTD_Standardized,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-01,951.3,1027.3,151.64,0.0,0.0,...,2540.2,2084.5,1339.3,1341.5,1204.6,992.9,1154.5,1185.3,1037.8,1027.3
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-02,936.66,966.84,855.54,0.0,0.0,...,2719.9,2276.8,1344.0,1572.1,1374.2,1370.9,1143.2,1130.3,1033.2,994.98
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-03,961.38,952.14,971.46,993.72,811.86,...,1040.8,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.3
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-04,815.72,957.18,0.0,0.0,0.0,...,2713.6,2347.4,1321.7,1415.0,1423.4,1209.6,1222.2,1189.4,1020.6,0.06
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-05,0.0,0.0,0.0,0.0,0.0,...,3047.5,2702.7,1774.5,1538.9,1444.4,1335.6,1221.8,518.3,0.0,0.0


In [38]:
df_sanluong

Unnamed: 0,CTDL,NMTD,NMTD_Standardized,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-01,951.30,1027.30,151.64,0.00,0.00,...,2540.2,2084.5,1339.3,1341.5,1204.60,992.9,1154.5,1185.30,1037.8,1027.30
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-02,936.66,966.84,855.54,0.00,0.00,...,2719.9,2276.8,1344.0,1572.1,1374.20,1370.9,1143.2,1130.30,1033.2,994.98
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-03,961.38,952.14,971.46,993.72,811.86,...,1040.8,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.30
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-04,815.72,957.18,0.00,0.00,0.00,...,2713.6,2347.4,1321.7,1415.0,1423.40,1209.6,1222.2,1189.40,1020.6,0.06
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-05,0.00,0.00,0.00,0.00,0.00,...,3047.5,2702.7,1774.5,1538.9,1444.40,1335.6,1221.8,518.30,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371317,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,Đề Bâu,G2A283S000M331,2024-08-04,6500.00,6500.00,6600.00,6500.00,6500.00,...,6500.0,6500.0,6500.0,6500.0,6500.00,6500.0,1300.0,3900.00,6600.0,6600.00
371318,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,Đề Bâu,G2A283S000M331,2024-08-05,6500.00,6600.00,6600.00,6500.00,6600.00,...,6600.0,6500.0,6600.0,6600.0,6500.00,6600.0,6500.0,6600.00,6500.0,6600.00
371319,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,Đề Bâu,G2A283S000M331,2024-08-06,6500.00,5400.00,,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6600.00,6600.0,6600.0,6600.00,6600.0,6600.00
371320,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,Đề Bâu,G2A283S000M331,2024-08-07,6600.00,6600.00,6600.00,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6500.00,6600.0,6600.0,6600.00,6600.0,6600.00


In [37]:
# Xóa tất cả các dòng có giá trị 'NMTD' bị trùng lặp, chỉ giữ lại một dòng duy nhất cho mỗi giá trị 'NMTD'
df_sanluong_unique = df_sanluong.drop_duplicates(subset=['NMTD'], keep='first')
df_sanluong_unique

Unnamed: 0,CTDL,NMTD,NMTD_Standardized,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-01,951.3,1027.3,151.64,0.0,0.0,...,2540.2,2084.5,1339.30000,1341.500000,1204.6,992.9,1154.5,1185.300000,1037.8,1027.3
1308,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ PÁC CÁP,Pác Cáp,G2A234S000M331,2021-07-27,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
2386,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ THƯỢNG ÂN,Thượng Ân,G2A122S000M371,2021-01-01,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
3702,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ TÀ LÀNG,Tà Làng,G2A121S000M371,2021-01-01,0.0,0.0,0.00,0.0,0.0,...,1188.9,1042.2,938.58000,755.440000,70.0,0.0,0.0,0.000000,0.0,0.0
5018,CTY ĐIỆN LỰC BẮC KẠN,Thác Giềng 1,Thác Giềng 1,G2A219S000M371,2021-02-17,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363194,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ NẬM NÚA,Nậm Núa,G2A136S000M131,2021-01-01,0.0,0.0,0.00,0.0,0.0,...,11.2,10.7,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
365812,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ PA KHOANG,Pa Khoang,G2A033S000M371,2021-01-01,1185.2,1185.8,1186.10,1185.8,1185.2,...,2729.2,2737.5,1255.00000,1134.000000,1133.7,1134.0,1133.7,1132.600000,1133.7,1133.8
367079,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Sông Mã 3,Sông Mã 3,G2A258S000M131,2022-09-06,0.0,0.0,0.00,0.0,0.0,...,7376.8,7371.7,7112.40000,4822.500000,4949.6,6199.1,2912.2,0.093568,0.0,0.0
368446,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ TRUNG THU,Trung Thu,G2A036S000M131,2021-01-01,0.0,0.0,0.00,0.0,0.0,...,15667.0,14904.0,0.68448,0.000002,0.0,0.0,0.0,0.000000,0.0,0.0


In [39]:
df_sanluong = pd.read_excel(r"C:\Khue\TDN\data\interim\sanluong_chuanhoa_ver2.xlsx", sheet_name="Sheet1")
df_sanluong

Unnamed: 0,CTDL,NMTD,NMTD_Standardized,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-01,951.30,1027.30,151.64,0.00,0.00,...,2540.2,2084.5,1339.3,1341.5,1204.60,992.9,1154.5,1185.30,1037.8,1027.30
1,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-02,936.66,966.84,855.54,0.00,0.00,...,2719.9,2276.8,1344.0,1572.1,1374.20,1370.9,1143.2,1130.30,1033.2,994.98
2,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-03,961.38,952.14,971.46,993.72,811.86,...,1040.8,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.30
3,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-04,815.72,957.18,0.00,0.00,0.00,...,2713.6,2347.4,1321.7,1415.0,1423.40,1209.6,1222.2,1189.40,1020.6,0.06
4,CTY ĐIỆN LỰC BẮC KẠN,NMTĐ NẬM CẮT,Nậm Cắt,G2A123S000M371,2021-01-05,0.00,0.00,0.00,0.00,0.00,...,3047.5,2702.7,1774.5,1538.9,1444.40,1335.6,1221.8,518.30,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371317,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,Đề Bâu,G2A283S000M331,2024-08-04,6500.00,6500.00,6600.00,6500.00,6500.00,...,6500.0,6500.0,6500.0,6500.0,6500.00,6500.0,1300.0,3900.00,6600.0,6600.00
371318,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,Đề Bâu,G2A283S000M331,2024-08-05,6500.00,6600.00,6600.00,6500.00,6600.00,...,6600.0,6500.0,6600.0,6600.0,6500.00,6600.0,6500.0,6600.00,6500.0,6600.00
371319,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,Đề Bâu,G2A283S000M331,2024-08-06,6500.00,5400.00,,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6600.00,6600.0,6600.0,6600.00,6600.0,6600.00
371320,CTY ĐIỆN LỰC ĐIỆN BIÊN,NMTĐ Đề Bâu,Đề Bâu,G2A283S000M331,2024-08-07,6600.00,6600.00,6600.00,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.0,6500.00,6600.0,6600.0,6600.00,6600.0,6600.00


In [40]:
# Xóa cột 'NMTD' và đổi tên cột 'NMTD_Standardized' thành 'NMTD' trong df_sanluong
df_sanluong = df_sanluong.drop(columns=['NMTD'])
df_sanluong = df_sanluong.rename(columns={'NMTD_Standardized': 'NMTD'})
df_sanluong.head()

Unnamed: 0,CTDL,NMTD,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,03:00,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-01,951.3,1027.3,151.64,0.0,0.0,0.0,...,2540.2,2084.5,1339.3,1341.5,1204.6,992.9,1154.5,1185.3,1037.8,1027.3
1,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-02,936.66,966.84,855.54,0.0,0.0,0.0,...,2719.9,2276.8,1344.0,1572.1,1374.2,1370.9,1143.2,1130.3,1033.2,994.98
2,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-03,961.38,952.14,971.46,993.72,811.86,948.36,...,1040.8,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.3
3,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-04,815.72,957.18,0.0,0.0,0.0,0.0,...,2713.6,2347.4,1321.7,1415.0,1423.4,1209.6,1222.2,1189.4,1020.6,0.06
4,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-05,0.0,0.0,0.0,0.0,0.0,0.0,...,3047.5,2702.7,1774.5,1538.9,1444.4,1335.6,1221.8,518.3,0.0,0.0


In [41]:
# Merge cột 'TD_THAMCHIEU' từ df_TTT vào df_sanluong dựa trên key 'NMTD' (df_sanluong) và 'TENNHAMAY' (df_TTT)
df_sanluong = df_sanluong.merge(
    df_TTT[['TENNHAMAY', 'TD_THAMCHIEU']],
    left_on='NMTD',
    right_on='TENNHAMAY',
    how='left'
)
# Xóa cột 'TENNHAMAY' dư thừa sau khi merge
df_sanluong = df_sanluong.drop(columns=['TENNHAMAY'])
df_sanluong

Unnamed: 0,CTDL,NMTD,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,03:00,...,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00,TD_THAMCHIEU
0,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-01,951.30,1027.30,151.64,0.00,0.00,0.00,...,2084.5,1339.3,1341.5,1204.60,992.9,1154.5,1185.30,1037.8,1027.30,
1,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-02,936.66,966.84,855.54,0.00,0.00,0.00,...,2276.8,1344.0,1572.1,1374.20,1370.9,1143.2,1130.30,1033.2,994.98,
2,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-03,961.38,952.14,971.46,993.72,811.86,948.36,...,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.30,
3,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-04,815.72,957.18,0.00,0.00,0.00,0.00,...,2347.4,1321.7,1415.0,1423.40,1209.6,1222.2,1189.40,1020.6,0.06,
4,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-05,0.00,0.00,0.00,0.00,0.00,0.00,...,2702.7,1774.5,1538.9,1444.40,1335.6,1221.8,518.30,0.0,0.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372455,CTY ĐIỆN LỰC ĐIỆN BIÊN,Đề Bâu,G2A283S000M331,2024-08-04,6500.00,6500.00,6600.00,6500.00,6500.00,6500.00,...,6500.0,6500.0,6500.0,6500.00,6500.0,1300.0,3900.00,6600.0,6600.00,LAI_CHAU
372456,CTY ĐIỆN LỰC ĐIỆN BIÊN,Đề Bâu,G2A283S000M331,2024-08-05,6500.00,6600.00,6600.00,6500.00,6600.00,6500.00,...,6500.0,6600.0,6600.0,6500.00,6600.0,6500.0,6600.00,6500.0,6600.00,LAI_CHAU
372457,CTY ĐIỆN LỰC ĐIỆN BIÊN,Đề Bâu,G2A283S000M331,2024-08-06,6500.00,5400.00,,6600.00,6600.00,6600.00,...,6600.0,6600.0,6600.0,6600.00,6600.0,6600.0,6600.00,6600.0,6600.00,LAI_CHAU
372458,CTY ĐIỆN LỰC ĐIỆN BIÊN,Đề Bâu,G2A283S000M331,2024-08-07,6600.00,6600.00,6600.00,6600.00,6600.00,6600.00,...,6600.0,6600.0,6600.0,6500.00,6600.0,6600.0,6600.00,6600.0,6600.00,LAI_CHAU


In [42]:
df_sanluong['Ngày'] = pd.to_datetime(df_sanluong['Ngày']).dt.date
df_sanluong.head()

Unnamed: 0,CTDL,NMTD,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,03:00,...,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00,TD_THAMCHIEU
0,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-01,951.3,1027.3,151.64,0.0,0.0,0.0,...,2084.5,1339.3,1341.5,1204.6,992.9,1154.5,1185.3,1037.8,1027.3,
1,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-02,936.66,966.84,855.54,0.0,0.0,0.0,...,2276.8,1344.0,1572.1,1374.2,1370.9,1143.2,1130.3,1033.2,994.98,
2,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-03,961.38,952.14,971.46,993.72,811.86,948.36,...,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.3,
3,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-04,815.72,957.18,0.0,0.0,0.0,0.0,...,2347.4,1321.7,1415.0,1423.4,1209.6,1222.2,1189.4,1020.6,0.06,
4,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,G2A123S000M371,2021-01-05,0.0,0.0,0.0,0.0,0.0,0.0,...,2702.7,1774.5,1538.9,1444.4,1335.6,1221.8,518.3,0.0,0.0,


In [43]:
# Đưa cột 'TD_THAMCHIEU' lên vị trí thứ 2 trong df_sanluong
cols_3 = list(df_sanluong.columns)
cols_3.insert(2, cols_3.pop(cols_3.index('TD_THAMCHIEU')))
df_sanluong = df_sanluong[cols_3]
df_sanluong.head()

Unnamed: 0,CTDL,NMTD,TD_THAMCHIEU,MADIEMDO,Ngày,00:30,01:00,01:30,02:00,02:30,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00
0,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,,G2A123S000M371,2021-01-01,951.3,1027.3,151.64,0.0,0.0,...,2540.2,2084.5,1339.3,1341.5,1204.6,992.9,1154.5,1185.3,1037.8,1027.3
1,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,,G2A123S000M371,2021-01-02,936.66,966.84,855.54,0.0,0.0,...,2719.9,2276.8,1344.0,1572.1,1374.2,1370.9,1143.2,1130.3,1033.2,994.98
2,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,,G2A123S000M371,2021-01-03,961.38,952.14,971.46,993.72,811.86,...,1040.8,1048.3,1041.2,1036.5,798.92,1017.6,1025.2,869.48,1007.1,1006.3
3,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,,G2A123S000M371,2021-01-04,815.72,957.18,0.0,0.0,0.0,...,2713.6,2347.4,1321.7,1415.0,1423.4,1209.6,1222.2,1189.4,1020.6,0.06
4,CTY ĐIỆN LỰC BẮC KẠN,Nậm Cắt,,G2A123S000M371,2021-01-05,0.0,0.0,0.0,0.0,0.0,...,3047.5,2702.7,1774.5,1538.9,1444.4,1335.6,1221.8,518.3,0.0,0.0


In [None]:
# import df_sanluong from C:\Khue\TDN\data\interim\sanluong_chuanhoa.xlsx

In [44]:
# export df_sanluong to xlsx
output_folder = r"C:\Khue\TDN\data\interim"
output_file = os.path.join(output_folder, "sanluong_chuanhoa_ver2.xlsx")
df_sanluong.to_excel(output_file, index=False, engine="openpyxl")

In [45]:
# export df_sanluong to parquet
output_file_parquet = os.path.join(output_folder, "sanluong_chuanhoa_ver2.parquet")
df_sanluong.to_parquet(output_file_parquet, index=False, engine="pyarrow")
