In [2]:
import pandas as pd
import os
import glob


In [3]:
# Root directory of the competition data (Kaggle input mount)
data_path = "/kaggle/input/march-machine-learning-mania-2025"


def read_csv_with_fallback(path, encodings=("utf-8", "ISO-8859-1", "cp1252")):
    """Read a CSV file, retrying with the next encoding after a decode failure.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.
    encodings : sequence of str
        Encodings to try, in order. Note that ISO-8859-1 assigns a character
        to every byte value, so it never raises ``UnicodeDecodeError``; any
        entry listed after it is effectively never reached. The default order
        is kept so that files are decoded exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file, decoded with the first encoding that succeeds.

    Raises
    ------
    ValueError
        If ``encodings`` is empty.
    UnicodeDecodeError
        If the file cannot be decoded with any of ``encodings``.

    Any other error raised by ``pd.read_csv`` (parse error, empty file,
    missing file, ...) propagates unchanged, because retrying with a
    different encoding would not help.
    """
    if not encodings:
        raise ValueError("encodings must contain at least one encoding name")

    file_name = os.path.basename(path)
    last_index = len(encodings) - 1
    for index, encoding in enumerate(encodings):
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError:
            if index == last_index:
                # Nothing left to try: let the caller decide what to do.
                raise
            # Messages match the wording used for the first / later retries.
            failed_label = encoding.upper() + (" 디코딩 실패" if index == 0 else "도 실패")
            print(f"⚠️ {failed_label}: {file_name} → {encodings[index + 1].upper()}로 재시도")


def load_csv_files(paths):
    """Load every CSV in ``paths`` into a dict keyed by the file's base name.

    A file that cannot be loaded is reported and skipped instead of aborting
    the whole run, whichever read attempt the error came from.

    Parameters
    ----------
    paths : iterable of str
        CSV file paths to load.

    Returns
    -------
    (dict, dict)
        ``loaded`` maps base file name -> DataFrame; ``failed`` maps base
        file name -> the exception that prevented loading.
    """
    loaded = {}
    failed = {}
    for path in paths:
        # basename handles the platform's path separator, unlike split("/")
        file_name = os.path.basename(path)
        try:
            frame = read_csv_with_fallback(path)
        except Exception as e:
            print(f"❌ {file_name} 파일 로드 실패: {e}")
            failed[file_name] = e
            continue
        if file_name in loaded:
            # The recursive search can return the same base name from two
            # sub-folders; make the overwrite visible instead of silent.
            print(f"⚠️ 중복된 파일명: {file_name} → 기존 데이터를 덮어씁니다")
        loaded[file_name] = frame
    return loaded, failed


# Find CSV files in all sub-folders; sorted so the load order is reproducible
csv_files = sorted(glob.glob(f"{data_path}/**/*.csv", recursive=True))

# Load everything; files that could not be read are kept in failed_files
data_dict, failed_files = load_csv_files(csv_files)

# Report how many files were loaded
print(f"총 {len(data_dict)}개의 CSV 파일을 성공적으로 불러왔습니다.")

⚠️ UTF-8 디코딩 실패: MTeamSpellings.csv → ISO-8859-1로 재시도
⚠️ UTF-8 디코딩 실패: WTeamSpellings.csv → ISO-8859-1로 재시도
총 36개의 CSV 파일을 성공적으로 불러왔습니다.


In [4]:
# Report the dimensions (rows, columns) of every loaded DataFrame
for csv_name, frame in data_dict.items():
    n_rows, n_cols = frame.shape
    print(f"{csv_name}: {n_rows} rows, {n_cols} columns")


Conferences.csv: 51 rows, 2 columns
MSeasons.csv: 41 rows, 6 columns
Cities.csv: 502 rows, 3 columns
WSeasons.csv: 28 rows, 6 columns
SeedBenchmarkStage1.csv: 507108 rows, 2 columns
MSecondaryTourneyTeams.csv: 1836 rows, 3 columns
SampleSubmissionStage2.csv: 131407 rows, 2 columns
MMasseyOrdinals.csv: 5489117 rows, 5 columns
SampleSubmissionStage1.csv: 507108 rows, 2 columns
WSecondaryTourneyTeams.csv: 824 rows, 3 columns
MGameCities.csv: 85534 rows, 6 columns
WGameCities.csv: 82329 rows, 6 columns
MTeamSpellings.csv: 1177 rows, 2 columns
WTeamSpellings.csv: 1170 rows, 2 columns
WTeams.csv: 378 rows, 2 columns
MTeamCoaches.csv: 13533 rows, 5 columns
MTeamConferences.csv: 13388 rows, 3 columns
MTeams.csv: 380 rows, 4 columns
WTeamConferences.csv: 9490 rows, 3 columns
WNCAATourneyDetailedResults.csv: 894 rows, 34 columns
MNCAATourneyCompactResults.csv: 2518 rows, 8 columns
WNCAATourneyCompactResults.csv: 1650 rows, 8 columns
MNCAATourneyDetailedResults.csv: 1382 rows, 34 columns
WSeconda

In [5]:
# Preview the first five rows of every loaded file
for csv_name, frame in data_dict.items():
    print(f"\n📂 파일: {csv_name}")
    preview = frame.head()
    display(preview)  # display() is used for table rendering in the Kaggle notebook




📂 파일: Conferences.csv


Unnamed: 0,ConfAbbrev,Description
0,a_sun,Atlantic Sun Conference
1,a_ten,Atlantic 10 Conference
2,aac,American Athletic Conference
3,acc,Atlantic Coast Conference
4,aec,America East Conference



📂 파일: MSeasons.csv


Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/02/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast



📂 파일: Cities.csv


Unnamed: 0,CityID,City,State
0,4001,Abilene,TX
1,4002,Akron,OH
2,4003,Albany,NY
3,4004,Albuquerque,NM
4,4005,Allentown,PA



📂 파일: WSeasons.csv


Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1998,10/27/1997,East,Midwest,Mideast,West
1,1999,10/26/1998,East,Mideast,Midwest,West
2,2000,11/01/1999,East,Midwest,Mideast,West
3,2001,10/30/2000,East,Midwest,Mideast,West
4,2002,10/29/2001,East,West,Mideast,Midwest



📂 파일: SeedBenchmarkStage1.csv


Unnamed: 0,ID,Pred
0,2021_1101_1102,0.5
1,2021_1101_1103,0.5
2,2021_1101_1104,0.14
3,2021_1101_1105,0.5
4,2021_1101_1106,0.5



📂 파일: MSecondaryTourneyTeams.csv


Unnamed: 0,Season,SecondaryTourney,TeamID
0,1985,NIT,1108
1,1985,NIT,1133
2,1985,NIT,1139
3,1985,NIT,1145
4,1985,NIT,1151



📂 파일: SampleSubmissionStage2.csv


Unnamed: 0,ID,Pred
0,2025_1101_1102,0.5
1,2025_1101_1103,0.5
2,2025_1101_1104,0.5
3,2025_1101_1105,0.5
4,2025_1101_1106,0.5



📂 파일: MMasseyOrdinals.csv


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260



📂 파일: SampleSubmissionStage1.csv


Unnamed: 0,ID,Pred
0,2021_1101_1102,0.5
1,2021_1101_1103,0.5
2,2021_1101_1104,0.5
3,2021_1101_1105,0.5
4,2021_1101_1106,0.5



📂 파일: WSecondaryTourneyTeams.csv


Unnamed: 0,Season,SecondaryTourney,TeamID
0,2013,WBI,3125
1,2013,WBI,3158
2,2013,WBI,3178
3,2013,WBI,3184
4,2013,WBI,3189



📂 파일: MGameCities.csv


Unnamed: 0,Season,DayNum,WTeamID,LTeamID,CRType,CityID
0,2010,7,1143,1293,Regular,4027
1,2010,7,1314,1198,Regular,4061
2,2010,7,1326,1108,Regular,4080
3,2010,7,1393,1107,Regular,4340
4,2010,9,1143,1178,Regular,4027



📂 파일: WGameCities.csv


Unnamed: 0,Season,DayNum,WTeamID,LTeamID,CRType,CityID
0,2010,11,3103,3237,Regular,4002
1,2010,11,3104,3399,Regular,4085
2,2010,11,3110,3224,Regular,4363
3,2010,11,3111,3267,Regular,4158
4,2010,11,3119,3447,Regular,4367



📂 파일: MTeamSpellings.csv


Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,1394
1,a&m-corpus christi,1394
2,abilene chr,1101
3,abilene christian,1101
4,abilene-christian,1101



📂 파일: WTeamSpellings.csv


Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,3394
1,a&m-corpus christi,3394
2,abilene chr,3101
3,abilene christian,3101
4,abilene-christian,3101



📂 파일: WTeams.csv


Unnamed: 0,TeamID,TeamName
0,3101,Abilene Chr
1,3102,Air Force
2,3103,Akron
3,3104,Alabama
4,3105,Alabama A&M



📂 파일: MTeamCoaches.csv


Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName
0,1985,1102,0,154,reggie_minton
1,1985,1103,0,154,bob_huggins
2,1985,1104,0,154,wimp_sanderson
3,1985,1106,0,154,james_oliver
4,1985,1108,0,154,davey_whitney



📂 파일: MTeamConferences.csv


Unnamed: 0,Season,TeamID,ConfAbbrev
0,1985,1102,wac
1,1985,1103,ovc
2,1985,1104,sec
3,1985,1106,swac
4,1985,1108,swac



📂 파일: MTeams.csv


Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2025
1,1102,Air Force,1985,2025
2,1103,Akron,1985,2025
3,1104,Alabama,1985,2025
4,1105,Alabama A&M,2000,2025



📂 파일: WTeamConferences.csv


Unnamed: 0,Season,TeamID,ConfAbbrev
0,1998,3102,wac
1,1998,3103,mac
2,1998,3104,sec
3,1998,3106,swac
4,1998,3108,swac



📂 파일: WNCAATourneyDetailedResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2010,138,3124,69,3201,55,N,0,28,57,...,34,3,5,17,19,12,18,4,1,18
1,2010,138,3173,67,3395,66,N,0,23,59,...,27,14,15,18,26,8,8,8,6,22
2,2010,138,3181,72,3214,37,H,0,26,57,...,15,3,8,10,21,4,16,6,4,20
3,2010,138,3199,75,3256,61,H,0,25,63,...,20,17,22,16,21,13,16,5,4,24
4,2010,138,3207,62,3265,42,N,0,24,68,...,26,11,17,16,22,9,10,3,4,12



📂 파일: MNCAATourneyCompactResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0



📂 파일: WNCAATourneyCompactResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,137,3104,94,3422,46,H,0
1,1998,137,3112,75,3365,63,H,0
2,1998,137,3163,93,3193,52,H,0
3,1998,137,3198,59,3266,45,H,0
4,1998,137,3203,74,3208,72,A,0



📂 파일: MNCAATourneyDetailedResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19



📂 파일: WSecondaryTourneyCompactResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,SecondaryTourney
0,2013,135,3120,80,3412,57,H,0,WNIT
1,2013,135,3140,69,3226,54,H,0,WNIT
2,2013,135,3158,72,3297,70,A,0,WBI
3,2013,135,3183,60,3281,58,A,0,WNIT
4,2013,135,3189,68,3184,61,H,0,WBI



📂 파일: MSecondaryTourneyCompactResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,SecondaryTourney
0,1985,136,1151,67,1155,65,H,0,NIT
1,1985,136,1153,77,1245,61,H,0,NIT
2,1985,136,1201,79,1365,76,H,0,NIT
3,1985,136,1231,79,1139,57,H,0,NIT
4,1985,136,1249,78,1222,71,H,0,NIT



📂 파일: WRegularSeasonCompactResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,18,3104,91,3202,41,H,0
1,1998,18,3163,87,3221,76,H,0
2,1998,18,3222,66,3261,59,H,0
3,1998,18,3307,69,3365,62,H,0
4,1998,18,3349,115,3411,35,H,0



📂 파일: MRegularSeasonDetailedResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14



📂 파일: MRegularSeasonCompactResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0



📂 파일: WRegularSeasonDetailedResults.csv


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2010,11,3103,63,3237,49,H,0,23,54,...,13,6,10,11,27,11,23,7,6,19
1,2010,11,3104,73,3399,68,N,0,26,62,...,21,14,27,14,26,7,20,4,2,27
2,2010,11,3110,71,3224,59,A,0,29,62,...,14,19,23,17,23,8,15,6,0,15
3,2010,11,3111,63,3267,58,A,0,27,52,...,26,16,25,22,22,15,11,14,5,14
4,2010,11,3119,74,3447,70,H,1,30,74,...,17,11,21,21,32,12,14,4,2,14



📂 파일: WConferenceTourneyGames.csv


Unnamed: 0,Season,ConfAbbrev,DayNum,WTeamID,LTeamID
0,2002,a_sun,129,3194,3144
1,2002,a_sun,129,3209,3407
2,2002,a_sun,129,3273,3125
3,2002,a_sun,129,3391,3416
4,2002,a_sun,130,3194,3273



📂 파일: MConferenceTourneyGames.csv


Unnamed: 0,Season,ConfAbbrev,DayNum,WTeamID,LTeamID
0,2001,a_sun,121,1194,1144
1,2001,a_sun,121,1416,1240
2,2001,a_sun,122,1209,1194
3,2001,a_sun,122,1359,1239
4,2001,a_sun,122,1391,1273



📂 파일: MNCAATourneySeedRoundSlots.csv


Unnamed: 0,Seed,GameRound,GameSlot,EarlyDayNum,LateDayNum
0,W01,1,R1W1,136,137
1,W01,2,R2W1,138,139
2,W01,3,R3W1,143,144
3,W01,4,R4W1,145,146
4,W01,5,R5WX,152,152



📂 파일: WNCAATourneySlots.csv


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
0,1998,R1W1,W01,W16
1,1998,R1W2,W02,W15
2,1998,R1W3,W03,W14
3,1998,R1W4,W04,W13
4,1998,R1W5,W05,W12



📂 파일: MNCAATourneySlots.csv


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
0,1985,R1W1,W01,W16
1,1985,R1W2,W02,W15
2,1985,R1W3,W03,W14
3,1985,R1W4,W04,W13
4,1985,R1W5,W05,W12



📂 파일: MNCAATourneySeeds.csv


Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374



📂 파일: WNCAATourneySeeds.csv


Unnamed: 0,Season,Seed,TeamID
0,1998,W01,3330
1,1998,W02,3163
2,1998,W03,3112
3,1998,W04,3301
4,1998,W05,3272


In [8]:
# Report, per file, only the columns that contain at least one missing value
for file, df in data_dict.items():
    null_counts = df.isna().sum()
    missing_values = null_counts[null_counts > 0]  # keep columns with missing values only
    if missing_values.empty:
        continue
    print(f"\n📂 {file} (결측값 있는 컬럼)")
    print(missing_values)