In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from scipy import stats

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression

# from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb

import plotly.graph_objects as go
from plotly.subplots import make_subplots



In [5]:
# Read the datathon extract from the working directory
# (edit the filename below if the CSV lives somewhere else).
df = pd.read_csv(filepath_or_buffer='unsw_datathon_2025.csv')

# Report the (rows, columns) size, then preview the first five records.
print(f"Dataset shape: {df.shape}")
df.head(n=5)

Dataset shape: (97429, 76)


Unnamed: 0,tarrdatetime_year,arrdatetime_year,depdatetime_year,admdatetimeop_year,sdatetime_year,gdate_year,wdisch_year,hdisch_year,tarrdatetime_month,arrdatetime_month,...,fwalk2,fbonemed2,fop2,ahos_code,surg,gerimed,mort30d,mort90d,mort120d,mort365d
0,2016.0,2016.0,2016.0,,2016.0,2016.0,2016.0,2016.0,1.0,1.0,...,,,,q2dGVL,2,1.0,1.0,1.0,1.0,2.0
1,,2016.0,2016.0,,2016.0,2016.0,2016.0,2016.0,,1.0,...,3.0,3.0,1.0,5ndV7C,2,1.0,1.0,1.0,1.0,1.0
2,2016.0,2016.0,2016.0,,2016.0,2016.0,2016.0,2016.0,1.0,1.0,...,,,,NW06AU,2,1.0,1.0,1.0,1.0,1.0
3,,2016.0,2016.0,,2016.0,2016.0,2016.0,2016.0,,1.0,...,,,,GZPU45,2,1.0,1.0,2.0,2.0,2.0
4,,2016.0,2016.0,,2016.0,2016.0,2016.0,2016.0,,1.0,...,1.0,2.0,1.0,rHZmbk,2,1.0,1.0,1.0,1.0,1.0


In [23]:
# Print the frame's index range plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97429 entries, 0 to 97428
Data columns (total 76 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   tarrdatetime_year       14670 non-null  float64
 1   arrdatetime_year        93275 non-null  float64
 2   depdatetime_year        90073 non-null  float64
 3   admdatetimeop_year      2089 non-null   float64
 4   sdatetime_year          94496 non-null  float64
 5   gdate_year              85012 non-null  float64
 6   wdisch_year             95785 non-null  float64
 7   hdisch_year             86931 non-null  float64
 8   tarrdatetime_month      14670 non-null  float64
 9   arrdatetime_month       93275 non-null  float64
 10  depdatetime_month       90073 non-null  float64
 11  admdatetimeop_month     2089 non-null   float64
 12  sdatetime_month         94496 non-null  float64
 13  gdate_month             85012 non-null  float64
 14  wdisch_month            95785 non-null

In [17]:
# Count the missing values in every column and tabulate them as a
# two-column frame: the column name and its number of nulls.
null_counts_df = (
    df.isna()
    .sum()
    .rename_axis('Column')
    .reset_index(name='Null Count')
)

# Raise the global row-display limit so the whole table renders untruncated.
pd.set_option('display.max_rows', 79)

null_counts_df

Unnamed: 0,Column,Null Count
0,tarrdatetime_year,82759
1,arrdatetime_year,4154
2,depdatetime_year,7356
3,admdatetimeop_year,95340
4,sdatetime_year,2933
5,gdate_year,12417
6,wdisch_year,1644
7,hdisch_year,10498
8,tarrdatetime_month,82759
9,arrdatetime_month,4154


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-column null counts.
null_counts_df.describe()


Unnamed: 0,Null Count
count,76.0
mean,22521.236842
std,32146.150121
min,0.0
25%,1837.0
50%,5016.5
75%,26405.75
max,95340.0


In [22]:
# Load the data dictionary (update path as needed). It is read from the working
# directory, the same way the main dataset is, instead of from a user-specific
# absolute path, so the notebook runs on any machine and does not expose a
# local home-directory layout.
dic = pd.read_csv('unsw_datathon_2025_data_dict.csv')

# Preview up to the first 100 dictionary rows.
dic.head(100)

Unnamed: 0,pos,variable,label,col_type,missing,levels,value_labels
0,1,tarrdatetime_year,Transfer Hospital Arrival Year,dbl,82759,,
1,2,arrdatetime_year,Operating Hospital Arrival Year,dbl,4154,,
2,3,depdatetime_year,Operating Hospital Departure Year,dbl,7356,,
3,4,admdatetimeop_year,In-patient Fracture Year,dbl,95340,,
4,5,sdatetime_year,Hip Fracture Surgery Year,dbl,2933,,
5,6,gdate_year,Geriatric Medicine Assessment Year,dbl,12417,,
6,7,wdisch_year,Discharge From Acute Ward Year,dbl,1644,,
7,8,hdisch_year,Discharge From Hospital Year,dbl,10498,,
8,9,tarrdatetime_month,Transfer Hospital Arrival Month,dbl,82759,,
9,10,arrdatetime_month,Operating Hospital Arrival Month,dbl,4154,,
