In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the dataset, skipping the first row
df = pd.read_csv("ILINet.csv", skiprows=1)

# Display the first few rows to check
print(df.head())
print(df.info())

  REGION TYPE REGION  YEAR  WEEK  % WEIGHTED ILI  %UNWEIGHTED ILI  AGE 0-4  \
0    National      X  1997    40         1.10148          1.21686      179   
1    National      X  1997    41         1.20007          1.28064      199   
2    National      X  1997    42         1.37876          1.23906      228   
3    National      X  1997    43         1.19920          1.14473      188   
4    National      X  1997    44         1.65618          1.26112      217   

  AGE 25-49 AGE 25-64  AGE 5-24 AGE 50-64  AGE 65  ILITOTAL  \
0         X       157       205         X      29       570   
1         X       151       242         X      23       615   
2         X       153       266         X      34       681   
3         X       193       236         X      36       653   
4         X       162       280         X      41       700   

   NUM. OF PROVIDERS  TOTAL PATIENTS  
0                192           46842  
1                191           48023  
2                219           5496

## Preprocessing

In [10]:
#Handling Categorical Variables
#'REGION TYPE'
#'REGION'
#convert to dummy variables in order to be used in a model
df = pd.get_dummies(df, columns=['REGION TYPE', 'REGION'], drop_first=True)

In [12]:
print(df.head())

   YEAR  WEEK  % WEIGHTED ILI  %UNWEIGHTED ILI  AGE 0-4 AGE 25-49 AGE 25-64  \
0  1997    40         1.10148          1.21686      179         X       157   
1  1997    41         1.20007          1.28064      199         X       151   
2  1997    42         1.37876          1.23906      228         X       153   
3  1997    43         1.19920          1.14473      188         X       193   
4  1997    44         1.65618          1.26112      217         X       162   

   AGE 5-24 AGE 50-64  AGE 65  ILITOTAL  NUM. OF PROVIDERS  TOTAL PATIENTS  
0       205         X      29       570                192           46842  
1       242         X      23       615                191           48023  
2       266         X      34       681                219           54961  
3       236         X      36       653                213           57044  
4       280         X      41       700                213           55506  


In [18]:
# Standardize Numeric Features
# Data has features on different scales -> percentages vs raw counts, need to scale
# Using StandardScaler to make sure all values have a zero mean and unit variance

# Convert problematic columns to numeric (replace 'X' with NaN and fill with median)
for col in ['AGE 25-49', 'AGE 25-64', 'AGE 50-64']:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert 'X' to NaN
    df[col] = df[col].fillna(df[col].median())  # Replace NaNs with median value

# Identify numeric columns again after cleaning
numeric_cols = ['% WEIGHTED ILI', '%UNWEIGHTED ILI', 'AGE 0-4', 'AGE 25-49', 
                'AGE 25-64', 'AGE 5-24', 'AGE 50-64', 'AGE 65', 'ILITOTAL', 
                'NUM. OF PROVIDERS', 'TOTAL PATIENTS']

# Standardize numeric features
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [20]:
print(df.head())

   YEAR  WEEK  % WEIGHTED ILI  %UNWEIGHTED ILI   AGE 0-4  AGE 25-49  \
0  1997    40       -0.542828        -0.444555 -0.745622  -0.284656   
1  1997    41       -0.476397        -0.399487 -0.742095  -0.284656   
2  1997    42       -0.355993        -0.428868 -0.736981  -0.284656   
3  1997    43       -0.476983        -0.495523 -0.744035  -0.284656   
4  1997    44       -0.169063        -0.413280 -0.738920  -0.284656   

   AGE 25-64  AGE 5-24  AGE 50-64    AGE 65  ILITOTAL  NUM. OF PROVIDERS  \
0  -0.693968 -0.653795  -0.273357 -0.514437 -0.646937          -1.295188   
1  -0.700287 -0.649881  -0.273357 -0.516591 -0.645309          -1.296069   
2  -0.698181 -0.647342  -0.273357 -0.512642 -0.642922          -1.271403   
3  -0.656057 -0.650515  -0.273357 -0.511925 -0.643935          -1.276689   
4  -0.688703 -0.645861  -0.273357 -0.510130 -0.642235          -1.276689   

   TOTAL PATIENTS  
0       -1.029240  
1       -1.027559  
2       -1.017683  
3       -1.014718  
4       -1.01690

## Training

In [22]:
# Define features (X) and target variable (y)
X = df.drop(columns=['% WEIGHTED ILI'])  # Features
y = df['% WEIGHTED ILI']  # Target variable

In [24]:
# Split into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)  # Shuffle=False for time series


In [26]:
### Step 4: Output Data Information
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (1147, 12)
Testing set size: (287, 12)


## Conclusions

###
Training and testing split appears well balanced. With 1147 training samples, there is enough historical data to learn patterns. Also with 287 test samples, there is enough unseen data to fairly evaluate performance. 