In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb

In [6]:
dt=pd.read_csv('Weather_with_labels.csv')
df=dt.copy()
print( dt.columns.to_list() )
print( dt.shape )
print( dt.head() )
print( dt.describe() )
print( dt.info() )

['Time (LST)', 'Air Temp (°F)', '0.5 m Air Temp (°F)', '1.5 m Air Temp (°F)', '3 m Air Temp (°F)', 'Relative Humidity (%)', 'Precipitation (in)', 'Accumulated Precip (in)', 'Solar Radiation (W/m²)', 'Wind Speed (mph)', 'Wind Direction (°)', 'Wind Gust (mph)', '4" Bare Soil Temp (°F)', '4" Grass Soil Temp (°F)', '2" Soil Temp (°F)', '2" Soil Water Content (%)', '4" Soil Temp (°F)', '4" Soil Water Content (%)', '8" Soil Temp (°F)', '8" Soil Water Content (%)', '20" Soil Temp (°F)', '20" Soil Water Content (%)', 'Inversion Strength', 'Max Inversion', 'Battery Voltage', 'Heat Index (°F)', 'Wind Chill (°F)', 'Path']
(865, 28)
            Time (LST)  Air Temp (°F)  ...  Wind Chill (°F)  Path
0  2025-04-27 18:00:00           69.4  ...              NaN     C
1  2025-04-27 17:30:00           70.0  ...              NaN     C
2  2025-04-27 17:00:00           69.6  ...              NaN     C
3  2025-04-27 16:30:00           69.6  ...              NaN     C
4  2025-04-27 16:00:00           69.3  ..

In [11]:
# 2) Drop rows where Path is missing (we can’t train on those)
df = df.dropna(subset=['Path'])

# 3) Convert the time column into useful numeric features
df['Time (LST)'] = pd.to_datetime(df['Time (LST)'])
df['hour']      = df['Time (LST)'].dt.hour
df['dayofweek'] = df['Time (LST)'].dt.dayofweek
df['month']     = df['Time (LST)'].dt.month

# 4) Clean up any non-numeric columns
#    e.g. Solar Radiation may be object; coerce to float
df['Solar Radiation (W/m²)'] = pd.to_numeric(
    df['Solar Radiation (W/m²)'], errors='coerce'
)

# 5) Handle missing values in your numeric features
#    Here: fill with median (you could also drop sparse cols)
num_cols = df.select_dtypes(include=['float64','int64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# 6) Encode your target
le = LabelEncoder()
df['Path_enc'] = le.fit_transform(df['Path'])

# 7) Define X and y
#    Drop original Time, the raw Path, and the encoded target
X = df.drop(columns=['Time (LST)', 'Path', 'Path_enc'])
y = df['Path_enc']

# 8) Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Clean column names before fitting the model
# This will replace special characters and spaces with underscores
X_train.columns = X_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_test.columns = X_test.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

# Now fit the model with clean feature names
model = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42,
    verbose=100
)
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

# 10) Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(
    y_test, y_pred,
    target_names=le.classes_
))

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.912757
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.191811
[LightGBM] [Debug] init for col-wise cost 0.000875 seconds, init for row-wise cost 0.000283 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1907
[LightGBM] [Info] Number of data points in the train set: 682, number of used features: 27
[LightGBM] [Info] Start training from score -1.023771
[LightGBM] [Info] Start training from score -1.100080
[LightGBM] [Info] Start training from score -1.177922
[LightGBM] [Debug] Trained a tree with leaves = 18 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 26 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 19 and depth = 5
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree wi

In [None]:

# SLipt the data into train and test in aleatory not by time
from sklearn.model_selection import train_test_split
X = dt.drop(columns=['Path'])
y = dt['Path']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the model
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)
# Predict the test set
y_pred = model.predict(X_test)
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
import joblib
joblib.dump(model, 'lgbm_model.pkl')

accuracy = accuracy_score(y_test, y_pred)
f1_score = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')

# Save the model
