In [None]:
#%%

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# %%

# The CSV is read without a header row, so the column names are supplied explicitly.
data = pd.read_csv(
    "provided_data.csv",
    names=['frame', 'xc', 'yc', 'w', 'h', 'effort'],
    header=None,
)

In [None]:
# %%

# Load the per-frame labels; later cells rely on this table providing
# a 'frame' column (join key) and a 'value' column (prediction target).
target = pd.read_csv('target.csv')

In [None]:
# %%

# Summarize column dtypes and non-null counts to see how sparse each column is.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111900 entries, 0 to 111899
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   frame   111900 non-null  int64  
 1   xc      111900 non-null  float64
 2   yc      111900 non-null  float64
 3   w       111900 non-null  float64
 4   h       111900 non-null  float64
 5   effort  1865 non-null    float64
dtypes: float64(5), int64(1)
memory usage: 5.1 MB


In [None]:
# %%

# Count missing values per column before any cleaning.
data.isna().sum()

frame          0
xc             0
yc             0
w              0
h              0
effort    110035
dtype: int64

In [None]:
# %%

# Preview the first rows to sanity-check how the columns were parsed.
data.head()

Unnamed: 0,frame,xc,yc,w,h,effort
0,0,0.0,0.0,0.0175,0.0175,53350.0
1,1,0.02918,0.038191,0.018532,0.018284,
2,2,0.058361,0.076381,0.019563,0.019068,
3,3,0.087541,0.114572,0.020595,0.019852,
4,4,0.116721,0.152763,0.021627,0.020636,


In [None]:
# %%

# Confirm the (rows, columns) dimensions of the raw table.
data.shape

(111900, 6)

In [None]:
# %%

# Force 'effort' to a numeric dtype; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
data['effort'] = pd.to_numeric(data['effort'], errors='coerce')

In [None]:
#%%

# Re-check missing values per column after the numeric coercion of 'effort'.
data.isna().sum()

frame          0
xc             0
yc             0
w              0
h              0
effort    110035
dtype: int64

In [None]:
# %%

# Fill the gaps in 'effort' by linear interpolation between the known readings.
# NOTE(review): an interpolated value depends on the next known reading, i.e. on
# later rows — confirm this look-ahead is acceptable for the modelling task.
data['effort'] = data['effort'].interpolate(method='linear')

In [None]:
# %%

# Cast the join key 'frame' to integer so it has the same dtype as the
# 'frame' column of `target` when the two tables are merged.
data['frame'] = data['frame'].astype(int)

In [None]:
# %%

# Cast the join key 'frame' to integer so it has the same dtype as the
# 'frame' column of `data` when the two tables are merged.
target['frame'] = target['frame'].astype(int)

In [None]:
# %%

# Inner-join the feature table with the labels on the frame number;
# frames that are missing from either table are discarded.
merged = data.merge(target, how='inner', on='frame')

In [None]:
# %%

# Input columns fed to the model, and the label column to predict.
features = [
    'xc',
    'yc',
    'w',
    'h',
    'effort',
]
X = merged.loc[:, features]
y = merged.loc[:, 'value']

In [None]:
# %%

# Standardize every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows, including the ones that end up
# in the test split further down, so test-set statistics leak into the scaling.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
#%%

# Function to create lag features for time series data
def create_lag_features(X, window_size):
    """Build a table where each row holds the current row of X plus its lags.

    For every lag ``i`` in ``0 .. window_size - 1`` the input is shifted down
    by ``i`` rows and its columns are renamed ``"<col>_lag_<i>"``. The shifted
    copies are placed side by side (lag 0 first), and every row that contains
    a NaN is dropped — in particular the first ``window_size - 1`` rows, which
    do not have a full lag history.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Two-dimensional data, one row per time step. Anything accepted by
        ``pd.DataFrame(X)`` works; a NumPy array yields column names
        ``0, 1, ...``.
    window_size : int
        Number of consecutive rows (current row + lags) combined per output
        row. Values <= 0 produce an empty DataFrame.

    Returns
    -------
    pandas.DataFrame
        ``window_size * n_columns`` columns; the index keeps the labels of
        the surviving input rows (it is not reset).
    """
    base = pd.DataFrame(X)

    # Build every shifted copy first and concatenate once: growing a frame
    # with pd.concat inside the loop re-copies all accumulated columns on
    # each iteration, which is quadratic in window_size.
    shifted_frames = [
        base.shift(lag).add_suffix(f"_lag_{lag}") for lag in range(window_size)
    ]

    if not shifted_frames:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(shifted_frames, axis=1).dropna()

In [None]:
# %%

# Each sample row stacks this many consecutive frames (current frame plus lags).
window_size = 100
X_lagged = create_lag_features(X=X_scaled, window_size=window_size)

In [None]:
# %%

# create_lag_features drops the rows whose lag columns contain NaN, i.e. the first
# (window_size - 1) rows; skip the same rows here so labels and frame numbers line
# up with X_lagged (assumes the features themselves contain no other NaN rows).
y_lagged = y.iloc[window_size - 1:]  # Adjust y to align with lagged features
frames_lagged = merged['frame'].iloc[window_size - 1:]  # Get corresponding frame numbers

In [None]:
# %%

# Give features, labels and frame numbers the same fresh 0..n-1 index, trimming
# labels and frames to the number of lagged rows so all three slice together.
X_lagged = X_lagged.reset_index(drop=True)
y_lagged = y_lagged.head(len(X_lagged)).reset_index(drop=True)
frames_lagged = frames_lagged.head(len(X_lagged)).reset_index(drop=True)

In [None]:
# %%

# Chronological 70/30 split: the earliest rows train the model and the latest
# rows are held out, so no shuffling is applied to the time series.
split_index = int(0.7 * len(X_lagged))
X_train, X_test = X_lagged.iloc[:split_index], X_lagged.iloc[split_index:]
y_train, y_test = y_lagged.iloc[:split_index], y_lagged.iloc[split_index:]
# Frame numbers belonging to the held-out rows.
frames_test = frames_lagged.iloc[split_index:]

In [None]:
# %%

'''Logistic Regression'''
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Derive 'balanced' weights from the training labels only, then map each
# class label to its weight so the mapping can be handed to the classifier.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}

In [None]:
# %%

# Logistic regression (liblinear solver) using the per-class weights computed
# from the training labels; the seed is fixed so repeated runs match.
log_reg_model = LogisticRegression(
    solver='liblinear',
    class_weight=class_weight_dict,
    random_state=42,
)

# Fit on the chronologically earlier (training) rows only.
log_reg_model.fit(X_train, y_train)

In [None]:
# Predict a label for every lagged row (training and test portions alike).
y_pred3 = log_reg_model.predict(X_lagged)

# Evaluate the model.
# NOTE: this report covers all rows, including the ones the model was trained
# on, so it is more optimistic than a held-out score on (X_test, y_test).
print(classification_report(y_lagged, y_pred3))

#%%
# Write predictions to CSV with the same syntax as target.csv.
# y_pred3 has one entry per row of X_lagged, so it must be paired with
# frames_lagged (same length). Using frames_test, which only covers the test
# split, raised "ValueError: array length ... does not match index length ...".
predictions_df = pd.DataFrame({'frame': frames_lagged, 'value': y_pred3})
predictions_df.to_csv('predictions_all.csv', index=False)

              precision    recall  f1-score   support

           0       0.95      0.84      0.89     76127
           1       0.57      0.84      0.68     19158

    accuracy                           0.84     95285
   macro avg       0.76      0.84      0.79     95285
weighted avg       0.88      0.84      0.85     95285



ValueError: array length 95285 does not match index length 28586

In [None]:
# Debug check: number of aligned labels (compare with the prediction count).
y_lagged.shape

(95285,)

In [None]:
# Debug check: number of predictions, one per row of X_lagged.
y_pred3.shape

(95285,)

In [None]:
# Debug check: frame numbers of the test split only; this Series is shorter than
# y_pred3, which is why pairing the two in a single DataFrame fails.
frames_test

66699     72739
66700     72740
66701     72741
66702     72742
66703     72743
          ...  
95280    101320
95281    101321
95282    101322
95283    101323
95284    101324
Name: frame, Length: 28586, dtype: int32

In [None]:
# Pair every prediction with its frame number (both cover all lagged rows)
# and save the result in the same two-column layout as target.csv.
predictions_df = pd.DataFrame(
    {
        'frame': frames_lagged,
        'value': y_pred3,
    }
)
predictions_df.to_csv('predictions_all.csv', index=False)