In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from datetime import datetime

# Sample data as a DataFrame
# read_excel already returns a DataFrame, so no re-wrapping is needed.
dataset = pd.read_excel('DAIICT Hackathon Data.xlsx', sheet_name='Past In and Out Container Data')

# Work on an independent copy: the following cells overwrite columns of `df`
# in place, and a real copy guarantees `dataset` keeps the values exactly as
# they were read from the workbook (pd.DataFrame(dataset) does not copy data).
df = dataset.copy()

In [13]:


# Convert datetime strings to datetime objects
# Accepted spellings: day-month-2-digit-year, with or without fractional seconds.
TIMESTAMP_FORMATS = ('%d-%m-%y %H:%M:%S', '%d-%m-%y %H:%M:%S.%f')


def _parse_timestamps(raw):
    """Parse a column of timestamps, choosing the format value by value.

    Every value is tried against each entry of TIMESTAMP_FORMATS, so a column
    that mixes timestamps with and without fractional seconds is parsed
    completely instead of being rejected as a whole.

    Parameters
    ----------
    raw : pandas.Series
        Column holding the timestamps to parse.

    Returns
    -------
    pandas.Series
        Parsed datetimes; missing inputs stay missing (NaT).

    Raises
    ------
    ValueError
        If a non-missing value matches none of the accepted formats.
    """
    parsed = None
    for fmt in TIMESTAMP_FORMATS:
        # errors='coerce' turns non-matching values into NaT instead of raising,
        # which lets the next format fill exactly the gaps left by this one.
        attempt = pd.to_datetime(raw, format=fmt, errors='coerce')
        parsed = attempt if parsed is None else parsed.fillna(attempt)

    # A value that was present but is still NaT matched no format at all.
    unparsed = raw.notna() & parsed.isna()
    if unparsed.any():
        raise ValueError(
            f"{int(unparsed.sum())} value(s) in column {raw.name!r} match none of "
            f"{TIMESTAMP_FORMATS}, e.g. {raw[unparsed].iloc[0]!r}"
        )
    return parsed


df['IN_TIME'] = _parse_timestamps(df['IN_TIME'])
df['OUT_TIME'] = _parse_timestamps(df['OUT_TIME'])


In [14]:
# Convert STATUS to numerical values
# Only encode while the column still holds the raw 'E'/'L' labels: mapping an
# already-encoded column again would turn every value into NaN, so this guard
# keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['STATUS']):
    # Labels other than 'E' and 'L' become NaN.
    df['STATUS'] = df['STATUS'].map({'E': 0, 'L': 1})

# Calculate the target variable (time difference)
# OUT_TIME minus IN_TIME, expressed in seconds.
df['TIME_DIFF'] = (df['OUT_TIME'] - df['IN_TIME']).dt.total_seconds()

# Separate input features and target variable
# Rows without a target can be neither fitted nor scored, so leave them out
# up front; this keeps X and y aligned row for row.
labelled = df[df['TIME_DIFF'].notna()]
y = labelled['TIME_DIFF'].values

# Convert datetime to Unix timestamp (whole seconds). Going through a timedelta
# works for any datetime resolution and keeps a missing IN_TIME as NaN, whereas
# casting NaT straight to int64 yields a huge negative number.
# Assumes IN_TIME is timezone-naive, as produced by the parsing cell.
seconds_since_epoch = (labelled['IN_TIME'] - pd.Timestamp('1970-01-01')).dt.total_seconds()
X_datetime = np.floor(seconds_since_epoch).values
# Float dtype so that unmapped STATUS values can be represented as NaN.
X_categorical = labelled['STATUS'].values.astype(np.float64)

# Reshape input features if necessary
X_datetime = X_datetime.reshape(-1, 1)
X_categorical = X_categorical.reshape(-1, 1)

# Concatenate the input features horizontally
X = np.hstack((X_datetime, X_categorical))

# Handle NaN values
# Replace each NaN with the mean of its own column. A single mean over the
# whole matrix would be dominated by the ~1e9 timestamps and would fill a
# missing 0/1 STATUS with a meaningless value. A column with no valid entry
# at all falls back to 0.
# NOTE: the means are computed over all rows, including those that later end
# up in the test split.
column_means = np.nan_to_num(np.nanmean(X, axis=0), nan=0.0)
X = np.where(np.isnan(X), column_means, X)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model on the training portion
# (fit() returns the estimator itself, so `model` is the trained regressor)
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows: predict, then compare against the true targets
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Report the error (units are the square of the target's units)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 78022189005.652
