In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load dataset
df = pd.read_csv("Dataset-PT.csv", skiprows=1)

# Drop specified columns
df = df.drop(['weather', 'temperature', 'day_of_week','time_of_day','Calendar_date','bus_id'], axis=1, errors='ignore')

# Separate features and target variable
x = df.drop(['arrival_delay'], axis=1)
y = df['arrival_delay']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create a Linear Regression model
model_LR = LinearRegression()

# Fit the model
model_LR.fit(X_train, y_train)

# Get the coefficients
coefficients = model_LR.coef_

# Create a DataFrame to view feature names and their corresponding coefficients
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients})

# Sort by absolute value of the coefficient to see most influential variables
coef_df = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)

# Print the sorted dataframe
print(coef_df[['Feature', 'Coefficient']])


                              Feature  Coefficient
5                 upstream_stop_delay     1.005975
2                          dwell_time     0.980795
4               scheduled_travel_time    -0.972245
10                    recurrent_delay     0.707684
22    factor(time_of_day)Morning_peak     0.493475
16            factor(temperature)Cold     0.428270
19         factor(day_of_week)weekday     0.357136
20         factor(day_of_week)weekend    -0.357136
18          factor(temperature)Normal    -0.351095
11          factor(weather)Light_Rain    -0.297595
23        factor(time_of_day)Off-peak    -0.296962
14                factor(weather)Rain     0.276791
9                   traffic_condition     0.241352
21  factor(time_of_day)Afternoon_peak    -0.196513
13              factor(weather)Normal    -0.162067
15                factor(weather)Snow     0.135484
17      factor(temperature)Extra_cold    -0.077175
1                       stop_sequence    -0.057181
12          factor(weather)Ligh

In [3]:
coef_df.to_csv('coef_df.csv', index=False)

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# Assuming df is the DataFrame with your features
X = df.drop(['arrival_delay'], axis=1)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data)


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


                              Feature        VIF
0                            route_id   0.000000
1                       stop_sequence   1.391733
2                          dwell_time   1.146614
3    travel_time_for_previous_section   1.348401
4               scheduled_travel_time   2.315171
5                 upstream_stop_delay   2.374923
6                        origin_delay   2.236902
7                  previous_bus_delay   1.147077
8           previous_trip_travel_time   5.558099
9                   traffic_condition  10.767179
10                    recurrent_delay   7.760660
11          factor(weather)Light_Rain        inf
12          factor(weather)Light_Snow        inf
13              factor(weather)Normal        inf
14                factor(weather)Rain        inf
15                factor(weather)Snow        inf
16            factor(temperature)Cold        inf
17      factor(temperature)Extra_cold        inf
18          factor(temperature)Normal        inf
19         factor(da

In [6]:
vif_data.to_csv('vif_data.csv', index=False)

In [15]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import joblib  # For saving and loading the model

# Load your dataset (assuming it's already cleaned and ready)
df = pd.read_csv("Dataset-PT.csv", skiprows=1)  # Replace with your actual dataset path

df = df.drop(['weather', 'temperature', 'day_of_week','time_of_day','Calendar_date','bus_id'], axis=1, errors='ignore')

# Pisahkan fitur dan target (arrival_delay)
X = df.drop(['arrival_delay'], axis=1)  # Fitur
y = df['arrival_delay']  # Target (arrival_delay)

# Membagi dataset menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Membuat dan melatih model Linear Regression
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)

# Simpan model yang telah dilatih menggunakan joblib
joblib.dump(model_LR, 'linear_regression_model.pkl')

# Fungsi untuk memuat model yang disimpan
def load_trained_model(filepath):
    model = joblib.load(filepath)
    return model

# Fungsi untuk mendapatkan input dari pengguna
def get_user_input():
    # Mengumpulkan input dari pengguna untuk setiap fitur
    user_data = {
        'dwell_time': float(input("Enter dwell_time: ")),
        'travel_time_for_previous_section': float(input("Enter travel_time_for_previous_section: ")),
        'scheduled_travel_time': float(input("Enter scheduled_travel_time: ")),
        'upstream_stop_delay': float(input("Enter upstream_stop_delay: ")),
        'origin_delay': float(input("Enter origin_delay: ")),
        'previous_bus_delay': float(input("Enter previous_bus_delay: ")),
        'previous_trip_travel_time': float(input("Enter previous_trip_travel_time: ")),
        'recurrent_delay': float(input("Enter recurrent_delay: ")),
        'route_id': int(input("Enter route_id: ")),
        'factor(weather)Light_Rain': int(input("Enter factor(weather)Light_Rain (0 or 1): ")),
        'factor(weather)Light_Snow': int(input("Enter factor(weather)Light_Snow (0 or 1): ")),
        'factor(weather)Normal': int(input("Enter factor(weather)Normal (0 or 1): ")),
        'factor(weather)Rain': int(input("Enter factor(weather)Rain (0 or 1): ")),
        'factor(weather)Snow': int(input("Enter factor(weather)Snow (0 or 1): ")),
        'factor(temperature)Cold': int(input("Enter factor(temperature)Cold (0 or 1): ")),
        'factor(temperature)Extra_cold': int(input("Enter factor(temperature)Extra_cold (0 or 1): ")),
        'factor(temperature)Normal': int(input("Enter factor(temperature)Normal (0 or 1): ")),
        'factor(day_of_week)weekday': int(input("Enter factor(day_of_week)weekday (0 or 1): ")),
        'factor(day_of_week)weekend': int(input("Enter factor(day_of_week)weekend (0 or 1): ")),
        'factor(time_of_day)Afternoon_peak': int(input("Enter factor(time_of_day)Afternoon_peak (0 or 1): ")),
        'factor(time_of_day)Morning_peak': int(input("Enter factor(time_of_day)Morning_peak (0 or 1): ")),
        'factor(time_of_day)Off-peak': int(input("Enter factor(time_of_day)Off-peak (0 or 1): ")),
    }

    # Mengonversi input pengguna ke dalam DataFrame
    return pd.DataFrame([user_data])

# Load the trained Linear Regression model
filepath = 'linear_regression_model.pkl'  # Model yang disimpan
model = load_trained_model(filepath)

# Mengambil input dari pengguna
input_data = get_user_input()

# Lakukan prediksi menggunakan model
prediction = model.predict(input_data)

# Menampilkan hasil prediksi
print(f"Predicted arrival_delay: {prediction[0]}")


FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

In [17]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import joblib  # For saving and loading the model

# Load your dataset (assuming it's already cleaned and ready)
df = pd.read_csv("Dataset-PT.csv", skiprows=1)  # Replace with your actual dataset path

df = df.drop(['weather', 'temperature', 'day_of_week','time_of_day','Calendar_date','bus_id'], axis=1, errors='ignore')

# Pisahkan fitur dan target (arrival_delay)
X = df.drop(['arrival_delay'], axis=1)  # Fitur
y = df['arrival_delay']  # Target (arrival_delay)

# Membagi dataset menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Membuat dan melatih model Linear Regression
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)

# Simpan model yang telah dilatih menggunakan joblib
joblib.dump(model_LR, 'linear_regression_model.pkl')

# Fungsi untuk memuat model yang disimpan
def load_trained_model(filepath):
    model = joblib.load(filepath)
    return model

# Fungsi untuk mendapatkan input dari pengguna
def get_user_input():
    # Mengumpulkan input dari pengguna untuk setiap fitur, termasuk fitur yang sebelumnya hilang
    user_data = {
        'dwell_time': float(input("Enter dwell_time: ")),
        'travel_time_for_previous_section': float(input("Enter travel_time_for_previous_section: ")),
        'scheduled_travel_time': float(input("Enter scheduled_travel_time: ")),
        'upstream_stop_delay': float(input("Enter upstream_stop_delay: ")),
        'origin_delay': float(input("Enter origin_delay: ")),
        'previous_bus_delay': float(input("Enter previous_bus_delay: ")),
        'previous_trip_travel_time': float(input("Enter previous_trip_travel_time: ")),
        'recurrent_delay': float(input("Enter recurrent_delay: ")),
        'route_id': int(input("Enter route_id: ")),
        'stop_sequence': int(input("Enter stop_sequence: ")),  # Fitur tambahan yang diperlukan
        'traffic_condition': int(input("Enter traffic_condition: ")),  # Fitur tambahan yang diperlukan
        'factor(weather)Light_Rain': int(input("Enter factor(weather)Light_Rain (0 or 1): ")),
        'factor(weather)Light_Snow': int(input("Enter factor(weather)Light_Snow (0 or 1): ")),
        'factor(weather)Normal': int(input("Enter factor(weather)Normal (0 or 1): ")),
        'factor(weather)Rain': int(input("Enter factor(weather)Rain (0 or 1): ")),
        'factor(weather)Snow': int(input("Enter factor(weather)Snow (0 or 1): ")),
        'factor(temperature)Cold': int(input("Enter factor(temperature)Cold (0 or 1): ")),
        'factor(temperature)Extra_cold': int(input("Enter factor(temperature)Extra_cold (0 or 1): ")),
        'factor(temperature)Normal': int(input("Enter factor(temperature)Normal (0 or 1): ")),
        'factor(day_of_week)weekday': int(input("Enter factor(day_of_week)weekday (0 or 1): ")),
        'factor(day_of_week)weekend': int(input("Enter factor(day_of_week)weekend (0 or 1): ")),
        'factor(time_of_day)Afternoon_peak': int(input("Enter factor(time_of_day)Afternoon_peak (0 or 1): ")),
        'factor(time_of_day)Morning_peak': int(input("Enter factor(time_of_day)Morning_peak (0 or 1): ")),
        'factor(time_of_day)Off-peak': int(input("Enter factor(time_of_day)Off-peak (0 or 1): ")),
    }

    # Mengonversi input pengguna ke dalam DataFrame
    return pd.DataFrame([user_data])

# Load the trained Linear Regression model
filepath = 'linear_regression_model.pkl'  # Model yang disimpan
model = load_trained_model(filepath)

# Mengambil input dari pengguna
input_data = get_user_input()

# Lakukan prediksi menggunakan model
prediction = model.predict(input_data)

# Menampilkan hasil prediksi
print(f"Predicted arrival_delay: {prediction[0]}")


ValueError: could not convert string to float: ''