In [1]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [2]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('train_data-Dell.csv')
df_test = pd.read_csv('test_data-Dell.csv')


labels = df['time_to_tca']
labels_test = df_test['time_to_tca']

# the labels as a numpy array: convinience for following comands
labels_array = labels.values
labels_test_array = labels_test.values


df = df.drop(columns=['c_object_type'])  # Drop 'c_object_type' column from df
df_test = df_test.drop(columns=['c_object_type'])  # Drop 'c_object_type' column from df_test



In [3]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()


df_features_scaled = scaler.fit_transform(df.drop(columns=['time_to_tca']))
df_test_features_scaled = scaler.transform(df_test.drop(columns=['time_to_tca']))

# contain both the scaled features and the original target variable
df_scaled = pd.concat([pd.DataFrame(df_features_scaled, columns=df.columns[:-1]), df[['time_to_tca']]], axis=1)
df_test_scaled = pd.concat([pd.DataFrame(df_test_features_scaled, columns=df_test.columns[:-1]), df_test[['time_to_tca']]], axis=1)


In [4]:
# Create an imputer object with a strategy (e.g., mean, median, mode) to handle missing values
imputer = SimpleImputer(strategy='mean')

# Apply imputation to the DataFrames
train_data_imputed = imputer.fit_transform(df_scaled)
test_data_imputed = imputer.transform(df_test_scaled)

# Create and train the Ridge regression model
ridge_model = Ridge(alpha=0.1)  
ridge_model.fit(train_data_imputed, labels_array)  #Training on the entire training set

# Predict labels on the test set
predictions = ridge_model.predict(test_data_imputed)

# Evaluate the model performance
mse = mean_squared_error(labels_test_array, predictions)
r2 = r2_score(labels_test_array, predictions)

print("Mean Squared Error:", mse)
print("r^2 Score:", r2)


Mean Squared Error: 9.037780696944162e-13
r^2 Score: 0.9999999999995572


In [5]:
# Extract features and labels for the training set
X_train = train_data_imputed[:, train_data_imputed.shape[1] != 1]  # Excludes the second column
y_train = train_data_imputed[:, 1]  # Labels for training data (second column)

X_test = test_data_imputed[:, test_data_imputed.shape[1] != 1]  # Excludes the second column
y_test = test_data_imputed[:, 1]  # Labels for testing data (second column)


In [6]:
# Reshape X_train and X_test if necessary
X_train = X_train.squeeze(axis=1)  # Remove the extra dimension if present
X_test = X_test.squeeze(axis=1)  

In [7]:
# Check the shape of X_train and y_train
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (70108, 102)
y_train shape: (70108,)


In [8]:
# Create an imputer object with a strategy (e.g., mean, median, mode) to handle missing values
imputer = SimpleImputer(strategy='mean')

# Apply imputation to the DataFrames
train_data_imputed = imputer.fit_transform(df_scaled)
test_data_imputed = imputer.transform(df_test_scaled)

# Create and train the Ridge regression model
ridge_model = Ridge(alpha=0.1)  # You can adjust the regularization parameter 'alpha' as needed
ridge_model.fit(X_train, y_train)

# Predict labels on the test set
predictions = ridge_model.predict(X_test)

# Evaluate the model performance
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Mean Squared Error:", mse)
print("r^2 Score:", r2)

Mean Squared Error: 1.0284506462447139e-10
r^2 Score: 0.9999999984334017


In [9]:
ridge_model.score(X_train, y_train)

0.9999999984784927

In [10]:
from sklearn.model_selection import cross_val_score
import numpy as np


In [11]:
# Perform cross-validation
cv_scores = cross_val_score(ridge_model, train_data_imputed, labels_array, cv=5, scoring='neg_mean_squared_error')

# Convert scores to positive values and calculate mean and standard deviation
mse_scores = -cv_scores
mean_mse = mse_scores.mean()
std_mse = mse_scores.std()

print("Cross-Validation Mean Squared Error:", mean_mse)
print("Cross-Validation MSE Standard Deviation:", std_mse)

alphas = np.logspace(-3, 3, 7)  #alphas from 10^-3 to 10^3

# Iterate over different alpha values and perform cross-validation
for alpha in alphas:
    ridge_model = Ridge(alpha=alpha)
    cv_scores = cross_val_score(ridge_model, train_data_imputed, labels_array, cv=5, scoring='neg_mean_squared_error')
    mse_scores = -cv_scores
    mean_mse = mse_scores.mean()
    std_mse = mse_scores.std()
    print(f"Alpha: {alpha}, Mean Squared Error: {mean_mse}, Std Dev: {std_mse}")


Cross-Validation Mean Squared Error: 1.5451087040992855e-12
Cross-Validation MSE Standard Deviation: 3.4603210334699674e-14
Alpha: 0.001, Mean Squared Error: 1.5527770572522024e-16, Std Dev: 3.567487884774278e-18
Alpha: 0.01, Mean Squared Error: 1.5495500422021503e-14, Std Dev: 3.48591438308595e-16
Alpha: 0.09999999999999999, Mean Squared Error: 1.5451087040992855e-12, Std Dev: 3.4603210334699674e-14
Alpha: 1.0, Mean Squared Error: 1.513773086244638e-10, Std Dev: 3.518247612462456e-12
Alpha: 10.0, Mean Squared Error: 1.3819757108355028e-08, Std Dev: 3.1620644794457213e-10
Alpha: 100.0, Mean Squared Error: 1.2133943313429706e-06, Std Dev: 2.614908472210432e-08
Alpha: 1000.0, Mean Squared Error: 0.00010184569616806375, Std Dev: 1.970721678551218e-06


numbers indicate that the Ridge regression model performs exceptionally well, with very low mean squared errors and minimal variability in performance across different test sets or regularization strengths.