In [6]:
import pandas as pd
from pygam import LinearGAM, s, f
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [7]:
df = pd.read_csv('assets/output/featureSelectionResult.csv')

In [8]:
X = df.drop(['date', 'visitor'], axis=1)
y = df['visitor']

In [9]:
# Data Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Add Constant Column
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# Create and learn GAM models
# Returns: Instance of the LinearGAM class
gam = LinearGAM()
gam.fit(X_train, y_train)

# Prediction of test data
y_pred = gam.predict(X_test)

# Evaluation: Calculating the Mean Square Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Evaluation: Calculating the coefficient of determination (R^2)
r2_gam = r2_score(y_test, y_pred)
print(f"R^2 Score: {r2_gam}")

Mean Squared Error: 26534776.152322445
R^2 Score: 0.39124391278347137


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# data load
df = pd.read_csv('assets/output/featureSelectionResult.csv')

# Features and Target Split
X = df.drop(['visitor','date'], axis=1)
y = df['visitor']

# Add polynomial characteristics
# Creates an instance of the PolynomialFeatures class.
# Parameters - degree: The degree of the polynomial features to be generated. It determines the maximum degree of monomials in the output array.
poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)

# Data Split
# Parameters:
# - X_poly: The input features (in polynomial form) used for splitting into train and test sets.
# - y: The target variable used for splitting into train and test sets.
# - test_size: The proportion of the dataset to include in the test split. It should be a float value between 0.0 and 1.0.
# - random_state: Determines the random seed for the shuffling of the dataset before splitting. It ensures reproducibility of the split.
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Model Learning
model = LinearRegression()
model.fit(X_train, y_train)

# Prediction of test data
y_pred = model.predict(X_test)

# Evaluation: Calculating the Mean Square Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Evaluation: Calculating the coefficient of determination (R^2)
r2_lin = r2_score(X_test, y_test)
print(f"R^2 Score: {r2_lin}")

score = model.score(X_test, y_test)
print(f"score: {score}")

Mean Squared Error: 313695222.7262245


ValueError: y_true and y_pred have different number of output (135751!=1)

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Data load
data = pd.read_csv('assets/output/featureSelectionResult.csv')

# Create a k-fold object that divides data into k subsets
k = 5  # Set the value of k in k-fold cross-validation

# Parameters:
# - n_splits: The number of folds (subsets) to split the data into. It determines the number of times the cross-validation process will be performed.
# - shuffle: Determines whether to shuffle the data before splitting it into folds. If True, the data will be shuffled randomly.
# - random_state: Determines the random seed for shuffling the data if `shuffle` is set to True. It ensures reproducibility of the shuffling.
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Features and Target Split
X = df.drop(['date', 'visitor'], axis=1)
y = df['visitor']

# Add polynomial characteristics
# Creates an instance of the PolynomialFeatures class.
# Parameters - degree: The degree of the polynomial features to be generated. It determines the maximum degree of monomials in the output array.
poly = PolynomialFeatures(degree=7)
X_poly = poly.fit_transform(X)

# Data Split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)
model = LinearRegression()

# Perform k-fold cross-validation
mse_scores = []
score_arr = []

for train_index, test_index in kf.split(X_poly):
    # Split training set, test set
    X_train, X_test = X_poly[train_index], X_poly[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)

    # Predicting test Sets
    y_pred = model.predict(X_test)

    # Calculate the mean square error of the test set
    # Parameters:
    # - y_test: The true target values from the test set.
    # - y_pred: The predicted target values corresponding to the test set.

    # Returns:
    # The mean squared error (MSE) between the true target values and the predicted target values.
    # MSE is a measure of the average squared difference between the predicted and true values, providing an overall assessment of the model's performance.
    mse = mean_squared_error(y_test, y_pred)
    score = model.score(X_test, y_test)

    # for caculating mean score, appending
    score_arr.append(score)
    mse_scores.append(mse)

# Calculate the mean MSE of k-fold cross-validation, and score
mean_mse = np.mean(mse_scores)
mean_score = np.mean(score_arr)

print(mean_score)
print(mean_mse)

MemoryError: Unable to allocate 1.57 PiB for an array with shape (2468, 89356415775) and data type float64

In [12]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Loading data
df = pd.read_csv('assets/output/featureSelectionResult.csv')

# Methods for finding the best results by using K-fold in the polynomial regression, changing the degree and K
# Parameters:
    # X: The input features used for the regression analysis.
    # y: The target variable.
    # min_degree: The minimum degree of polynomial features to consider.
    # max_degree: The maximum degree of polynomial features to consider.
    # min_k: The minimum number of folds (subsets) for cross-validation.
    # max_k: The maximum number of folds (subsets) for cross-validation.
    # random_state: Determines the random seed for shuffling the data and cross-validation. It ensures reproducibility of the results.
# Returns:
    # best_degree: The degree value that yielded the best score.
    # best_score: The highest score obtained among different degree and k combinations.
    # best_k: The k value that yielded the best score.
def find_best_degree(X, y, min_degree, max_degree, min_k, max_k, random_state=42):
    results = []
    best_degree = None
    best_score = -np.inf
    best_k = None

    for degree in range(min_degree, max_degree+1):
        # Add polynomial characteristics
        poly = PolynomialFeatures(degree=degree)
        X_poly = poly.fit_transform(X)

        for k in range(min_k, max_k+1):
            model = LinearRegression()

            # Perform k-fold cross-validation
            # Parameters:
                # - n_splits: The number of folds (subsets) to split the data into. It determines the number of times the cross-validation process will be performed.
                # - shuffle: Determines whether to shuffle the data before splitting it into folds. If True, the data will be shuffled randomly.
                # - random_state: Determines the random seed for shuffling the data if `shuffle` is set to True. It ensures reproducibility of the shuffling.
            kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
            score_arr = []
            mse_scores = []

            for train_index, test_index in kf.split(X_poly):
                # Split training set, test set
                X_train, X_test = X_poly[train_index], X_poly[test_index]
                y_train, y_test = y[train_index], y[test_index]

                model.fit(X_train, y_train)

                # Calculating the score for the test set
                score = model.score(X_test, y_test)
                score_arr.append(score)

                # Calculating the mean squared error (MSE) for the test set
                mse = mean_squared_error(y_test, y_pred)
                mse_scores.append(mse)

                print(f"Degree: {degree}, K: {k}, MSE: {mse}")

            # Calculating the Average Score
            mean_score = np.mean(score_arr)
            results.append((degree, k, mean_score))
            print(f"Degree: {degree}, K: {k}, Score: {mean_score}")

            # Update if the current degree and k values are higher than the highest score
            if mean_score > best_score:
                best_degree = degree
                best_score = mean_score
                best_k = k

    return best_degree, best_score, best_k, mse_scores

X = df.drop(['date', 'visitor'], axis=1)
y = df['visitor']

# Define the minimum and maximum degrees to consider.
min_degree = 2
max_degree = 6
min_k = 2
max_k = 10

# This function is expected to return the best degree, score, and k value based on the provided ranges.
best_degree, best_score, best_k = find_best_degree(X, y, min_degree, max_degree, min_k, max_k)

print(f"Best degree: {best_degree}")
print(f"Best score: {best_score}")
print(f"Best k: {best_k}")
print(f"Average MSE scores: {np.mean(mse_scores)}")

ValueError: Found input variables with inconsistent numbers of samples: [1234, 494]