In [13]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_classif

# Load dataset
df = pd.read_csv("/Users/peterripperger/Desktop/Startup/data/Initial Products/shopping_trends.csv")

# Automatically detect numeric and non-numeric columns
numeric_X = df.select_dtypes(include=[np.number])

# 1. Calculate Multicollinearity (VIF)
if not numeric_X.empty:
    vif_data = pd.DataFrame()
    vif_data['Feature'] = numeric_X.columns
    vif_data['VIF'] = [variance_inflation_factor(numeric_X.values, i) for i in range(numeric_X.shape[1])]
else:
    vif_data = pd.DataFrame(columns=['Feature', 'VIF'])

# 2. Calculate Coefficient of Variation (CV)
cv_data = df.std(numeric_only=True) / df.mean(numeric_only=True)

# 3. Calculate Entropy
def calculate_entropy(series):
    proportions = series.value_counts(normalize=True)
    return -np.sum(proportions * np.log2(proportions))

entropy_data = df.apply(calculate_entropy)

# 4. Feature Relevance (Mutual Information)
# For simplicity in a fully automated workflow, mutual information considers all columns
if not numeric_X.empty:
    mi_scores = mutual_info_classif(numeric_X, pd.Series(np.zeros(len(numeric_X))), discrete_features=False)
    mi_data = pd.Series(mi_scores, index=numeric_X.columns)
else:
    mi_data = pd.Series(dtype=float)

# 5. Gradient-Based Scoring
def score_metric(value, lower_bound, upper_bound):
    if value <= lower_bound:
        return 0
    elif value >= upper_bound:
        return 1
    elif lower_bound < value < upper_bound:
        return 1
    else:
        return (value - lower_bound) / (upper_bound - lower_bound)

# Calculate individual scores
if not vif_data.empty:
    vif_scores = vif_data['VIF'].apply(lambda x: score_metric(x, 1, 10))  # VIF range: 1 to 5
else:
    vif_scores = pd.Series(dtype=float)

cv_scores = cv_data.apply(lambda x: score_metric(x, 0.05, 2))  # CV range: 0.1 to 1
entropy_scores = entropy_data.apply(lambda x: score_metric(x, 0.5, 1))  # Entropy range: 0.7 to 1

# Combine metrics into weighted score
vif_weight = 0.4
cv_weight = 0.2
entropy_weight = 0.4

weighted_scores = pd.Series(0, index=df.columns)
if not vif_scores.empty:
    weighted_scores.update(vif_scores * vif_weight)
if not cv_scores.empty:
    weighted_scores.update(cv_scores * cv_weight)
if not entropy_scores.empty:
    weighted_scores.update(entropy_scores * entropy_weight)

# Final dataset quality score
final_score = weighted_scores.mean()

# Print the final dataset quality score
print(f"Final Dataset Quality Score: {final_score:.3f}")


Final Dataset Quality Score: 0.400


In [15]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_classif

# Load dataset
df = pd.read_csv("/Users/peterripperger/Desktop/Startup/data/Initial Products/car_crash_augmented.csv")

# Automatically detect numeric and non-numeric columns
numeric_X = df.select_dtypes(include=[np.number])

# 1. Calculate Multicollinearity (VIF)
if not numeric_X.empty:
    vif_data = pd.DataFrame()
    vif_data['Feature'] = numeric_X.columns
    vif_data['VIF'] = [variance_inflation_factor(numeric_X.values, i) for i in range(numeric_X.shape[1])]
else:
    vif_data = pd.DataFrame(columns=['Feature', 'VIF'])

# 2. Calculate Coefficient of Variation (CV)
cv_data = df.std(numeric_only=True) / df.mean(numeric_only=True)

# 3. Calculate Entropy
def calculate_entropy(series):
    proportions = series.value_counts(normalize=True)
    return -np.sum(proportions * np.log2(proportions))

entropy_data = df.apply(calculate_entropy)

# 4. Scoring Functions with Expanded Ranges
def score_vif(value):
    return max(0, min(1, 1 - (value - 5) / 10)) if value > 1 else 1  # Penalize VIF > 15 gradually

def score_cv(value):
    return max(0, min(1, 1 - abs(value - 1) / 5))  # Reward CV in range [0.01, 5]

def score_entropy(value):
    return max(0, min(1, (value - 0.3) / 0.7))  # Reward Entropy in range [0.3, 1]

# Calculate individual scores
if not vif_data.empty:
    vif_scores = vif_data['VIF'].apply(score_vif)
else:
    vif_scores = pd.Series(dtype=float)

cv_scores = cv_data.apply(score_cv)
entropy_scores = entropy_data.apply(score_entropy)

# Combine metrics into weighted score
vif_weight = 0.2
cv_weight = 0.3
entropy_weight = 0.5

weighted_scores = pd.Series(0, index=df.columns)
if not vif_scores.empty:
    weighted_scores.update(vif_scores * vif_weight)
if not cv_scores.empty:
    weighted_scores.update(cv_scores * cv_weight)
if not entropy_scores.empty:
    weighted_scores.update(entropy_scores * entropy_weight)

# Final dataset quality score
final_score = weighted_scores.mean()

# Debug outputs
print(f"VIF Scores:\n{vif_scores}")
print(f"CV Scores:\n{cv_scores}")
print(f"Entropy Scores:\n{entropy_scores}")
print(f"Weighted Scores:\n{weighted_scores}")

# Print the final dataset quality score
print(f"Final Dataset Quality Score: {final_score:.3f}")

VIF Scores:
0    0.00000
1    0.00000
2    0.00000
3    0.47234
4    0.00000
5    1.00000
Name: VIF, dtype: float64
CV Scores:
Severity                     0.841397
Noise Aug Temperature(F)     0.864445
Noise Aug Wind_Chill(F)      0.872783
Noise Aug Humidity(%)        0.872603
Noise Aug Pressure(in)       0.809820
Noise Aug Wind_Speed(mph)    0.947251
dtype: float64
Entropy Scores:
Severity                     0.648961
Noise Aug Temperature(F)     1.000000
Noise Aug Wind_Chill(F)      1.000000
Noise Aug Humidity(%)        1.000000
Noise Aug Pressure(in)       1.000000
Noise Aug Wind_Speed(mph)    1.000000
Aug Sunrise_Sunset           0.898436
Aug Civil_Twilight           0.795443
Aug Nautical_Twilight        0.703225
Aug Astronomical_Twilight    0.565514
dtype: float64
Weighted Scores:
Severity                     0.324480
Noise Aug Temperature(F)     0.500000
Noise Aug Wind_Chill(F)      0.500000
Noise Aug Humidity(%)        0.500000
Noise Aug Pressure(in)       0.500000
Noise Aug Wi

In [16]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_classif

# Load dataset
df = pd.read_csv("/Users/peterripperger/Desktop/Startup/data/Initial Products/car_crash_augmented.csv")

# Ask user for the dependent variable (DV) column to drop
dv_column = input("Enter the name of the dependent variable (DV) column to drop: ").strip()

# Ensure the DV column exists in the dataset
if dv_column not in df.columns:
    raise ValueError(f"Column '{dv_column}' not found in the dataset.")

# Drop the DV column and separate features
X = df.drop(columns=[dv_column])

# Automatically detect numeric columns
numeric_X = X.select_dtypes(include=[np.number])

# 1. Calculate Multicollinearity (VIF)
if not numeric_X.empty:
    vif_data = pd.DataFrame()
    vif_data['Feature'] = numeric_X.columns
    vif_data['VIF'] = [variance_inflation_factor(numeric_X.values, i) for i in range(numeric_X.shape[1])]
else:
    vif_data = pd.DataFrame(columns=['Feature', 'VIF'])

# 2. Calculate Coefficient of Variation (CV)
cv_data = X.std(numeric_only=True) / X.mean(numeric_only=True)

# 3. Calculate Entropy
def calculate_entropy(series):
    proportions = series.value_counts(normalize=True)
    return -np.sum(proportions * np.log2(proportions))

entropy_data = X.apply(calculate_entropy)

# 4. Scoring Functions with Expanded Ranges
def score_vif(value):
    return max(0, min(1, 1 - (value - 5) / 10)) if value > 1 else 1  # Penalize VIF > 15 gradually

def score_cv(value):
    return max(0, min(1, 1 - abs(value - 1) / 5))  # Reward CV in range [0.01, 5]

def score_entropy(value):
    return max(0, min(1, (value - 0.3) / 0.7))  # Reward Entropy in range [0.3, 1]

# Calculate individual scores
if not vif_data.empty:
    vif_scores = vif_data['VIF'].apply(score_vif)
else:
    vif_scores = pd.Series(dtype=float)

cv_scores = cv_data.apply(score_cv)
entropy_scores = entropy_data.apply(score_entropy)

# Combine metrics into weighted score
vif_weight = 0.2
cv_weight = 0.3
entropy_weight = 0.5

weighted_scores = pd.Series(0, index=X.columns)
if not vif_scores.empty:
    weighted_scores.update(vif_scores * vif_weight)
if not cv_scores.empty:
    weighted_scores.update(cv_scores * cv_weight)
if not entropy_scores.empty:
    weighted_scores.update(entropy_scores * entropy_weight)

# Final dataset quality score
final_score = weighted_scores.mean()

# Debug outputs
print(f"VIF Scores:\n{vif_scores}")
print(f"CV Scores:\n{cv_scores}")
print(f"Entropy Scores:\n{entropy_scores}")
print(f"Weighted Scores:\n{weighted_scores}")

# Print the final dataset quality score
print(f"Final Dataset Quality Score (excluding '{dv_column}'): {final_score:.3f}")

Enter the name of the dependent variable (DV) column to drop:  Severity


VIF Scores:
0    0.000000
1    0.000000
2    0.472342
3    0.000000
4    1.000000
Name: VIF, dtype: float64
CV Scores:
Noise Aug Temperature(F)     0.864445
Noise Aug Wind_Chill(F)      0.872783
Noise Aug Humidity(%)        0.872603
Noise Aug Pressure(in)       0.809820
Noise Aug Wind_Speed(mph)    0.947251
dtype: float64
Entropy Scores:
Noise Aug Temperature(F)     1.000000
Noise Aug Wind_Chill(F)      1.000000
Noise Aug Humidity(%)        1.000000
Noise Aug Pressure(in)       1.000000
Noise Aug Wind_Speed(mph)    1.000000
Aug Sunrise_Sunset           0.898436
Aug Civil_Twilight           0.795443
Aug Nautical_Twilight        0.703225
Aug Astronomical_Twilight    0.565514
dtype: float64
Weighted Scores:
Noise Aug Temperature(F)     0.500000
Noise Aug Wind_Chill(F)      0.500000
Noise Aug Humidity(%)        0.500000
Noise Aug Pressure(in)       0.500000
Noise Aug Wind_Speed(mph)    0.500000
Aug Sunrise_Sunset           0.449218
Aug Civil_Twilight           0.397721
Aug Nautical_Twiligh

In [21]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_classif

# Load dataset
df = pd.read_csv("/Users/peterripperger/Desktop/Startup/data/Initial Products/Augmented_ECG.csv")

# Ask user for the dependent variable (DV) column to drop
dv_column = input("Enter the name of the dependent variable (DV) column to drop: ").strip()

# Ensure the DV column exists in the dataset
if dv_column not in df.columns:
    raise ValueError(f"Column '{dv_column}' not found in the dataset.")

# Drop the DV column and separate features
X = df.drop(columns=[dv_column])

# Automatically detect numeric columns
numeric_X = X.select_dtypes(include=[np.number])

# 1. Calculate Multicollinearity (VIF)
if not numeric_X.empty:
    vif_data = pd.DataFrame()
    vif_data['Feature'] = numeric_X.columns
    vif_data['VIF'] = [variance_inflation_factor(numeric_X.values, i) for i in range(numeric_X.shape[1])]
else:
    vif_data = pd.DataFrame(columns=['Feature', 'VIF'])

# 2. Calculate Coefficient of Variation (CV)
cv_data = X.std(numeric_only=True) / X.mean(numeric_only=True)

# 3. Calculate Entropy
def calculate_entropy(series):
    proportions = series.value_counts(normalize=True)
    return -np.sum(proportions * np.log2(proportions))

entropy_data = X.apply(calculate_entropy)

# 4. Scoring Functions with Desired Ranges
def score_vif(value):
    if 1 <= value <= 5:
        return 1.0  # Full score within range
    return max(0, min(1, 1 - abs(value - 5) / 10))  # Gradually reduce outside range

def score_cv(value):
    if 0.1 <= value <= 1:
        return 1.0  # Full score within range
    return max(0, min(1, 1 - abs(value - 1) / 5))  # Gradually reduce outside range

def score_entropy(value):
    if 0.7 <= value <= 1:
        return 1.0  # Full score within range
    return max(0, min(1, (value - 0.3) / 0.7))  # Gradually reduce outside range

# Calculate individual scores
if not vif_data.empty:
    vif_scores = vif_data['VIF'].apply(score_vif)
else:
    vif_scores = pd.Series(dtype=float)

cv_scores = cv_data.apply(score_cv)
entropy_scores = entropy_data.apply(score_entropy)

# Combine metrics into weighted score
vif_weight = 0.2
cv_weight = 0.3
entropy_weight = 0.5

weighted_scores = pd.Series(0, index=X.columns)
if not vif_scores.empty:
    weighted_scores.update(vif_scores * vif_weight)
if not cv_scores.empty:
    weighted_scores.update(cv_scores * cv_weight)
if not entropy_scores.empty:
    weighted_scores.update(entropy_scores * entropy_weight)

# Final dataset quality score
final_score = weighted_scores.mean()

# Debug outputs
print(f"VIF Scores:\n{vif_scores}")
print(f"CV Scores:\n{cv_scores}")
print(f"Entropy Scores:\n{entropy_scores}")
print(f"Weighted Scores:\n{weighted_scores}")

# Print the final dataset quality score
print(f"Final Dataset Quality Score (excluding '{dv_column}'): {final_score:.3f}")


Enter the name of the dependent variable (DV) column to drop:  187


VIF Scores:
0      0.000000
1      0.000000
2      0.000000
3      0.000000
4      0.000000
         ...   
182    1.000000
183    0.562549
184    0.680163
185    1.000000
186    1.000000
Name: VIF, Length: 187, dtype: float64
CV Scores:
0      1.000000
1      1.000000
2      1.000000
3      1.000000
4      1.000000
         ...   
182    0.000000
183    0.000000
184    0.000000
185    0.130927
186    0.466623
Length: 187, dtype: float64
Entropy Scores:
0      1
1      1
2      1
3      1
4      1
      ..
182    1
183    1
184    1
185    1
186    1
Length: 187, dtype: int64
Weighted Scores:
0      0.5
1      0.5
2      0.5
3      0.5
4      0.5
      ... 
182    0.5
183    0.5
184    0.5
185    0.5
186    0.5
Length: 187, dtype: float64
Final Dataset Quality Score (excluding '187'): 0.500
