<a href="https://colab.research.google.com/github/jpan2575/FWE458_Final/blob/main/FWE458_FinalCompiledCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re

# Mount Google Drive so files stored there are reachable from the Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Project folder on the mounted drive
filedir = '/content/drive/MyDrive/FWE458/'

In [None]:
#Compiling Datasets
import pandas as pd

# --- Configuration ---
# Base dataset produced by the previous compilation step; every new annual table is
# left-merged onto it using the ('Region', 'Year') key pair.
base_file = "fully_combined_dataset.csv"

# Name of the value column to create -> annual-by-province CSV that supplies it.
new_annual_files_info = {
    "RealEstateDev": "realestaste development.csv",
    "WasteDisposalVol": "wastedisposalvolume.csv",
    "NumWasteFactories": "numberofwastefactorie.csv",
    "MajorEnergyConsumption": "consumption of major energy.csv",
    "IndustryValue_Overall": "AnnualbyProvince industryvalue.csv", # To distinguish from previous 'IndustryValue'
    "IndustrialEnterprise": "AnnualbyProvinceindustrial enterprise.csv"
}
header_row_annual = 3 # Passed to read_csv(header=...): zero-based row that holds the column names

# Footer strings that appear in the region column of the source files and are not regions.
METADATA_ROWS = ['Data Sources：National Bureau of Statistics',
                 'Data Source: National Bureau of Statistics',
                 'Data Sources: National Bureau of Statistics',
                 'Data Source: National Bureau of Statistics of China']


def melt_annual_table(annual_df_raw, value_name, source_label="input table"):
    """Reshape a wide region-by-year table into long (Region, Year, value) rows.

    Args:
        annual_df_raw: DataFrame whose first column identifies the region and whose
            remaining column labels are years.
        value_name: Name given to the melted value column.
        source_label: Text used to identify the table in printed messages.

    Returns:
        A DataFrame with columns ['Region', 'Year', value_name] and at most one row per
        (Region, Year) pair, or None when nothing is left after dropping all-NaN columns.
    """
    wide = annual_df_raw.dropna(axis=1, how='all') # Drop fully empty year columns
    if wide.empty or wide.shape[1] == 0:
        print(f"Warning: {source_label} is empty or has no columns after dropping all-NaN columns. Skipping.")
        return None

    # The region identifier is taken to be the first column, whatever it is called.
    first_col = wide.columns[0]
    if first_col != 'Region':
        print(f"Info: First column in {source_label} is '{first_col}'. Renaming to 'Region'.")
        wide = wide.rename(columns={first_col: 'Region'})

    long_df = pd.melt(wide, id_vars=['Region'], var_name='Year', value_name=value_name)

    # Column labels that are not years (e.g. notes) become NaN and are dropped with
    # rows lacking a region. The explicit copies avoid assigning into a filtered view.
    long_df['Year'] = pd.to_numeric(long_df['Year'], errors='coerce')
    long_df = long_df.dropna(subset=['Year', 'Region']).copy()
    long_df['Year'] = long_df['Year'].astype(int)
    long_df['Region'] = long_df['Region'].astype(str).str.strip()
    long_df = long_df[~long_df['Region'].isin(METADATA_ROWS)].copy()

    # Non-numeric cells become NaN rather than aborting the whole file.
    long_df[value_name] = pd.to_numeric(long_df[value_name], errors='coerce')

    # A repeated (Region, Year) key would multiply rows of the base frame in a left
    # merge, so keep only the first occurrence and say so.
    duplicate_keys = long_df.duplicated(subset=['Region', 'Year'], keep='first')
    if duplicate_keys.any():
        print(f"Warning: {source_label} has {int(duplicate_keys.sum())} duplicate (Region, Year) rows; keeping the first of each.")
        long_df = long_df[~duplicate_keys]

    return long_df.reset_index(drop=True)


# --- Load the current base dataset ---
try:
    current_df = pd.read_csv(base_file)
    print(f"Successfully loaded base dataset: {base_file}")
    print(f"Base dataset shape: {current_df.shape}")
    print(f"Base dataset columns: {current_df.columns.tolist()}")

    # The merge keys must exist before anything else is attempted.
    for key_col in ('Region', 'Year'):
        if key_col not in current_df.columns:
            raise ValueError(f"Critical Error: '{key_col}' column missing in the base_df.")

    # Normalize the keys the same way melt_annual_table() does so they compare equal.
    current_df['Region'] = current_df['Region'].astype(str).str.strip()
    current_df['Year'] = current_df['Year'].astype(int)

    # --- Process and merge each new annual file ---
    for value_col_name_prefix, file_name in new_annual_files_info.items():
        print(f"\nProcessing new file: {file_name} (to be prefixed as {value_col_name_prefix})")

        # Skip files whose target column already exists; merging would otherwise
        # produce suffixed duplicates (_x, _y).
        if value_col_name_prefix in current_df.columns:
            print(f"Warning: Column '{value_col_name_prefix}' already exists in the dataframe. Skipping merge for this file to avoid duplication or conflict. Please choose a different name if this data is new.")
            continue

        try:
            annual_df_raw = pd.read_csv(file_name, header=header_row_annual)
            annual_df_melted = melt_annual_table(annual_df_raw, value_col_name_prefix,
                                                 source_label=file_name)
            if annual_df_melted is None:
                continue

            # Merge with the current combined dataframe
            print(f"Merging {value_col_name_prefix} into the main dataframe...")
            print(f"Shapes before merge: current_df={current_df.shape}, annual_df_melted ({value_col_name_prefix})={annual_df_melted.shape}")

            original_cols = set(current_df.columns)
            current_df = pd.merge(current_df, annual_df_melted, on=['Region', 'Year'], how='left')
            new_cols = set(current_df.columns) - original_cols

            print(f"Shapes after merge: current_df={current_df.shape}")
            if value_col_name_prefix in current_df.columns:
                print(f"NaNs introduced in '{value_col_name_prefix}': {current_df[value_col_name_prefix].isna().sum()} / {len(current_df)}")
            else:
                print(f"Column {value_col_name_prefix} was not properly added. Columns added: {new_cols}")

        except FileNotFoundError:
            print(f"Error: File not found - {file_name}")
        except Exception as e:
            # Best effort by design: one unreadable file must not stop the others.
            print(f"An error occurred while processing {file_name}: {e}")

    # --- Final inspection and save ---
    print("\nFinal combined dataframe head:")
    print(current_df.head())
    print("\nFinal combined dataframe info:")
    current_df.info()
    print(f"\nFinal combined dataframe shape: {current_df.shape}")

    output_file_name = "final_augmented_dataset.csv"
    current_df.to_csv(output_file_name, index=False)
    print(f"\nSuccessfully saved the further augmented dataset to: {output_file_name}")

except FileNotFoundError:
    print(f"Error: Base file '{base_file}' not found. Please ensure it was generated from the previous step.")
except ValueError as ve:
    print(ve)
except Exception as e:
    print(f"An unexpected error occurred: {e}")

In [None]:
#Time Series
import matplotlib.pyplot as plt  # no earlier cell imports pyplot, so import it here

# NOTE(review): relies on a `df` with 'time' and 'avg' columns; no earlier cell in this
# notebook creates it — confirm which cell/file is expected to load it.
# Keep only complete (time, avg) pairs, then coerce both columns to numbers.
df_filtered = df[['time', 'avg']].dropna().copy()
df_filtered['time'] = df_filtered['time'].astype(int)
df_filtered['avg'] = pd.to_numeric(df_filtered['avg'], errors='coerce')
# Entries of 'avg' that were not numeric are NaN after coercion; drop them too.
df_filtered = df_filtered.dropna()

# Remove outliers using IQR: keep values within 1.5 * IQR of the quartiles
Q1 = df_filtered['avg'].quantile(0.25)
Q3 = df_filtered['avg'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_no_outliers = df_filtered[(df_filtered['avg'] >= lower_bound) & (df_filtered['avg'] <= upper_bound)]

# Plot the time series as unconnected markers (linestyle='' draws no line)
plt.figure(figsize=(10, 6))
plt.plot(df_no_outliers['time'], df_no_outliers['avg'], marker='o', linestyle='', alpha=0.7)
plt.title('Time Series Plot Without Outliers')
plt.xlabel('Year')
plt.ylabel('μg/kg')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#Raw Time Series
import matplotlib.pyplot as plt  # no earlier cell imports pyplot, so import it here

# NOTE(review): relies on a `df` with 'time' and 'avg' columns created elsewhere —
# confirm which cell/file is expected to load it.
# Plot every record as an unconnected marker, without any filtering.
plt.figure(figsize=(10, 6))
plt.plot(df['time'], df['avg'], marker='o', linestyle='', alpha=0.7)
plt.title('Raw Data Time Series Plot')
plt.xlabel('Year')
plt.ylabel('μg/kg')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#K-Means: elbow method for choosing the number of clusters
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# NOTE(review): `X` must already hold the feature matrix; it is not defined earlier
# in this notebook — confirm which cell provides it.
# Standardize the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit K-Means for k = 1..10 and record the within-cluster sum of squares (inertia).
k_values = range(1, 11)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

# WCSS against k
plt.plot(k_values, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# NOTE(review): `data_no_outliers` (columns 'avg', 'min', 'max', 'cluster') is expected
# to exist already; it is not created anywhere in the visible notebook — confirm.

# 3D view: avg / min / max on the three axes, colour = cluster label
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    data_no_outliers['avg'],
    data_no_outliers['min'],
    data_no_outliers['max'],
    c=data_no_outliers['cluster'],
    cmap='viridis',
    s=50,
)
ax.set_xlabel('Average Concentration')
ax.set_ylabel('Minimum Concentration')
ax.set_zlabel('Maximum Concentration')
plt.title('3D Visualization of Clusters (k=4, Outliers Removed)')
plt.colorbar(scatter, label='Cluster')
plt.show()

# 2D views: 'avg' against 'min', then 'avg' against 'max', coloured by cluster
for y_column, y_word in (('min', 'Minimum'), ('max', 'Maximum')):
    plt.figure(figsize=(8, 6))
    plt.scatter(
        data_no_outliers['avg'],
        data_no_outliers[y_column],
        c=data_no_outliers['cluster'],
        cmap='viridis',
        s=50,
    )
    plt.xlabel('Average Concentration')
    plt.ylabel(f'{y_word} Concentration')
    plt.title(f'Scatterplot of Average vs {y_word} Concentration, Colored by Cluster (k=4, Outliers Removed)')
    plt.colorbar(label='Cluster')
    plt.show()

In [None]:
from sklearn.metrics import silhouette_score  # Import silhouette_score

# Silhouette score of the cluster assignment.
# NOTE(review): `data_scaled` and `clusters` must come from a clustering cell that is
# not part of the visible notebook — confirm.
silhouette_avg = silhouette_score(data_scaled, clusters)
print("Silhouette Score: {:.3f}".format(silhouette_avg))

In [None]:
#China Map
!pip install cartopy
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

# ... (previous code for data loading and cleaning) ...

# Coerce 'avg' to numbers; entries that cannot be parsed become NaN
df['avg'] = pd.to_numeric(df['avg'], errors='coerce')

# Figure with a single cartopy axes in plate carrée projection
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())

# Basemap layers, drawn in this order, with their per-layer drawing options
basemap_layers = (
    (cfeature.LAND, {}),
    (cfeature.COASTLINE, {}),
    (cfeature.BORDERS, {'linestyle': ':'}),
    (cfeature.LAKES, {'alpha': 0.5}),
    (cfeature.RIVERS, {}),
)
for layer, draw_options in basemap_layers:
    ax.add_feature(layer, **draw_options)

# Restrict the view to [lon_min, lon_max, lat_min, lat_max]; adjust as needed
ax.set_extent([73, 135, 18, 54])

# One marker per record, coloured by 'avg'; `transform` declares that the
# coordinates passed in are plain longitude/latitude values
scatter = ax.scatter(
    df['lon'],
    df['lat'],
    c=df['avg'],
    cmap='viridis',
    s=20,
    alpha=0.7,
    transform=ccrs.PlateCarree(),
)

# Colour scale for the markers
cbar = fig.colorbar(scatter, ax=ax, label='PFAS (avg)')

# Title and axis labels
ax.set_title('PFAS Data Collection Map in China')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

plt.show()

In [None]:
#Counts
import pandas as pd
import matplotlib.pyplot as plt

# Number of records per distinct 'poid' value, most frequent first
poid_counts = df['poid'].value_counts()

# Bar chart of those counts
poid_counts.plot.bar(figsize=(10, 6))

# Labels and title
plt.xlabel('POID')
plt.ylabel('Count')
plt.title('Counts of Each POID')

# Slanted tick labels so long category names stay readable
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()

In [None]:
#Decision Tree
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
df_augmentedcombined = pd.read_csv('augmentedcombined.csv')

# Rows without a target value cannot be used for training or scoring
df_augmentedcombined.dropna(subset=['avg'], inplace=True)

# Clean the 'avg' column: the literal '<0.1' is treated as 0, then everything is cast to float
df_augmentedcombined['avg'] = df_augmentedcombined['avg'].replace('<0.1', '0').astype(float)

# Define target variable
y = df_augmentedcombined['avg']

# Define features (exclude 'id' and 'avg')
X = df_augmentedcombined.drop(['id', 'avg'], axis=1)

# 'organ' and 'poid' are one-hot encoded; every other column is treated as numeric
categorical_features = ['organ', 'poid']
numerical_features = X.columns.drop(categorical_features).tolist()

# Imputation lives inside the pipeline so its statistics are learned from the
# training split only.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features), # Impute numerical with mean
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # Impute categorical with a constant
            ('onehot', OneHotEncoder(handle_unknown='ignore')) # Categories unseen in training encode as all zeros
        ]), categorical_features)
    ])

# Create a pipeline with preprocessing and DecisionTreeRegressor
model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', DecisionTreeRegressor(random_state=42))])

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# R^2 must compare the predictions with the truth; scoring y_test against itself
# always yields 1.0.
r2 = r2_score(y_test, y_pred)


# Print evaluation metrics
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

In [None]:
#Random Forest
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the dataset
try:
    df = pd.read_csv("augmentedcombined.csv")
except Exception as e:
    print(f"Error loading CSV: {e}")
    raise

# Define the target variable and columns to exclude
target_variable = 'avg'
columns_to_exclude = ['province_c', 'locality_c', 'longitude', 'latitude', target_variable]

# Every remaining column is a feature.
# NOTE(review): unlike the other model cells, 'id' is not excluded here — confirm intent.
features = [col for col in df.columns if col not in columns_to_exclude]

# Preprocessing
# 1. Handle missing values
# Rows without a target are dropped first, so imputation statistics are computed once,
# on the rows that are actually used.
n_missing_target = int(df[target_variable].isnull().sum())
if n_missing_target:
    print(f"Warning: Target variable '{target_variable}' contains {n_missing_target} missing values. These rows will be dropped.")
df_cleaned = df.dropna(subset=[target_variable]).copy()

# Same target cleaning as the other model cells: the literal '<0.1' counts as 0.
df_cleaned[target_variable] = df_cleaned[target_variable].replace('<0.1', '0').astype(float)

# Separate features (X) and target (y). The explicit copy makes X an independent frame,
# so the column assignments below really modify it.
X = df_cleaned[features].copy()
y = df_cleaned[target_variable]

# Numeric gaps get the column mean; other gaps get the most frequent value.
# Assigning the result back replaces the chained `X[col].fillna(..., inplace=True)`
# form, which pandas does not guarantee to write through.
for col in X.columns:
    if not X[col].isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(X[col]):
        fill_value = X[col].mean()
    else:
        modes = X[col].mode()
        # mode() is empty when the whole column is missing; fall back to a marker value.
        fill_value = modes[0] if not modes.empty else 'missing'
    X[col] = X[col].fillna(fill_value)

# 2. Encode categorical features
# Every object/category column is label-encoded into integer codes.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Fitted on the full column (before the split) so every category has a code;
    # values are cast to str first to cope with mixed types.
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Split the data
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
except ValueError as e:
    print(f"Error during data splitting: {e}")
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    if X.shape[0] == 0 or y.shape[0] == 0:
        print("No data available for training after preprocessing. This might be due to all target values being NaN initially.")
    raise

# Train the Random Forest Regressor model
rf_regressor = RandomForestRegressor(random_state=42)
try:
    rf_regressor.fit(X_train, y_train)
except Exception as e:
    print(f"Error during model training: {e}")
    raise

# Make predictions
y_pred = rf_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Print model performance
print(f"Random Forest Regressor Model Performance for target 'avg':")
print(f"Features used: {features}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2 ): {r2:.4f}")

# Feature importances (optional, but informative)
importances = rf_regressor.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance_df)

# Summary message. The empty-data branch is kept for completeness; an empty X makes
# train_test_split raise above, so execution normally never gets here in that case.
if X.shape[0] == 0:
    final_message = "The Random Forest Regression could not be performed as there was no data left after handling missing values in the target variable 'avg'."
else:
    final_message = (
        "A Random Forest Regression model has been successfully trained on the dataset "
        f"to predict the target variable 'avg', excluding 'province_c', 'locality_c', 'longitude', and 'latitude'.\n\n"
        f"Model Performance:\n"
        f"- Mean Squared Error (MSE): {mse:.4f}\n"
        f"- Root Mean Squared Error (RMSE): {rmse:.4f}\n"
        f"- R-squared (R2): {r2:.4f}\n\n"
        "The features used for training were: " + ", ".join(features) + ".\n"
        "Feature importances have also been calculated and printed in the logs."
    )

print(f"\nFinal User Message: {final_message}")

In [None]:
#Fixed Random Forest
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read the data and discard rows that have no target value
df_augmentedcombined = pd.read_csv('augmentedcombined.csv')
df_augmentedcombined.dropna(subset=['avg'], inplace=True)

# Target cleaning: the literal '<0.1' is treated as 0, then the column is cast to float
avg_with_floor = df_augmentedcombined['avg'].replace('<0.1', '0')
df_augmentedcombined['avg'] = avg_with_floor.astype(float)

# --- Outlier Removal ---
# Quartiles and inter-quartile range of the target
Q1, Q3 = df_augmentedcombined['avg'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Values further than 1.5 * IQR from the quartiles count as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows whose target lies inside [lower_bound, upper_bound] (both ends included)
within_bounds = df_augmentedcombined['avg'].between(lower_bound, upper_bound)
df_cleaned = df_augmentedcombined[within_bounds].copy()
# --- End Outlier Removal ---

# Target and features come from the outlier-free frame; 'id' and the target are not features
y = df_cleaned['avg']
X = df_cleaned.drop(columns=['id', 'avg'])

# 'organ' and 'poid' are one-hot encoded; every other column is treated as numeric
categorical_features = ['organ', 'poid']
numerical_features = X.columns.drop(categorical_features).tolist()

# Numeric columns: mean imputation. Categorical columns: constant fill, then one-hot.
numeric_step = SimpleImputer(strategy='mean')
categorical_step = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_step, numerical_features),
        ('cat', categorical_step, categorical_features),
    ])

# Preprocessing followed by a random forest regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training part, predict the held-out part
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics on the held-out part
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (after outlier removal): {mse}')
print(f'Root Mean Squared Error (after outlier removal): {rmse}')
print(f'R-squared (after outlier removal): {r2}')

In [None]:
#XG Boost
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Read the augmented, combined dataset from the working directory
xgb_input_path = 'augmentedcombined.csv'
data = pd.read_csv(xgb_input_path)

## STEP 1: Data Cleaning and Type Conversion
def convert_to_numeric(df, columns):
    """Coerce the listed columns of *df* to numeric dtype, in place.

    Names in *columns* that are not present in *df* are ignored. Each value is
    converted to text first and then parsed; anything that does not parse as a
    number becomes NaN.

    Args:
        df: DataFrame to modify (it is changed in place).
        columns: Iterable of column names to convert.

    Returns:
        The same DataFrame object that was passed in.
    """
    present = [name for name in columns if name in df.columns]
    for name in present:
        as_text = df[name].astype(str)
        df[name] = pd.to_numeric(as_text, errors='coerce')
    return df

# Columns that must be numeric for the steps below; they are coerced in one pass
numeric_cols = [
    'avg',
    'max',
    'min',
    'IndustryValue',
    'EnvEmerg',
    'NetIndustryValue',
    'PollutantEmission',
    'WasteGas',
    'RealEstateDev',
    'WasteDisposalVol',
    'NumWasteFactories',
    'MajorEnergyConsumption',
    'IndustryValue_Overall',
    'IndustrialEnterprise',
]

data = convert_to_numeric(data, numeric_cols)

## STEP 2: IQR Outlier Removal Function (with NaN handling)
def remove_outliers_iqr(df, column):
    """Drop rows whose *column* value lies outside the 1.5 * IQR fences.

    Quartiles are computed from the non-missing values only. Rows where the value
    is missing are kept. When *column* does not exist, or holds no non-missing
    values, *df* is returned unchanged.

    Args:
        df: DataFrame to filter.
        column: Name of the column to test.

    Returns:
        The filtered DataFrame (or *df* itself in the unchanged cases above).
    """
    if column not in df.columns:
        return df

    values = df[column]
    observed = values.dropna()
    if observed.empty:
        return df

    q1 = observed.quantile(0.25)
    q3 = observed.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    # between() includes both fences and is False for NaN, so missing values are
    # re-admitted explicitly.
    keep = values.between(low_fence, high_fence) | values.isna()
    return df[keep]

## STEP 3: Apply Outlier Removal
clean_data = data.copy()

# Rows without a numeric target cannot be used as training labels. Values such as
# '<0.1' were coerced to NaN in STEP 1, and remove_outliers_iqr() keeps NaN rows.
clean_data = clean_data.dropna(subset=['avg'])

# First remove outliers from target variable
clean_data = remove_outliers_iqr(clean_data, 'avg')

# Then remove outliers from the remaining numeric columns (optional). 'avg' is skipped
# because it was filtered just above; a second pass would recompute the fences on the
# already-trimmed target and trim it again.
for col in numeric_cols:
    if col == 'avg':
        continue
    clean_data = remove_outliers_iqr(clean_data, col)

## STEP 4: Prepare Data for Modeling
# NOTE(review): 'max' and 'min' stay in X; if they describe the same measurement as
# 'avg' they leak the target — confirm they are meant to be predictors.
X = clean_data.drop(columns=['avg', 'id'])
y = clean_data['avg']

# Encode text columns as integer category codes (missing values get code -1).
# Besides the two known categoricals, any other non-numeric column is encoded the
# same way, because DMatrix rejects object-typed columns.
categorical_cols = ['poid', 'organ']
other_non_numeric = [c for c in X.select_dtypes(exclude=['number', 'bool']).columns
                     if c not in categorical_cols]
for col in categorical_cols + other_non_numeric:
    if col in X.columns:
        X[col] = X[col].astype('category').cat.codes

## STEP 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

## STEP 6: XGBoost Model with Missing Value Handling
# NaN feature values are passed through as-is for XGBoost to treat as missing.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'rmse'
}

# NOTE(review): early stopping monitors the same split that is scored below, so the
# reported metrics are likely optimistic — consider a separate validation split.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=10,
    verbose_eval=50
)

## STEP 7: Evaluation
y_pred = model.predict(dtest)

# RMSE is taken as sqrt(MSE), as in the other model cells; the `squared=False`
# argument of mean_squared_error is not accepted by current scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\nEvaluation Metrics:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")

In [None]:
# NOTE(review): `model` must be a fitted sklearn Pipeline with 'preprocessor' and
# 'regressor' steps. The XGBoost cell rebinds `model` to a Booster, which has no
# named_steps — run this after one of the pipeline regressor cells.
preprocessor = model.named_steps['preprocessor']
regressor = model.named_steps['regressor']

# Column names as they leave the preprocessor, aligned with the importances
feature_names_out = preprocessor.get_feature_names_out()
feature_importances = regressor.feature_importances_

# Label each importance with its feature name, largest first
feature_importances_series = pd.Series(feature_importances, index=feature_names_out)
sorted_feature_importances = feature_importances_series.sort_values(ascending=False)

# Print the ranking as a left-aligned markdown table
print("Feature Importances (after outlier removal):")
print(sorted_feature_importances.to_markdown(numalign="left", stralign="left"))

In [None]:
#GNB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# Load the dataset
df_augmentedcombined = pd.read_csv('augmentedcombined.csv')

# Drop rows where the target variable 'avg' is NaN
df_augmentedcombined.dropna(subset=['avg'], inplace=True)

# Clean the 'avg' column: the literal '<0.1' is treated as 0, then everything is cast to float
df_augmentedcombined['avg'] = df_augmentedcombined['avg'].replace('<0.1', '0').astype(float)

# Categorize 'avg' into 'low', 'medium', 'high' using the 33% and 66% quantiles as cut points
q_low = df_augmentedcombined['avg'].quantile(0.33)
q_high = df_augmentedcombined['avg'].quantile(0.66)

def categorize_avg(avg_value):
    """Return 'low', 'medium' or 'high' for *avg_value* relative to q_low and q_high.

    Both thresholds are inclusive upper bounds of their class.
    """
    if avg_value <= q_low:
        return 'low'
    elif avg_value <= q_high:
        return 'medium'
    else:
        return 'high'

df_augmentedcombined['avg_category'] = df_augmentedcombined['avg'].apply(categorize_avg)

# Define target variable (categorical 'avg')
y = df_augmentedcombined['avg_category']

# Define features (exclude 'id', original 'avg', and the new 'avg_category')
X = df_augmentedcombined.drop(['id', 'avg', 'avg_category'], axis=1)

# 'organ' and 'poid' are one-hot encoded; every other column is treated as numeric
categorical_features = ['organ', 'poid']
numerical_features = X.columns.drop(categorical_features).tolist()

# Same imputation + one-hot preprocessing as the other model cells; no scaling step
# is added. sparse_threshold=0 forces a dense output array: the one-hot block is
# sparse, ColumnTransformer may otherwise return a sparse matrix, and GaussianNB
# only accepts dense input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features), # Impute numerical with mean
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # Impute categorical with a constant
            ('onehot', OneHotEncoder(handle_unknown='ignore')) # One-hot encode categorical
        ]), categorical_features)
    ],
    sparse_threshold=0)

# Create a pipeline with preprocessing and GaussianNB classifier
model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', GaussianNB())])

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print evaluation metrics
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

In [None]:
#SVM
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# Read the data and discard rows that have no target value
df_augmentedcombined = pd.read_csv('augmentedcombined.csv')
df_augmentedcombined.dropna(subset=['avg'], inplace=True)

# Target cleaning: the literal '<0.1' is treated as 0, then the column is cast to float
avg_with_floor = df_augmentedcombined['avg'].replace('<0.1', '0')
df_augmentedcombined['avg'] = avg_with_floor.astype(float)

# Cut points for the three classes: 33% and 66% quantiles of the target
q_low, q_high = df_augmentedcombined['avg'].quantile([0.33, 0.66])

def categorize_avg(avg_value):
    """Return 'low', 'medium' or 'high' for *avg_value* relative to q_low and q_high.

    Both thresholds are inclusive upper bounds of their class.
    """
    if avg_value <= q_low:
        return 'low'
    return 'medium' if avg_value <= q_high else 'high'

df_augmentedcombined['avg_category'] = df_augmentedcombined['avg'].map(categorize_avg)

# The class label is the target; 'id', the raw 'avg' and the label itself are not features
y = df_augmentedcombined['avg_category']
X = df_augmentedcombined.drop(columns=['id', 'avg', 'avg_category'])

# 'organ' and 'poid' are one-hot encoded; every other column is treated as numeric
categorical_features = ['organ', 'poid']
numerical_features = X.columns.drop(categorical_features).tolist()

# Numeric columns: mean imputation. Categorical columns: constant fill, then one-hot.
numeric_step = SimpleImputer(strategy='mean')
categorical_step = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_step, numerical_features),
        ('cat', categorical_step, categorical_features),
    ])

# Preprocessing followed by a support vector classifier with default parameters
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=42)),
])

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training part, predict the held-out part
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy and per-class report on the held-out part
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

In [None]:
#Logistic Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# Load the dataset
df_augmentedcombined = pd.read_csv('augmentedcombined.csv')

# Drop rows where the target variable 'avg' is NaN
df_augmentedcombined.dropna(subset=['avg'], inplace=True)

# Clean the 'avg' column: the literal '<0.1' is treated as 0, then everything is cast to float
df_augmentedcombined['avg'] = df_augmentedcombined['avg'].replace('<0.1', '0').astype(float)

# Cut points for the three classes: 33% and 66% quantiles of the target,
# which gives roughly equal-sized categories
low_threshold = df_augmentedcombined['avg'].quantile(0.33)
high_threshold = df_augmentedcombined['avg'].quantile(0.66)

# Create categorical 'avg' column
def categorize_avg(avg_value):
    """Return 'low', 'medium' or 'high' for *avg_value*.

    low_threshold and high_threshold are inclusive upper bounds of their class.
    """
    if avg_value <= low_threshold:
        return 'low'
    elif avg_value <= high_threshold:
        return 'medium'
    else:
        return 'high'

df_augmentedcombined['avg_category'] = df_augmentedcombined['avg'].apply(categorize_avg)


# Define target variable
y = df_augmentedcombined['avg_category']

# Define features (exclude 'id', original 'avg', and the new 'avg_category')
X = df_augmentedcombined.drop(['id', 'avg', 'avg_category'], axis=1)

# 'organ' and 'poid' are one-hot encoded; every other column is treated as numeric
categorical_features = ['organ', 'poid']
numerical_features = X.columns.drop(categorical_features).tolist()

# Create transformers for preprocessing and imputation
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features), # Impute numerical with mean
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # Impute categorical with a constant
            ('onehot', OneHotEncoder(handle_unknown='ignore')) # One-hot encode categorical
        ]), categorical_features)
    ])

# Create a pipeline with preprocessing and LogisticRegression.
# `multi_class` is no longer passed: 'auto' was already the default, and the argument
# is deprecated in recent scikit-learn releases, so the fitted model is the same.
# NOTE(review): recent scikit-learn also warns about solver='liblinear' on multiclass
# targets — consider another solver or a OneVsRestClassifier wrapper; confirm against
# the installed version.
model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(random_state=42, solver='liblinear'))])

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{class_report}')

In [None]:
#Random Forest
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Preprocessing tools used below; imported here because this cell's own import
# list does not include them.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
df_augmentedcombined = pd.read_csv('augmentedcombined.csv')

# Same target cleaning as the other model cells: rows without a target are dropped
# and the literal '<0.1' is treated as 0 before casting to float.
df_augmentedcombined.dropna(subset=['avg'], inplace=True)
df_augmentedcombined['avg'] = df_augmentedcombined['avg'].replace('<0.1', '0').astype(float)

# Define the target variable and features
y = df_augmentedcombined['avg']
X = df_augmentedcombined.drop(['id', 'avg'], axis=1)

# The forest cannot be fitted on text columns, so 'organ' and 'poid' (the categoricals
# used by the other cells) plus any other non-numeric column are one-hot encoded;
# the remaining columns are mean-imputed.
categorical_features = [col for col in X.columns
                        if col in ('organ', 'poid')
                        or not pd.api.types.is_numeric_dtype(X[col])]
numerical_features = [col for col in X.columns if col not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor model behind the preprocessing step
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")