In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read and written through ordinary filesystem paths.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook anywhere else.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import necessary libraries
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# Load the dataset from the specified file path.
# A missing file is fatal for everything that follows, so print a readable hint
# and re-raise: the traceback aborts this cell at the point of failure. Calling
# exit() in a notebook targets the kernel itself rather than just stopping the
# cell, which is not what is wanted here.
file_path = "/content/drive/MyDrive/RegresiUTSTelkom.csv"
try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File not found at {file_path}. Please verify the file path and try again.")
    raise
print("Dataset successfully loaded!")

# Replace the original headers with positional names: the first column becomes
# x1, the second x2, and so on up to the last column.
column_names = ['x' + str(position) for position in range(1, len(data.columns) + 1)]
data.columns = column_names
print("Columns renamed successfully to:", column_names)

# Display basic information about the dataset and preview the first few rows.
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
data.info()
print("\nFirst five rows:")
print(data.head())

# Generate descriptive statistics for the dataset.
print("\nDescriptive statistics:")
print(data.describe())

# Discard rows that are exact duplicates of another row, then report the
# remaining number of rows and columns.
data = data.drop_duplicates()
print(f"Duplicates removed. The dataset now contains {len(data)} rows and {len(data.columns)} columns.")

# Separate the target column ('x1') from the rest of the dataset.
# Afterwards `target` holds the x1 values and `data` holds only the remaining
# feature columns.
try:
    target = data['x1']
    data = data.drop(columns=['x1'])
except KeyError:
    print("Target column 'x1' not found in the dataset.")
    # Re-raise so the cell stops here with a traceback; exit() in a notebook
    # targets the kernel rather than cleanly aborting the cell.
    raise
print("Target column 'x1' successfully separated.")

# Keep only the features whose absolute correlation with the target exceeds the
# threshold. A feature whose correlation is NaN compares False against the
# threshold and is therefore not selected.
correlation_threshold = 0.1
correlation_with_target = data.corrwith(target).abs()
selected_features = correlation_with_target[correlation_with_target > correlation_threshold].index

if selected_features.empty:
    # Nothing to work with downstream: raise so the cell stops with a clear
    # message instead of calling exit(), which targets the notebook kernel.
    raise ValueError("No features found with correlation above the threshold.")

data_selected = data[selected_features]
print(f"Selected features based on correlation threshold: {selected_features.tolist()}")

# Apply variance thresholding to remove features with low variance.
# `selector` is kept around because its support mask is needed later to recover
# the names of the surviving columns.
variance_threshold = 0.1
selector = VarianceThreshold(threshold=variance_threshold)
try:
    data_high_variance = selector.fit_transform(data_selected)
except ValueError:
    print("Error: No feature met the variance threshold. Consider adjusting the threshold and try again.")
    # Re-raise so the original error text stays visible (a ValueError here can
    # have other causes than the threshold) and the cell stops with a traceback
    # instead of calling exit(), which targets the notebook kernel.
    raise
print("Low variance features removed successfully.")

# Convert the processed array back to a DataFrame. get_support() is a boolean
# mask over the columns of data_selected, so indexing the column labels with it
# yields the names of the features that survived variance thresholding.
data_final = pd.DataFrame(data_high_variance, columns=data_selected.columns[selector.get_support()])
print(f"Dataset now contains {data_final.shape[1]} features after variance thresholding.")

# Reintegrate the target column into the processed dataset.
# data_final was built from a bare array and so has a fresh 0..n-1 index, while
# `target` still carries the row labels left over after duplicate removal.
# Assign the raw values by position, and fail loudly on a row-count mismatch
# instead of silently truncating the target.
if len(target) != len(data_final):
    raise ValueError(
        f"Row count mismatch: {len(data_final)} feature rows vs {len(target)} target values."
    )
data_final['x1'] = target.to_numpy()
print("Target column re-added to the processed dataset.")

# Write the processed dataset to Drive as CSV, leaving the row index out of the
# file. A failed write is reported but does not stop the cell.
processed_file_path = "/content/drive/MyDrive/RegresiUTSTelkomCONV.csv"
try:
    data_final.to_csv(processed_file_path, index=False)
    print("Processed dataset saved successfully!")
    print("File location:", processed_file_path)
except Exception as save_error:
    print(f"Error saving the processed dataset: {save_error}")

# Report the (rows, columns) shape of the final dataset.
print(f"Final dataset dimensions: {data_final.shape}")

Dataset successfully loaded!
Columns renamed successfully to: ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515130 entries, 0 to 515129
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   x1      515130 non-null  float64
 1   x2      515130 non-null  float64
 2   x3      515130 non-null  float64
 3   x4      515130 non-null  float64
 4   x5      515130 non-null  float64
 5   x6      515130 non-null  float64
 6   x7      515130 non-null  float64
 7   x8      515130 non-null  float64
 8   x9      515130 non-null  float64
 9   x10     515130 non-null  int64  
dtypes: float64(9), int64(1)
memory usage: 39.3 MB
None

First five rows:
         x1        x2        x3        x4        x5         x6        x7  \
0  48.73215  70.32679 -24.83777   8.76630  26.84939  202.18689 -12.19034   
1  50.95714  55.81851 -18.54940  -3.27872  28.70107   13.09302