In [28]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Purpose: rank every numeric feature by its mutual information (MI) with the
# target column and keep the features whose MI score exceeds a fixed threshold.

# Tunable settings for this analysis
TARGET_COL = 'feelslike'                      # column whose dependence on the features is measured
EXCLUDED_COLS = ['severerisk', 'precipprob']  # numeric columns deliberately left out of the feature set
MI_THRESHOLD = 0.3                            # keep features with an MI score strictly above this value
RANDOM_STATE = 42                             # fixed seed so the MI scores are reproducible across runs

# Load the dataset
file_path = 'dataset/data.csv'  # Adjust the path to your dataset
data = pd.read_csv(file_path)

# Select only the numeric (float64 / int64) columns from the dataset
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
data_numeric = data[numeric_cols]

# Drop rows with missing values (NaN) in the numeric dataset.
# NOTE(review): this runs before EXCLUDED_COLS are removed, so rows whose only
# missing values are in those columns are discarded as well — confirm intended.
data_numeric_cleaned = data_numeric.dropna()

# Prepare the feature set: every numeric column except the target and the
# explicitly excluded columns. ('temp' stays in X as an ordinary feature.)
X = data_numeric_cleaned.drop(columns=[TARGET_COL] + EXCLUDED_COLS)
y = data_numeric_cleaned[TARGET_COL]  # Target variable

# Step 1: Calculate Mutual Information.
# The estimator is randomized, so the seed is pinned; without it, scores close
# to MI_THRESHOLD could fall on either side of the cut-off from run to run.
mi = mutual_info_regression(X, y, random_state=RANDOM_STATE)

# Create a DataFrame for mutual information, highest score first
mi_df = pd.DataFrame({
    'Feature': X.columns,
    'Mutual Information': mi
}).sort_values(by='Mutual Information', ascending=False)

# Print mutual information
print("Mutual Information between Features and Target:")
print(mi_df)

# Step 2: Select top features based on MI (every feature with MI above MI_THRESHOLD).
# Filtering on the score itself, rather than taking a hard-coded head(7), keeps
# the selection correct when the data or the scores change. mi_df is already
# sorted, so the selected features remain in descending-MI order.
top_features = mi_df.loc[mi_df['Mutual Information'] > MI_THRESHOLD, 'Feature'].values
print(f"Top features based on MI: {top_features}")




Mutual Information between Features and Target:
             Feature  Mutual Information
2               temp            3.190251
3       feelslikemax            1.750444
4       feelslikemin            1.502751
1            tempmin            1.494775
0            tempmax            1.419369
5                dew            1.158433
14  sealevelpressure            0.740701
15        cloudcover            0.280636
16        visibility            0.273655
17    solarradiation            0.266939
18       solarenergy            0.266572
19           uvindex            0.239078
6           humidity            0.231080
13           winddir            0.181604
7             precip            0.089658
11          windgust            0.082332
12         windspeed            0.033797
8        precipcover            0.031931
10         snowdepth            0.000000
9               snow            0.000000
20         moonphase            0.000000
Top features based on MI: ['temp' 'feelslikemax' '