In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and numerical warnings — consider narrowing it.
warnings.filterwarnings("ignore")

In [11]:
# Load the cleaned CoinGecko snapshot (path is relative to the notebook's working directory).
df = pd.read_csv(r"./data/coin_gecko_2022-03-17_cleaned.csv")

### Feature Selection Report: Data Loading
- Loaded the cleaned and feature-engineered dataset for feature selection and modeling.
- This dataset is ready for model input.


In [12]:
# (rows, columns) of the loaded frame.
df.shape

(500, 11)

In [13]:
# Render the frame; pandas truncates the display to the first and last rows.
df

Unnamed: 0,coin,symbol,price,1h,24h,7d,24h_volume,mkt_cap,abs_7d_change,abs_24h_change,Liquidity
0,Bitcoin,BTC,40851.380000,0.001,0.000,-0.027,2.047612e+10,7.760774e+11,-1102.987260,0.000000,0.026384
1,Ethereum,ETH,2824.420000,0.004,0.029,0.034,1.364041e+10,3.390772e+11,96.030280,81.908180,0.040228
2,Tether,USDT,1.000000,-0.000,0.000,0.000,4.413140e+10,8.020588e+10,0.000000,0.000000,0.550227
3,BNB,BNB,389.610000,0.002,0.016,-0.010,1.425354e+09,6.556116e+10,-3.896100,6.233760,0.021741
4,USD Coin,USDC,0.999739,-0.001,0.000,-0.000,3.569816e+09,5.259607e+10,-0.000000,0.000000,0.067872
...,...,...,...,...,...,...,...,...,...,...,...
495,IRISnet,IRIS,0.055426,0.016,-0.003,-0.088,2.976839e+06,6.809024e+07,-0.004877,-0.000166,0.043719
496,Circuits of Value,COVAL,0.037961,0.002,-0.012,-0.054,3.667870e+05,6.782627e+07,-0.002050,-0.000456,0.005408
497,ARPA Chain,ARPA,0.069003,-0.000,0.008,-0.037,1.363376e+07,6.776284e+07,-0.002553,0.000552,0.201198
498,SuperRare,RARE,0.464613,-0.003,0.014,0.019,9.398219e+06,6.738822e+07,0.008828,0.006505,0.139464


### Feature Selection Report: Data Shape and Preview
- Checked the shape and previewed the data to confirm the number of features and samples.
- Ensured the target variable `Liquidity` is present.


In [14]:
# Target: the Liquidity column. Features: every remaining column of df.
# NOTE(review): in the previewed rows Liquidity equals 24h_volume / mkt_cap
# (e.g. 2.047612e+10 / 7.760774e+11 ≈ 0.026384) and both of those columns stay
# in X — confirm that predicting a ratio from its own inputs is intended.
y = df['Liquidity']
X = df.drop(columns=['Liquidity'])

### Feature Selection Report: Feature/Target Split
- Split the dataset into features (`X`) and target (`y`).
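- The target variable is `Liquidity`; every other column is kept in `X` at this stage, and the non-numeric columns (`coin`, `symbol`) are dropped later, before the models are fitted.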


In [15]:
# Inspect the feature frame left after removing the target column.
X

Unnamed: 0,coin,symbol,price,1h,24h,7d,24h_volume,mkt_cap,abs_7d_change,abs_24h_change
0,Bitcoin,BTC,40851.380000,0.001,0.000,-0.027,2.047612e+10,7.760774e+11,-1102.987260,0.000000
1,Ethereum,ETH,2824.420000,0.004,0.029,0.034,1.364041e+10,3.390772e+11,96.030280,81.908180
2,Tether,USDT,1.000000,-0.000,0.000,0.000,4.413140e+10,8.020588e+10,0.000000,0.000000
3,BNB,BNB,389.610000,0.002,0.016,-0.010,1.425354e+09,6.556116e+10,-3.896100,6.233760
4,USD Coin,USDC,0.999739,-0.001,0.000,-0.000,3.569816e+09,5.259607e+10,-0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
495,IRISnet,IRIS,0.055426,0.016,-0.003,-0.088,2.976839e+06,6.809024e+07,-0.004877,-0.000166
496,Circuits of Value,COVAL,0.037961,0.002,-0.012,-0.054,3.667870e+05,6.782627e+07,-0.002050,-0.000456
497,ARPA Chain,ARPA,0.069003,-0.000,0.008,-0.037,1.363376e+07,6.776284e+07,-0.002553,0.000552
498,SuperRare,RARE,0.464613,-0.003,0.014,0.019,9.398219e+06,6.738822e+07,0.008828,0.006505


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


# Candidate regressors, keyed by the display name used in the evaluation report.
# The tree-based estimators are seeded so that repeated runs produce the same
# scores; 42 is the seed already used for the train/test splits and grid search.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Support Vector": SVR(),
}
 

### Feature Selection Report: Model Definitions
- Defined multiple regression models: Linear Regression, Decision Tree, Random Forest, and Support Vector.
- This allows comparison of different algorithms for the prediction task.


In [18]:
from sklearn import metrics
from sklearn.model_selection import train_test_split


In [19]:
# Display the feature frame again before modelling.
# NOTE(review): this repeats the earlier preview of X — consider removing one of the two.
X

Unnamed: 0,coin,symbol,price,1h,24h,7d,24h_volume,mkt_cap,abs_7d_change,abs_24h_change
0,Bitcoin,BTC,40851.380000,0.001,0.000,-0.027,2.047612e+10,7.760774e+11,-1102.987260,0.000000
1,Ethereum,ETH,2824.420000,0.004,0.029,0.034,1.364041e+10,3.390772e+11,96.030280,81.908180
2,Tether,USDT,1.000000,-0.000,0.000,0.000,4.413140e+10,8.020588e+10,0.000000,0.000000
3,BNB,BNB,389.610000,0.002,0.016,-0.010,1.425354e+09,6.556116e+10,-3.896100,6.233760
4,USD Coin,USDC,0.999739,-0.001,0.000,-0.000,3.569816e+09,5.259607e+10,-0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
495,IRISnet,IRIS,0.055426,0.016,-0.003,-0.088,2.976839e+06,6.809024e+07,-0.004877,-0.000166
496,Circuits of Value,COVAL,0.037961,0.002,-0.012,-0.054,3.667870e+05,6.782627e+07,-0.002050,-0.000456
497,ARPA Chain,ARPA,0.069003,-0.000,0.008,-0.037,1.363376e+07,6.776284e+07,-0.002553,0.000552
498,SuperRare,RARE,0.464613,-0.003,0.014,0.019,9.398219e+06,6.738822e+07,0.008828,0.006505


In [20]:
# Display the target Series (the Liquidity column).
y

0      0.026384
1      0.040228
2      0.550227
3      0.021741
4      0.067872
         ...   
495    0.043719
496    0.005408
497    0.201198
498    0.139464
499    0.000301
Name: Liquidity, Length: 500, dtype: float64

In [21]:
def evaluate_models(X, y, models, test_size=0.2, random_state=42):
    """Fit every model on one shared train/test split and tabulate test-set errors.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and by each model's ``fit``.
    y : target values aligned with ``X``.
    models : dict mapping a display name to an estimator exposing ``fit`` and
        ``predict``. The estimators are fitted in place.
    test_size : fraction (or count) of rows held out for testing; forwarded to
        ``train_test_split``. Defaults to 0.2.
    random_state : seed for the split; forwarded to ``train_test_split``.
        Defaults to 42.

    Returns
    -------
    pandas.DataFrame
        One row per model, in the iteration order of ``models``, with the
        columns ``Model_name``, ``MSE``, ``MAE`` and ``R²``.

    Side effects
    ------------
    Prints the three metrics for each model, followed by a blank line.
    """
    # One split shared by all models so their scores are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    rows = []
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = metrics.mean_squared_error(y_test, y_pred)
        mae = metrics.mean_absolute_error(y_test, y_pred)
        r2 = metrics.r2_score(y_test, y_pred)

        print(f'---- Metrics for {model_name} ----')
        print(f"MSE: {mse}, MAE: {mae}, R²: {r2}")

        rows.append({'Model_name': model_name, 'MSE': mse, 'MAE': mae, 'R²': r2})

    print()

    # Passing the column list keeps the expected columns even when `models` is empty.
    return pd.DataFrame(rows, columns=['Model_name', 'MSE', 'MAE', 'R²'])

### Feature Selection Report: Model Evaluation Function
- Created a function to evaluate models using MSE, MAE, and R² metrics.
- This function helps compare model performance in a structured way.


In [22]:
# Keep only the numeric columns of X (this leaves out the text columns such as coin and symbol).
X_numeric = X.select_dtypes(include=np.number)

# Score every candidate model on the numeric-only feature set.
report = evaluate_models(X=X_numeric, y=y, models=models)

---- Metrics for Linear Regression ----
MSE: 10.21879375014683, MAE: 0.5372523843046164, R²: -106.37070668048902
---- Metrics for Decision Tree ----
MSE: 0.055907221910332844, MAE: 0.05719643816380732, R²: 0.41257274862122495
---- Metrics for Random Forest ----
MSE: 0.01205938267640939, MAE: 0.030273685705625657, R²: 0.8732898939194346
---- Metrics for Support Vector ----
MSE: 0.0910184767904711, MAE: 0.11178496297664636, R²: 0.043652468880277406



### Feature Selection Report: Numeric Feature Selection and Evaluation
- Selected only numeric features for model training and evaluation.
- Evaluated all defined models and compared their performance.


In [23]:
# Hold out 30% of the rows for the final check of the tuned model; the fixed
# seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Preview the training partition.
X_train

Unnamed: 0,coin,symbol,price,1h,24h,7d,24h_volume,mkt_cap,abs_7d_change,abs_24h_change
5,XRP,XRP,0.796926,0.006,0.016,0.038,3.642947e+09,3.834236e+10,0.030283,0.012751
116,XDC Network,XDC,0.052299,0.012,0.034,-0.056,9.542402e+06,6.444781e+08,-0.002929,0.001778
45,Osmosis,OSMO,9.330000,0.002,-0.007,-0.077,6.142808e+07,3.010427e+09,-0.718410,-0.065310
16,Cronos,CRO,0.403370,-0.001,0.004,-0.013,6.492367e+07,1.021244e+10,-0.005244,0.001613
462,Rai Reflex Index,RAI,3.040000,0.003,0.003,0.002,1.057135e+06,7.761693e+07,0.006080,0.009120
...,...,...,...,...,...,...,...,...,...,...
106,cUSDT,CUSDT,0.021768,0.000,0.000,0.000,5.320090e+03,7.758409e+08,0.000000,0.000000
270,Civic,CVC,0.286170,-0.001,0.067,0.044,6.181465e+07,1.920557e+08,0.012591,0.019173
348,iExec RLC,RLC,1.810000,0.004,-0.001,0.010,4.304190e+06,1.297449e+08,0.018100,-0.001810
435,AllianceBlock,ALBT,0.229837,-0.003,0.038,0.053,3.801330e+05,8.895334e+07,0.012181,0.008734


### Feature Selection Report: Train/Test Split for Grid Search
- Split the data into training and testing sets for hyperparameter tuning.
- Ensured reproducibility with a fixed random state.


In [26]:
from sklearn.model_selection import GridSearchCV

# Keep only the numeric columns of each partition before fitting.
X_train_numeric = X_train.select_dtypes(include=[np.number])
X_test_numeric = X_test.select_dtypes(include=[np.number])

# Hyperparameter candidates for the random forest: 3 * 4 * 3 * 3 = 108 combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold search over the grid, scored by R², on all available cores.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_numeric, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best R² Score:", grid_search.best_score_)

# Predict the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_numeric)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best R² Score: -1.9274115382820427


### Feature Selection Report: Grid Search and Best Model Selection
- Performed GridSearchCV to find the best hyperparameters for Random Forest.
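- Selected the best model based on the mean cross-validated R² score.
- Note: the best cross-validated R² in the recorded run is negative (about -1.93), meaning that, averaged over the folds, even the best configuration scored worse than always predicting the mean; treat the selected hyperparameters with caution.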


In [27]:
# The grid search was built with GridSearchCV's default refit=True, so
# `best_model` has already been refitted on the whole training partition with
# the winning hyperparameters. Fitting it again on the same data with the same
# seed would only repeat that work, so no second fit is done here.

# Predict the held-out rows from their numeric features.
y_pred = best_model.predict(X_test.select_dtypes(include=[np.number]))

# Test-set error metrics: RMSE is the square root of the mean squared error.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
r2 = metrics.r2_score(y_test, y_pred)

print("Random Forest Regression")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

Random Forest Regression
RMSE: 0.3835
R² Score: 0.1924


### Feature Selection Report: Final Model Evaluation
- Evaluated the best model on the test set using RMSE and R² metrics.
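- This provides a final assessment of model performance before deployment; in the recorded run the tuned model reaches R² = 0.1924 (RMSE 0.3835) on the test set, so it explains only a small share of the variance in `Liquidity` and should be improved before it is deployed.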
