In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [17]:
df=pd.read_csv(r'/content/data_with_liquidity.csv')

In [18]:
df

Unnamed: 0,date,coin,symbol,price,1h,24h,7d,24h_volume,mkt_cap,liquidity
0,2022-03-16,Mina Protocol,MINA,1.810000,0.015,0.105,-0.004,70360743.00,756143474.0,0.093052
1,2022-03-16,Holo,HOT,0.004148,0.027,0.027,0.028,63199499.00,734353201.0,0.086061
2,2022-03-16,Compound,COMP,109.580000,0.026,0.045,0.052,40756551.00,726097698.0,0.056131
3,2022-03-16,Bitkub Coin,KUB,7.890000,0.004,0.000,-0.034,2193655.00,698922310.0,0.003139
4,2022-03-16,Neutrino USD,USDN,1.000000,0.003,-0.001,0.007,15319422.00,687954763.0,0.022268
...,...,...,...,...,...,...,...,...,...,...
666,2022-03-17,IRISnet,IRIS,0.055426,0.016,-0.003,-0.088,2976839.00,68090240.0,0.043719
667,2022-03-17,Circuits of Value,COVAL,0.037961,0.002,-0.012,-0.054,366787.00,67826274.0,0.005408
668,2022-03-17,ARPA Chain,ARPA,0.069003,-0.000,0.008,-0.037,13633759.00,67762845.0,0.201198
669,2022-03-17,SuperRare,RARE,0.464613,-0.003,0.014,0.019,9398219.00,67388220.0,0.139464


In [19]:
# first see the highest liquidity
df['liquidity'].max()

0.870052271470561

In [20]:
# lowest liquidity
df['liquidity'].min()

0.0

In [21]:
## liquidity ranges from 0 to 1

### removing unnecesssary columns

In [22]:
# removing coin symbol date
df.drop(['coin','symbol','date'],axis=1,inplace=True)

In [23]:
df

Unnamed: 0,price,1h,24h,7d,24h_volume,mkt_cap,liquidity
0,1.810000,0.015,0.105,-0.004,70360743.00,756143474.0,0.093052
1,0.004148,0.027,0.027,0.028,63199499.00,734353201.0,0.086061
2,109.580000,0.026,0.045,0.052,40756551.00,726097698.0,0.056131
3,7.890000,0.004,0.000,-0.034,2193655.00,698922310.0,0.003139
4,1.000000,0.003,-0.001,0.007,15319422.00,687954763.0,0.022268
...,...,...,...,...,...,...,...
666,0.055426,0.016,-0.003,-0.088,2976839.00,68090240.0,0.043719
667,0.037961,0.002,-0.012,-0.054,366787.00,67826274.0,0.005408
668,0.069003,-0.000,0.008,-0.037,13633759.00,67762845.0,0.201198
669,0.464613,-0.003,0.014,0.019,9398219.00,67388220.0,0.139464


In [81]:
high_liquidity_rows = df[df['liquidity_level'] == 1]

In [82]:
high_liquidity_rows

Unnamed: 0,price,1h,24h,7d,24h_volume,mkt_cap,liquidity,liquidity_level
263,0.020986,0.048,0.056,0.009,58902495.0,92248152.0,0.638522,1
277,0.769839,0.005,0.044,-0.032,72974143.0,83873286.0,0.870052,1
309,0.007556,0.011,-0.054,0.052,58152200.0,70736052.0,0.822101,1
554,39.63,0.002,0.044,-0.01,70589902.0,124043364.0,0.569074,1
611,0.020213,0.008,-0.03,-0.113,55707710.0,90445041.0,0.615929,1
619,1.11,-0.011,0.009,0.089,56512453.0,85908280.0,0.657823,1


In [32]:
## adding a column to check high liquidity and low liquidty if liquidity is more than 0.5 then high(1) if not then low(0) liquidity
df['liquidity_level']=np.where(df['liquidity']>=0.5,1,0)

In [33]:
df

Unnamed: 0,price,1h,24h,7d,24h_volume,mkt_cap,liquidity,liquidity_level
0,1.810000,0.015,0.105,-0.004,70360743.00,756143474.0,0.093052,0
1,0.004148,0.027,0.027,0.028,63199499.00,734353201.0,0.086061,0
2,109.580000,0.026,0.045,0.052,40756551.00,726097698.0,0.056131,0
3,7.890000,0.004,0.000,-0.034,2193655.00,698922310.0,0.003139,0
4,1.000000,0.003,-0.001,0.007,15319422.00,687954763.0,0.022268,0
...,...,...,...,...,...,...,...,...
666,0.055426,0.016,-0.003,-0.088,2976839.00,68090240.0,0.043719,0
667,0.037961,0.002,-0.012,-0.054,366787.00,67826274.0,0.005408,0
668,0.069003,-0.000,0.008,-0.037,13633759.00,67762845.0,0.201198,0
669,0.464613,-0.003,0.014,0.019,9398219.00,67388220.0,0.139464,0


## now we split features and target


In [57]:
X=df.drop(['liquidity','liquidity_level'],axis=1)
y=df['liquidity']

## train test and split

In [58]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

## Feature scaling

## Normalization of data generally involves scaling the data.

## Normalization is a process that transforms data to a specific range or distribution.
## Scaling is a common technique used within the normalization process to achieve this transformation.

## while normalization often involves scaling, it's not only scaling. Other techniques might also be used in normalization, although scaling is the most frequent and widely understood method.

In [59]:
# scaling data

In [60]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [73]:
import pickle

# Save the scaler
with open('liquidity_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [61]:
## checking my scaled data
X_test,X_train

(array([[-2.32593216e-01, -1.05123699e+00, -1.10346485e-01,
         -9.94857416e-01,  2.13991283e+00,  1.80887745e+00],
        [-3.55072616e-01,  2.75292778e-01, -2.17607895e-02,
          2.50683103e-01, -6.25812161e-01, -3.66335881e-01],
        [-3.23219383e-01, -7.39112340e-01, -4.35160700e-01,
          1.39968390e-01, -5.72297781e-01, -2.13073438e-01],
        [-3.09017257e-01, -7.39112340e-01, -5.23746395e-01,
          1.12289712e-01, -7.06526665e-01, -9.13195865e-01],
        [-2.75933011e-01,  8.21510919e-01, -3.17046440e-01,
         -1.46539494e+00, -4.33086281e-01, -8.43388369e-01],
        [-3.51599202e-01, -5.83050014e-01, -4.94217830e-01,
         -5.65837904e-01, -4.95286216e-01,  1.80597582e+00],
        [-3.61340604e-01,  1.13363557e+00, -5.82803525e-01,
          1.12289712e-01, -5.06416442e-01, -9.41386260e-01],
        [-3.61658779e-01, -4.26987688e-01, -7.89503480e-01,
         -1.72834239e+00, -3.75698211e-01, -5.54324631e-01],
        [-3.63154350e-01, -7.391

## using random forest algorithm

Benefits of Random Forest Regressor

Robustness to Overfitting: Random Forests are less prone to overfitting compared to single decision trees due to the ensemble approach and feature randomness. This is crucial for generalization to unseen data and achieving better predictive performance on your test set (X_test, y_test) in your Colab notebook.

Handles Non-linearity: Random Forests can capture complex, non-linear relationships between features and the target variable (liquidity in your case). This is important if the relationship between your features (price, volume, etc.) and liquidity is not a simple linear one.

In [62]:
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor()
model.fit(X_train,y_train)

In [63]:
y_pred=model.predict(X_test)

In [64]:
## checking accuracy
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9647826930831125

In [65]:
## checking training accuracy
y_train_pred=model.predict(X_train)
r2_score(y_train,y_train_pred)

0.9933430218414128

Addressing Potential Overfitting

Regularization: Consider adding regularization to your model, techniques like L1 or L2 regularization, or using a different model entirely that is less prone to overfitting to  training data.

Hyperparameter Tuning: Experiment with different hyperparameters (e.g. n_estimators, max_depth) of the RandomForestRegressor to find the optimal balance between training and testing performance.

Cross-Validation: Implement cross-validation to get a more robust estimate of my model's performance on unseen data.

In [66]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           scoring='r2',
                           cv=5,
                           verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best R-squared:", grid_search.best_score_)

# Evaluate the model with the best parameters on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test R-squared:", r2_score(y_test, y_pred))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=300; total time=   0.8s
[CV] END max_depth=None, 

In [70]:
# Predict on training data using the best model from GridSearchCV
y_train_pred = best_model.predict(X_train)

# Calculate R-squared for training data
train_r2 = r2_score(y_train, y_train_pred)
print("Training R-squared (GridSearchCV):", train_r2)

Training R-squared (GridSearchCV): 0.994696479004589


In [71]:
import pickle

with open('tuned_liquidity_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [54]:
importances = best_model.feature_importances_

# Create a DataFrame
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})

# Sort by importance
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# Display the DataFrame
print(feature_importances)

      feature  importance
4  24h_volume    0.654068
5     mkt_cap    0.328671
1          1h    0.005114
0       price    0.004528
2         24h    0.003989
3          7d    0.003630


## i am using only 4 columns as input     

### selected_features = ['24h_volume', 'mkt_cap', '1h', 'price']

In [72]:
import pickle

# Load the model
with open('tuned_liquidity_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [84]:
import pickle
import pandas as pd
import numpy as np
with open('liquidity_scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

def predict_liquidity(volume, mkt_cap, h1, price):
    """Predicts liquidity based on user input features.

    Args:
        volume: 24h trading volume.
        mkt_cap: Market capitalization.
        h1: 1h price change.
        price: Current price.

    Returns:
        The predicted liquidity value.
    """
    # Use the correct feature names that the model and scaler were trained on
    features = ['24h_volume', 'mkt_cap', '1h', 'price']
    user_input = [volume, mkt_cap, h1, price]

    # Create the input DataFrame with the correct column names
    input_df = pd.DataFrame([user_input], columns=features)

    # Get the columns that the scaler was trained with
    scaler_columns = scaler.feature_names_in_

    # Select only those columns and ensure they are in the correct order
    input_df_scaled = input_df[[col for col in scaler_columns if col in input_df.columns]]

    # Reorder the columns to match the order the scaler was trained on
    input_df_scaled = input_df_scaled.reindex(columns=scaler_columns, fill_value=0)

    scaled_input = scaler.transform(input_df_scaled)
    prediction = loaded_model.predict(scaled_input)[0]
    liquidity_level = "High" if prediction >= 0.5 else "Low"

    return prediction, liquidity_level

# Get user input and predict
volume = float(input("Enter the 24h volume: "))
mkt_cap = float(input("Enter the market cap: "))
h1 = float(input("Enter the 1h price change: "))
price = float(input("Enter the current price: "))

predicted_liquidity, liquidity_level = predict_liquidity(volume, mkt_cap, h1, price)
print(f"Predicted liquidity: {predicted_liquidity}")
print(f"Liquidity Level: {liquidity_level}")

Enter the 24h volume: 65467843
Enter the market cap: 98235243
Enter the 1h price change: 0.03
Enter the current price: 39.07
Predicted liquidity: 0.6357783042317445
Liquidity Level: High
