# Create a model that predicts the top 10 stock code for dividend investing

- Using boosted tree regression to test the accuracy and compare it with another algorithm identified by pycaret


In [1]:
# Import libraries
import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from urllib.parse import quote 
from datetime import timedelta

In [2]:
# Connect to database
conn_string = 'postgresql://postgres:Jiawei1105@localhost/dividendinvesting'

#connect
db = create_engine(conn_string)
conn = db.connect()
conn = psycopg2.connect(conn_string)
conn.autocommit = True
cursor = conn.cursor()

### Create a table, 'train_test_tab' in database where it will contains data starting from Year 2018 to 2021 for each stock code and it will be used to split it into training and test data

In [3]:
# Using sql query to get recent date
sql2 ='''SELECT "sum_stockCode", sum_year, sum_status, "sum_dividend(RM)", "sum_averageDividend_y_%", "sum_averageAnnual_dp(%)", "sum_averageQuar_eps", "sum_averageQuar_roe(%)", "final_averageOverall_Score"
FROM train_test_tab;;'''
cursor.execute(sql2)
train_test_table = cursor.fetchall()
df = pd.DataFrame(train_test_table, columns =["sum_stockCode", 'sum_year', 'sum_status', "sum_dividend(RM)", "sum_averageDividend_y_%", "sum_averageAnnual_dp(%)", "sum_averageQuar_eps", "sum_averageQuar_roe(%)", "final_averageOverall_Score"])
conn.commit()
conn.close()

In [4]:
df

Unnamed: 0,sum_stockCode,sum_year,sum_status,sum_dividend(RM),sum_averageDividend_y_%,sum_averageAnnual_dp(%),sum_averageQuar_eps,sum_averageQuar_roe(%),final_averageOverall_Score
0,0001,2020,Y,0.0150,0.971,44.0,0.034,10.1,8.0
1,0001,2021,Y,0.0150,1.343,44.0,0.034,8.7,9.0
2,0002,2018,Y,0.0500,3.396,0.0,0.118,10.5,6.0
3,0002,2019,Y,0.0740,4.306,0.0,0.161,13.7,6.0
4,0002,2020,Y,0.0900,3.847,0.0,0.208,16.5,7.0
...,...,...,...,...,...,...,...,...,...
3202,9962,2018,Y,0.0050,1.558,31.0,0.016,1.8,9.0
3203,9962,2019,Y,0.0050,2.058,16.0,0.030,3.2,7.0
3204,9962,2020,Y,0.0050,2.101,19.0,0.026,2.6,7.0
3205,9962,2021,Y,0.0100,3.362,30.0,0.033,3.4,10.0


# Modeling with Boosted Trees

In [5]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [6]:
# Split your data into training and testing sets (70% as training and 30% as test data)
X_train, X_test, y_train, y_test = train_test_split(df.drop(["sum_stockCode", 'sum_year', 'sum_status', "sum_dividend(RM)",'final_averageOverall_Score'], axis=1),
                                                    df['final_averageOverall_Score'],
                                                    test_size=0.3,
                                                    random_state=42)

In [7]:
# Train a gradient boosting regressor with default hyperparameters
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

In [8]:
# Make predictions on the test set
y_pred = gbr.predict(X_test)

In [9]:
# Compute the mean squared error (MSE) and R-squared on the test set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [10]:
print(f"Mean squared error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean squared error: 0.07
R-squared: 0.99


In [11]:
# Connect to database
conn_string = 'postgresql://postgres:Jiawei1105@localhost/dividendinvesting'

# Connect
db = create_engine(conn_string)
conn = db.connect()
conn = psycopg2.connect(conn_string)
conn.autocommit = True
cursor = conn.cursor()

### Create a table, 'pred_tab', in the database where it will contains data starting from Year 2022 to current for each stock code and it will be used for prediction

In [12]:
# Using sql query to get recent date
sql3 ='''SELECT "sum_stockCode", sum_year, sum_status, "sum_dividend(RM)", "sum_averageDividend_y_%", "sum_averageAnnual_dp(%)", "sum_averageQuar_eps", "sum_averageQuar_roe(%)", "final_averageOverall_Score"
FROM public.pred_tab;'''
cursor.execute(sql3)
pred_tab = cursor.fetchall()
df2 = pd.DataFrame(pred_tab, columns =["sum_stockCode", 'sum_year', 'sum_status', "sum_dividend(RM)", "sum_averageDividend_y_%", "sum_averageAnnual_dp(%)", "sum_averageQuar_eps", "sum_averageQuar_roe(%)", "final_averageOverall_Score"])
conn.commit()
conn.close()

In [13]:
df2

Unnamed: 0,sum_stockCode,sum_year,sum_status,sum_dividend(RM),sum_averageDividend_y_%,sum_averageAnnual_dp(%),sum_averageQuar_eps,sum_averageQuar_roe(%),final_averageOverall_Score
0,0001,2022,Y,0.0050,0.282,57.0,0.035,8.3,7.0
1,0002,2022,Y,0.2550,5.418,23.0,0.420,27.0,13.0
2,0005,2022,N,0.0000,0.000,0.0,0.000,0.0,2.0
3,0006,2022,N,0.0000,0.000,0.0,0.000,0.0,2.0
4,0007,2022,N,0.0000,0.000,0.0,0.000,0.0,2.0
...,...,...,...,...,...,...,...,...,...
743,9881,2022,Y,0.0150,2.727,33.0,0.045,3.3,9.0
744,9938,2022,N,0.0000,0.000,0.0,0.000,0.0,2.0
745,9954,2022,Y,0.0060,1.714,44.0,0.014,9.1,9.0
746,9962,2022,Y,0.0050,1.754,21.0,0.024,2.4,9.0


In [14]:
df4=pd.DataFrame(df2['sum_stockCode'])

In [15]:
df4

Unnamed: 0,sum_stockCode
0,0001
1,0002
2,0005
3,0006
4,0007
...,...
743,9881
744,9938
745,9954
746,9962


In [16]:
# Define all the predictor variables
x = df2[["sum_averageDividend_y_%", "sum_averageAnnual_dp(%)", "sum_averageQuar_eps", "sum_averageQuar_roe(%)"]]

In [17]:
y_pred = gbr.predict(x)

In [18]:
df4['score']=y_pred

In [19]:
df4

Unnamed: 0,sum_stockCode,score
0,0001,6.882681
1,0002,12.639580
2,0005,1.936232
3,0006,1.936232
4,0007,1.936232
...,...,...
743,9881,8.984306
744,9938,1.936232
745,9954,8.826096
746,9962,8.729320


In [20]:
sorted_df4 = df4.sort_values("score", ascending=False)

In [21]:
# Top 10 stock code 
sorted_df4.head(10)

Unnamed: 0,sum_stockCode,score
684,8044,16.34516
253,3255,14.353548
217,1929,13.758281
376,5139,13.732528
242,2836,13.469135
393,5168,13.407688
276,4006,13.222784
713,9059,13.045933
575,7106,13.045633
482,5819,12.963717


In [22]:
mse2 = mean_squared_error(df2['final_averageOverall_Score'],df4['score'])
r2 = r2_score(df2['final_averageOverall_Score'],df4['score'])
mae = mean_absolute_error(df2['final_averageOverall_Score'],df4['score'])
rmse = np.sqrt(mse2)

In [23]:
print(f"Mean squared error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")

Mean squared error: 0.07
R-squared: 1.00
Mean Absolute Error: 0.13
Root Mean Squared Error: 0.24
