In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import random
import math

from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.datasets import make_blobs
from sklearn.metrics import log_loss

In [87]:
pd.set_option('display.float_format', '{:.1f}'.format)

In [9]:
df = pd.read_csv("../datasets/economy_asia_countries.csv", index_col=0)

df.head(3)

Unnamed: 0,Country,Country Code,Year,Exports (% of GDP),FDI (% of GDP),GDP (USD),Imports (% of GDP),Inflation (%),Density (p/km2),Population,Unemployment (% of total labor force)
0,Brunei Darussalam,BRN,1960,0.0,0.0,0.0,0.0,0.0,16.994497,85346.0,0.0
1,Brunei Darussalam,BRN,1961,0.0,0.0,0.0,0.0,0.0,16.994497,89561.0,0.0
2,Brunei Darussalam,BRN,1962,0.0,0.0,0.0,0.0,0.0,17.805882,93837.0,0.0


In [16]:
df_pop = df[["Country", "Year", "Population"]]
df_pop.head(3)

Unnamed: 0,Country,Year,Population
0,Brunei Darussalam,1960,85346.0
1,Brunei Darussalam,1961,89561.0
2,Brunei Darussalam,1962,93837.0


In [10]:
def linear_regression_1(x,y,x_for_pred) :
    x=np.array(x).reshape((-1, 1))
    y=np.array(y)
    x_for_pred=np.array(x_for_pred).reshape((-1, 1))
    model = LinearRegression().fit(x, y)
    y_pred = model.predict(x_for_pred)

    y_pre_pred = model.predict(x)
    pred_error=math.sqrt(mean_squared_error(y,y_pre_pred))/math.sqrt((max(y_pre_pred)-min(y_pre_pred))**2 +(max(x)-min(x))**2)
    return y_pred, y_pre_pred, pred_error
def linear_regression_2(x,y,x_for_pred) :
    x=np.array(x)
    x_=PolynomialFeatures(degree=2, include_bias=False).fit_transform(x.reshape((-1, 1)))
    y=np.array(y)
    x_for_pred=np.array(x_for_pred).reshape((-1, 1))
    x_for_pred=PolynomialFeatures(degree=2, include_bias=False).fit_transform(x_for_pred)
    
    model = LinearRegression().fit(x_, y)
    
    y_pred = model.predict( x_for_pred)
    y_pre_pred = model.predict(x_)
    pred_error=math.sqrt(mean_squared_error(y,y_pre_pred))/math.sqrt((max(y_pre_pred)-min(y_pre_pred))**2 +(max(x)-min(x))**2)
    return y_pred, y_pre_pred, pred_error

In [43]:
y_pred, y_pre_pred, pred_error = linear_regression_1(df['Year'].unique(),
                                                     df[df['Country']=='Cambodia']['Population'],[2022,2025,2030,2035,2040,2050])

In [29]:
y_pred2, y_pre_pred2, pred_error2 = linear_regression_2(df['Year'].unique(),
                                                     df[df['Country']=='Thailand']['Population'],[2022,2025,2030,2035,2040,2050])

In [44]:
fig=px.scatter(x=df['Year'].unique(), y=df[df['Country']=='Cambodia']['Population']) 
fig.add_traces(go.Scatter(x=df['Year'].unique(), y= y_pre_pred, mode='lines', name='Regression Fit'))
fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Population",
    title={
        'text': "Kiểm tra bài toán Hồi quy đa thức bậc 1 cho dân số của Brunei Darussalam",
        'x' : 'Year',
        'y':0.95,
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [28]:
y_pred

array([5685599.28503436, 5900811.33368587, 6259498.08143839,
       6618184.82919091, 6976871.5769434 , 7694245.07244843])

In [33]:
fig=px.scatter(x=df['Year'].unique(), y=df[df['Country']=='Thailand']['Population']) 
fig.add_traces(go.Scatter(x=df['Year'].unique(), y= y_pre_pred2, mode='lines', name='Regression Fit'))
fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Population",
    title={
        'text': "Kiểm tra bài toán Hồi quy đa thức bậc 2 cho dân số của Thái Lan",
        'x' : 'Year',
        'y':0.95,
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [37]:
def gen_linear(df,funct,x_for_pred):
    x=df['Year'].unique()
    for i in df['Country'].unique():
    
        mean_pred_error=0
        pred_error=[]
        df_=df[df['Country']==i]
        dict_pred={}
        dict_pred['Year']=x_for_pred
        col_pred=['Population']
        error_linear=0
        for col in col_pred : 
            dict_pred[col], pre_pred, error_linear=funct(x,df_[col],x_for_pred)
            pred_error.append(error_linear)
        mean_pred_error=sum(pred_error)/len(pred_error)
        dict_pred['Country']=[i]*len(x_for_pred)
        dict_pred['LossFunctCheck']=[mean_pred_error]*len(x_for_pred)
        
        #Có những nước với dụ đoán GIẢM có nguy cơ diệt vong (0 dân)
        dict_pred['Population']=[i if i>0 else 1000 for i in dict_pred.pop('Population')]
        dict_pred = [dict(zip(dict_pred,t)) for t in zip(*dict_pred.values())]
        for row in dict_pred :
            yield row
LD1=[i for i in gen_linear(df,linear_regression_1,[2022,2025,2030,2035,2040,2050]) ]
LD1=pd.DataFrame(LD1)

In [38]:
LD1

Unnamed: 0,Year,Country,LossFunctCheck,Population
0,2022,Brunei Darussalam,0.018659,4.667663e+05
1,2025,Brunei Darussalam,0.018659,4.859109e+05
2,2030,Brunei Darussalam,0.018659,5.178185e+05
3,2035,Brunei Darussalam,0.018659,5.497262e+05
4,2040,Brunei Darussalam,0.018659,5.816339e+05
...,...,...,...,...
91,2025,Vietnam,0.021791,1.046666e+08
92,2030,Vietnam,0.021791,1.102659e+08
93,2035,Vietnam,0.021791,1.158651e+08
94,2040,Vietnam,0.021791,1.214644e+08


In [42]:
fig=px.scatter(LD1[LD1['Year']==2050], x="Country", y="LossFunctCheck",range_y=[0,0.2],
        color="Country")

fig.update_xaxes(visible=False)
fig.show()

In [66]:
LD=[i for i in gen_linear(df[df['Year']>=1990],linear_regression_1,[2022,2025,2030,2035,2040,2050]) ]
LD=pd.DataFrame(LD)
LD

Unnamed: 0,Year,Country,LossFunctCheck,Population
0,2022,Brunei Darussalam,0.027370,4.629987e+05
1,2025,Brunei Darussalam,0.027370,4.809812e+05
2,2030,Brunei Darussalam,0.027370,5.109522e+05
3,2035,Brunei Darussalam,0.027370,5.409231e+05
4,2040,Brunei Darussalam,0.027370,5.708940e+05
...,...,...,...,...
91,2025,Vietnam,0.020216,1.016808e+08
92,2030,Vietnam,0.020216,1.063854e+08
93,2035,Vietnam,0.020216,1.110899e+08
94,2040,Vietnam,0.020216,1.157944e+08


In [67]:
fig=px.scatter(LD[LD['Year']==2050], x="Country", y="LossFunctCheck",range_y=[0,0.5],
        color="Country")
fig.update_xaxes(visible=False)
fig.show()

In [68]:
LD2 = [i for i in gen_linear(df.loc[(df['Year']>=1990) & (df["Country"].isin(["Japan", "Korea, Rep.", "Singapore", "Thailand"]))]
                             ,linear_regression_2,[2022, 2025,2030,2035,2040,2050]) ]
LD2 = pd.DataFrame(LD2)

In [69]:
fig=px.scatter(LD2[LD2['Year']==2050], x="Country", y="LossFunctCheck",range_y=[0,0.5],
        color="Country")
fig.update_xaxes(visible=False)
fig.show()

In [80]:
LD = LD.drop(LD[LD["Country"].isin(["Japan", "Korea, Rep.", "Singapore", "Thailand"])].index)
LD

Unnamed: 0,Year,Country,LossFunctCheck,Population
0,2022,Brunei Darussalam,0.027370,4.629987e+05
1,2025,Brunei Darussalam,0.027370,4.809812e+05
2,2030,Brunei Darussalam,0.027370,5.109522e+05
3,2035,Brunei Darussalam,0.027370,5.409231e+05
4,2040,Brunei Darussalam,0.027370,5.708940e+05
...,...,...,...,...
91,2025,Vietnam,0.020216,1.016808e+08
92,2030,Vietnam,0.020216,1.063854e+08
93,2035,Vietnam,0.020216,1.110899e+08
94,2040,Vietnam,0.020216,1.157944e+08


In [81]:
LD = LD.append(LD2)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [91]:
LD.to_csv("../datasets/population_prediction.csv", index=False)