---
### Install Behind Proxy

- 1. pip install --proxy http://user:password@proxyserver:port TwitterApi

- 2. pip install --proxy http://<user>:<password>@proxy.kpc.co.id:8080 'pkg_name'

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def file_path(question, ext):
    """
    Prompt the user until they enter the path of an existing regular file
    whose extension is allowed, then return that path.
    ----
    Parameters :
    1. question : String -> Prompt text passed to input()
    2. ext : String or iterable of Strings -> Allowed file suffix(es), e.g. '.csv'
    ----
    Returns :
    pathlib.Path pointing at the accepted file

    KeyboardInterrupt and EOFError raised by input() are not caught, so the
    prompt loop can always be aborted.
    """
    # Treat a bare string as ONE suffix. Using `in` directly on a string is a
    # substring test, so '' (no extension) and '.c' would match '.csv'.
    allowed = (ext,) if isinstance(ext, str) else tuple(ext)
    while True:
        file = Path(input(question).strip())
        try:
            # is_file() must be called; the bare attribute is always truthy.
            # It is False for directories and for paths that do not exist.
            suitable = file.is_file() and file.suffix in allowed
        except (OSError, ValueError):
            # Malformed or inaccessible path: treat it as a rejected answer.
            suitable = False
        if suitable:
            return file
        print('File not suitable, please retry')

In [3]:
csv = file_path('Masukkan File CSV  : ','.csv')
dataset = pd.read_csv(csv)


Masukkan File CSV  :  SGTData_Pandas31012025B.csv


In [4]:
dataset.head()

Unnamed: 0,Month,OB (Bcm),Coal (Ton),Distance OB (KM),Distance Coal (KM),Fuel
0,Jan-17,35058035.7,4205899.604,3.035133,9.153033,51331446.0
1,Feb-17,32749915.93,3738394.116,2.954104,9.865582,50819764.44
2,Mar-17,33460435.31,4277640.663,2.92029,9.388769,55697679.0
3,Apr-17,36270798.22,4595871.334,2.883726,10.0631,54963784.0
4,May-17,39840063.5,4517361.051,2.950965,10.078527,57712934.0


In [5]:
dataset.describe().round(2)

Unnamed: 0,OB (Bcm),Coal (Ton),Distance OB (KM),Distance Coal (KM),Fuel
count,108.0,108.0,108.0,108.0,108.0
mean,35727858.02,4578806.95,3.57,12.34,53136399.88
std,3584977.22,1879226.98,0.51,2.12,19799567.69
min,26566588.35,2325464.39,2.78,9.15,0.01
25%,33158212.83,3765946.64,3.08,10.4,53593200.57
50%,35541013.17,4093750.6,3.49,11.45,57202040.14
75%,38272121.17,4519362.25,4.07,14.6,63242655.5
max,47597123.41,10522012.01,4.61,15.82,79881538.0


In [6]:
dataset = dataset.drop(['Month'], axis=1)

In [7]:
len(dataset)

108

In [8]:
len(dataset.columns)

5

In [9]:
dataset.dtypes

OB (Bcm)              float64
Coal (Ton)            float64
Distance OB (KM)      float64
Distance Coal (KM)    float64
Fuel                  float64
dtype: object

---
### Feature selection - Fuel is the target and the other columns are the features

In [10]:
target = dataset.iloc[:, 4].name
target

'Fuel'

In [11]:
features = dataset.iloc[:,:-1].columns.tolist()
features

['OB (Bcm)', 'Coal (Ton)', 'Distance OB (KM)', 'Distance Coal (KM)']

---
### Correlations of features with target variable

In [12]:
correlations = dataset.corr()
correlations['Fuel']

OB (Bcm)              0.402906
Coal (Ton)           -0.908796
Distance OB (KM)     -0.290213
Distance Coal (KM)   -0.285109
Fuel                  1.000000
Name: Fuel, dtype: float64

Using a filter method, we are going to select which features are to be used subsequently. Although this is arbitrary and contestable, we will remove all features with an absolute correlation with the target variable which is smaller than 0.5.

In [None]:
cor_target = abs(correlations['Fuel'])
removed_features = cor_target[cor_target < 0.5]
removed_features

In [None]:
# NOTE(review): the filter above selects features whose absolute correlation
# with Fuel is below 0.5, but the correlation output shows 'Coal (Ton)' at
# about -0.91 (the strongest one). Dropping it contradicts the stated rule;
# confirm this is intended. Later cells index the target as column 3, so they
# depend on exactly this column being removed.
dataset = dataset.drop(['Coal (Ton)'], axis = 1)


In [None]:
dataset

In [None]:
# Plot Pearson correlation matrix
fig_1 = plt.figure(figsize=(14, 10))
new_correlations = dataset.corr()
sns.heatmap(new_correlations, annot=True, cmap='Greens', annot_kws={'size': 12})
plt.title('Pearson Correlation Matrix')
plt.show()

From the above figure, we can see that some features are highly correlated with each other. We will -arbitrarily- search for correlations above 0.5 and remove the features with the lowest correlation with the target variable (Fuel).

In [None]:
# Determine the highest intercorrelations
highly_correlated_features = new_correlations[new_correlations > 0.5]
highly_correlated_features.fillna('-')

In [None]:
features = dataset.iloc[:, :-1].columns.tolist()
features

In [None]:
len_of_features = len(features)
len_of_features

In [None]:
target = dataset.iloc[:,3].name
target

In [None]:
dataset

---
### Multivariate linear regression

In this section, we are going to implement our linear regression algorithm. At the very beginning, we are going to normalize the features so as to ensure an efficient convergence of the model.

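Note: the assignment `dataset.iloc[:, :-1] = (dataset - dataset.mean())/dataset.std()` fails with a dimension mismatch, because the right-hand side covers every column (including the target) while the left-hand side covers only the feature columns. The cell below therefore normalizes `dataset.iloc[:, :-1]` on both sides.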

In [None]:
dataset.iloc[:, :-1] = (dataset.iloc[:, :-1] - dataset.iloc[:, :-1].mean()) / dataset.iloc[:, :-1].std()

dataset


In [None]:
# Create X, y and theta
X = dataset.iloc[:, :-1]
ones = np.ones([len(dataset), 1])
X = np.concatenate((ones, X), axis=1)

#y = dataset.iloc[:, 3].values
y = dataset.iloc[:, 3].values.reshape(-1, 1)  # Ensure y is a column vector
len_of_features = X.shape[1] - 1
theta = np.zeros([1, len_of_features + 1])

In [None]:
# Store target
target = y

# Display the size of the matrices
X.shape, y.shape, theta.shape

# Display the size of the matrices
#print("X shape:", X.shape)
#print("y shape:", y.shape)
#print("theta shape:", theta.shape)

In [None]:
# Define compute Fuel function
def compute_fuel(X, y, theta):
    """
    Return the halved mean squared error of the linear model X @ theta.T
    against y, i.e. sum((X @ theta.T - y) ** 2) / (2 * len(X)).
    ----
    Parameters :
    1. X : array -> design matrix, one row per observation
    2. y : array -> observed target values
    3. theta : array -> model coefficients, stored as a row (transposed before use)
    """
    residuals = X @ theta.T - y
    squared_error_total = np.sum(np.power(residuals, 2))
    return squared_error_total / (2 * len(X))

In [None]:
# Set iterations and alpha (learning rate)
alpha = 0.01
iterations = 500

In [None]:
# Define gradientdescent function
def gradientdescent(X, y, theta, iterations, alpha):
    """
    Run batch gradient descent for a fixed number of steps.
    ----
    Parameters :
    1. X : array -> design matrix, one row per observation
    2. y : array -> observed target values
    3. theta : array -> starting coefficients as a row; the caller's array is not modified
    4. iterations : int -> number of update steps
    5. alpha : float -> learning rate
    ----
    Returns :
    (theta, cost) -> the updated coefficients and an array holding the value
    of compute_fuel after each step
    """
    step_scale = alpha / len(X)
    cost = np.zeros(iterations)
    for step in range(iterations):
        residuals = X @ theta.T - y
        # The update is computed as a column, then transposed back to theta's row layout.
        update = step_scale * (X.T @ residuals)
        theta = theta - update.T
        cost[step] = compute_fuel(X, y, theta)
    return theta, cost

In [None]:
# Do Gradient Descent and display final theta
final_theta, cost = gradientdescent(X, y, theta, iterations, alpha)

# Display the final parameters and cost.
# gradientdescent rebinds its local theta and returns the result, so the
# module-level `theta` still holds the initial zeros; report `final_theta`.
print("Optimized theta:", final_theta.round(2))
print("Cost over iterations:", cost)

In [None]:
# Compute and display final cost
final_fuel = compute_fuel(X, y, final_theta)
final_fuel.round(2)

In [None]:
# Plot Iterations vs. Fuel figure
fig_2, ax = plt.subplots(figsize=(10, 8))
ax.plot(np.arange(iterations), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Fuel')
ax.set_title('Iterations vs. Fuel')
plt.show()

Furthermore, we will define a function which computes the Root Mean Squared Error (RMSE) so as to measure the differences between the values predicted by our model and the observed values we had at the beginning of our analysis. RMSE is an absolute measure of fit and it is in the same units as our response variable, i.e. the Fuel in Liters.

In [None]:
# Define rmse function
def rmse(target, final_theta, design_matrix=None):
    """
    Return the Root Mean Squared Error between the model predictions and target.
    ----
    Parameters :
    1. target : array -> observed values; only column 0 is used
    2. final_theta : array -> fitted coefficients as a row (transposed before use)
    3. design_matrix : array, optional -> matrix to predict from. When omitted,
       the notebook-level X is used, which keeps existing two-argument calls working.
    """
    if design_matrix is None:
        # Backward-compatible fallback to the module-level design matrix.
        design_matrix = X
    predictions = design_matrix @ final_theta.T
    return np.sqrt(((predictions[:, 0] - target[:, 0]) ** 2).mean())

# Compute and display Root Mean Squared Error
rmse_val = rmse(target, final_theta)
rmse_val.round(2)

In [None]:
# Display sample prediction for first observation
predictions = X @ final_theta.T
str(predictions[0].round(2))

In [None]:
str(predictions[1].round(2))

In [None]:
str(predictions[2].round(2))

In [None]:
str(predictions[3].round(2))

### Test The Prediction Model
- Open New Dataset to test Prediction Model
- Data is Production Plan 2025

In [None]:
csv = file_path('Masukkan File CSV  : ','.csv')
new_dataset = pd.read_csv(csv)

In [None]:
new_dataset.head()


In [None]:
new_dataset.describe().round(2)

In [None]:
new_dataset.dtypes

In [None]:
new_dataset = new_dataset.drop(['Month','Coal (Ton)'], axis=1)
new_dataset

print("Original dataset columns:", dataset.columns)
print("New dataset columns:", new_dataset.columns)

In [None]:
# Calculate the mean and standard deviation of the original training data
# NOTE(review): an earlier cell overwrites dataset.iloc[:, :-1] with its
# standardised values, so if the cells ran in order these are the statistics
# of the already-normalised features, not of the raw training data. The raw
# mean/std would have to be saved before that normalisation step — confirm.

mean = dataset.iloc[:, :-1].mean().values
std = dataset.iloc[:, :-1].std().values

# Check the standard deviation for the 'OB (Bcm)' column
# (looks up the column's position in `dataset` and reuses it as an index into `std`)
ob_bcm_std = std[dataset.columns.get_loc('OB (Bcm)')]

# Normalize the features of the new dataset
# NOTE(review): iloc[:, :-1] leaves new_dataset's last column untouched, and
# the subtraction relies on new_dataset having the same feature columns, in
# the same order, as `dataset` — verify against the printed column lists.
new_dataset.iloc[:, :-1] = (new_dataset.iloc[:, :-1] - mean) / std

# Earlier variant of the same normalisation using pandas Series instead of arrays:
#new_dataset.iloc[:, :-1] = (new_dataset.iloc[:, :-1] - dataset.iloc[:, :-1].mean()) / dataset.iloc[:, :-1].std()
print("Original dataset mean:", mean)
print("original dataset std:", std)
print("Standard Dev OB Bcm:", ob_bcm_std)

In [None]:
# Check the normalized values
print("Normalized 'OB (Bcm)' column:")
print(new_dataset['OB (Bcm)'])

In [None]:
# Add the intercept term
ones = np.ones([new_dataset.shape[0], 1])
new_dataset = np.concatenate((ones, new_dataset), axis=1)

In [None]:
# Define the predict function
def predict(X, theta):
    """
    Return the linear-model predictions X @ theta.T.
    ----
    Parameters :
    1. X : array -> design matrix, one row per observation
    2. theta : array -> coefficients stored as a row (transposed before use)
    """
    coefficient_column = theta.T
    return X @ coefficient_column

# Make predictions for the new dataset
new_predictions = predict(new_dataset, final_theta)