In [1]:
#import library
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import tensorflow as tf
import matplotlib.pyplot as plt 

In [2]:
#import data
df = pd.read_csv('datamobil.csv',sep = ';',encoding = 'UTF-8')
df.head()

Unnamed: 0.1,Unnamed: 0,horsepower,price
0,0,111.0,13495.0
1,1,111.0,16500.0
2,2,154.0,16500.0
3,3,102.0,13950.0
4,4,115.0,17450.0


In [3]:
#drop 'Unnamed: 0' columns
df = df.drop(columns = ['Unnamed: 0'])

In [4]:
#copying the data into the new variable df1
df1 = df.copy()

In [5]:
#variable "rata_harga" used for grouping the mean of the price 
rata_harga = df1.groupby('horsepower')['price'].mean().reset_index()
#rename the "price" columns into "rata harga"
rata_harga = rata_harga.rename(columns={'price': 'rata harga'})
#merge the dataset with rata_harga
df1 = df1.merge(rata_harga, on='horsepower', how='left')
#drop the "price" column
df1 = df1.drop(columns = ['price'])

In [6]:
#split into train data and test data
x = np.array(df1['horsepower']).reshape(-1,1)
y = np.array(df1['rata harga']).reshape(-1,1)

In [7]:
#initialize the linear regression to variable "lr"
lr = LinearRegression()

In [8]:
#initialize the Standard Scaler to variable "scaler"
scaler = StandardScaler()

In [9]:
#find the sum of duplicated data
df1.duplicated().sum()

142

In [10]:
#drop the value of duplicates 
df1 = df1.drop_duplicates()

In [11]:
#scalling the data
x_fit = scaler.fit_transform(x)
y_fit = scaler.fit_transform(y)

In [12]:
#fitting the data into the model
model = lr.fit(x_fit, y_fit)
a = model.coef_[0]
b = model.intercept_

In [13]:
print('a = {}'.format(a))
print('b = {}'.format(b))

a = [0.83125408]
b = [-1.36585858e-16]


In [14]:
#predict the data
predicted = model.predict(x_fit)

In [15]:
print(predicted[:5])

[[ 0.170811  ]
 [ 0.170811  ]
 [ 1.12717467]
 [-0.02935814]
 [ 0.25977506]]


In [16]:
#importing the scikit learn metrics for the losses and estimation
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [17]:
# calculating mse 
mse = mean_squared_error(y_fit,predicted)
# calculating RMSE
rmse = np.sqrt(mse)

# calculating R2
r2 = r2_score(y_fit, predicted)

# calculating mae 
mae = mean_absolute_error(y_fit, predicted)

print('mae =',round(mae,2))
print('mse =',round(mse,2))
print('rmse =',np.round(rmse,2))
print('r2 score =',np.round(r2,2))

mae = 0.36
mse = 0.31
rmse = 0.56
r2 score = 0.69


In [18]:
#initialize the normalization in tensorflow
normalizer = tf.keras.layers.Normalization(axis = -1)

In [19]:
#normalization data 
normalizer.adapt(np.array(df1).reshape(-1,1))

In [20]:
print(normalizer.mean.numpy())

[[7791.2285]]


In [21]:
first = df1[:1]

with np.printoptions(precision=2, suppress=True):
    print('First example:', first)
    print()
    print('Normalized:', normalizer(first).numpy())

First example:    horsepower  rata harga
0       111.0     13237.0

Normalized: [[-0.77  0.55]]


In [22]:
#made model
tensorflow_model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=1, input_shape = [1])
])

#model summary
tensorflow_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 2         
                                                                 
Total params: 2
Trainable params: 2
Non-trainable params: 0
_________________________________________________________________


In [23]:
#decide the optimizer and loss method
tensorflow_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    loss='mean_absolute_error', metrics = ['mae'])

In [24]:
#fitting the data into the model
tensorflow_model.fit(
    x_fit,
    y_fit,
    epochs=10,
    # Suppress logging.
    verbose='auto',
    # Calculate validation results on 20% of the training data.
    validation_data = None)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c62272e110>

as we can see that in the mean absolute error/mae for scikit-learn is 0.36 and in tensorflow 0.36, 
means that linear regresion in tensorflow and scikit-learn is same.
perhaps we can add some layers or change the losses and optimizer method. 
and add some parameter for more accuracy