In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
import pickle

# Train a linear regression that predicts 'Total Amount' from customer and
# purchase attributes, then persist the fitted model and encoder to disk.

# Load data
data = pd.read_csv('/content/drive/MyDrive/retail_sales_dataset.csv')

# Show the first few rows to sanity-check what was loaded
print(data.head())

# Features and target
categorical_columns = ['Gender', 'Product Category']
X = data[['Gender', 'Age', 'Product Category', 'Quantity', 'Price per Unit']]
y = data['Total Amount']

# One-hot encode the categorical features (first level of each is dropped).
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back only when this installation does not know it.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = encoder.fit_transform(X[categorical_columns])

# Convert the encoded array to a DataFrame and join it with the numeric
# features. Reusing X's index keeps the rows aligned: pd.concat(axis=1)
# matches on index labels, so a fresh 0..n-1 index would misalign (and
# NaN-pad) the result whenever the input index is not the default range.
X_encoded_df = pd.DataFrame(
    X_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X.index,
)
X_final = pd.concat([X.drop(categorical_columns, axis=1), X_encoded_df], axis=1)

# Split into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Initialise and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Save the model and the encoder; inference must reuse this fitted encoder
# so that new rows are encoded into the same columns.
# NOTE: pickle files should only ever be loaded from trusted sources.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

   Transaction ID        Date Customer ID  Gender  Age Product Category  \
0               1  2023-11-24     CUST001    Male   34           Beauty   
1               2  2023-02-27     CUST002  Female   26         Clothing   
2               3  2023-01-13     CUST003    Male   50      Electronics   
3               4  2023-05-21     CUST004    Male   37         Clothing   
4               5  2023-05-06     CUST005    Male   30           Beauty   

   Quantity  Price per Unit  Total Amount  
0         3              50           150  
1         2             500          1000  
2         1              30            30  
3         1             500           500  
4         2              50           100  
Mean Squared Error: 41883.99204291284


