## Importing Libraries

In [525]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

## Loading Data to Pandas DataFrame

In [526]:
# Read the real-estate data set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='real-estate.csv')

In [527]:
# Plain-text preview of the first five rows.
print(df.head(n=5))

   No  X1 transaction date  X2 house age  \
0   1             2012.917          32.0   
1   2             2012.917          19.5   
2   3             2013.583          13.3   
3   4             2013.500          13.3   
4   5             2012.833           5.0   

   X3 distance to the nearest MRT station  X4 number of convenience stores  \
0                                84.87882                               10   
1                               306.59470                                9   
2                               561.98450                                5   
3                               561.98450                                5   
4                               390.56840                                5   

   X5 latitude  X6 longitude  Y house price of unit area  
0     24.98298     121.54024                        37.9  
1     24.98034     121.53951                        42.2  
2     24.98746     121.54391                        47.3  
3     24.98746     121.54391  

## Dropping Unnecessary Columns

In [528]:
# The row counter and the transaction date are not used as model features.
df = df.drop(['No', 'X1 transaction date'], axis=1)

## Splitting Data to Train and Test

In [529]:
# 80/20 shuffled split with a fixed seed so the partition is reproducible.
train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=20,
)

## Visualizing Data Distribution and Correlation

In [530]:
# Pairwise scatter plots / histograms over every remaining column.
sns.pairplot(df)
plt.show()


## Splitting Features and Targets

In [531]:
# The last column is the target (house price of unit area); everything before it is a feature.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

In [532]:
# n   = number of training rows
# d   = number of feature columns
# n_t = number of test rows
n, d = X_train.shape
n_t = X_test.shape[0]

In [533]:
# Order of the printed values: feature count, training rows, test rows.
print(f"{d} {n} {n_t}")

5 331 83


In [534]:
# Switch from pandas objects to plain ndarrays for the linear-algebra code below.
X_train, y_train = X_train.to_numpy(), y_train.to_numpy()
X_test, y_test = X_test.to_numpy(), y_test.to_numpy()

# Untouched copies of the raw feature matrices (used later to build squared features).
X_train_old, X_test_old = X_train.copy(), X_test.copy()

# Mutable holder that add_bias() updates in place: [train matrix, test matrix, feature count].
lis = [X_train, X_test, d]

def add_bias():
    """Prepend a column of ones to both matrices held in ``lis``.

    Works on the module-level ``lis`` list: ``lis[0]`` (``n`` rows) and
    ``lis[1]`` (``n_t`` rows) are each replaced by a copy with a leading
    all-ones column, and the feature count in ``lis[2]`` grows by one.
    Calling it again adds another column each time.
    """
    train_ones = np.ones((n, 1))
    test_ones = np.ones((n_t, 1))
    lis[0] = np.concatenate((train_ones, lis[0]), axis=1)
    lis[1] = np.concatenate((test_ones, lis[1]), axis=1)
    lis[2] = lis[2] + 1

# add_bias()
# X_train, X_test, d = lis

## Evaluation Function

In [535]:
# MSE evaluation
def evaluate(X, y, parameters):
    """Return the mean squared error of the linear prediction ``X @ parameters``.

    X: feature matrix, one row per sample.
    y: observed targets, one per row of X.
    parameters: coefficient vector applied to the columns of X.
    """
    residuals = y - (X @ parameters)
    squared_error_total = (residuals ** 2).sum()
    return squared_error_total / residuals.size

## Linear Regression

In [536]:
# Closed-form least squares (normal equations):
# beta = (X_train.T * X_train)^(-1) * X_train.T * y_train
beta_linear = np.linalg.inv(X_train.T @ X_train) @ (X_train.T @ y_train)

# Test-set MSE followed by the fitted coefficients.
print(evaluate(X_test, y_test, beta_linear))
print(beta_linear)

52.33244456006061
[-2.93019547e-01 -4.42037513e-03  1.25778754e+00  2.22884035e+02
 -4.54391784e+01]


## Polynomial Regression

In [537]:
# Degree-2 polynomial regression: each raw feature is joined by its square,
# then the same closed-form least-squares solution is applied to the
# expanded design matrix.
# beta = (X_poly.T * X_poly)^(-1) * X_poly.T * y_train
X_poly = np.append(X_train, np.power(X_train_old, 2), axis=1)
# Stored under its own name so the linear-regression coefficients in
# `beta_linear` are not overwritten by a vector of a different length.
beta_poly = np.matmul(np.linalg.inv(np.matmul(X_poly.T, X_poly)), np.matmul(X_poly.T, y_train))

# test: the test matrix needs the same squared-feature expansion
X_test_poly = np.append(X_test, np.power(X_test_old, 2), axis=1)
print(evaluate(X_test_poly, y_test, beta_poly))
print(beta_poly)

40.72222661661381
[-1.01675378e+00 -1.22210056e-02  5.58597443e-01 -3.33244588e+05
  6.84201613e+04  1.87478484e-02  1.59841278e-06  8.50228858e-03
  6.67786438e+03 -2.81497311e+02]


## Ridge Regression

In [539]:
# Regularisation strength (lambda).
l = 100

# Closed-form ridge solution:
# beta = (X_train.T * X_train + lambda * I)^(-1) * X_train.T * y_train
beta_ridge = np.linalg.inv(X_train.T @ X_train + l * np.identity(d)) @ (X_train.T @ y_train)

# Test-set MSE followed by the fitted coefficients.
print(evaluate(X_test, y_test, beta_ridge))
print(beta_ridge)

57.08434691091277
[-0.27008115 -0.00518566  1.32823035  0.14839317  0.32246376]


## Lasso Regression

In [567]:
l = 1            # L1 regularisation strength (lambda)
alpha = 1e-9     # gradient-descent learning rate
# Named n_iter rather than `iter` so the builtin iter() stays usable in later cells.
n_iter = 100000
beta_lasso = np.zeros(d)

# These two products do not depend on beta, so compute them once instead of
# on every one of the n_iter iterations.
XtX_train = np.matmul(X_train.T, X_train)
Xty_train = np.matmul(X_train.T, y_train)

# train
# beta = beta - learning_rate *(X_train.T * X_train * beta + l/2 * sign(beta) - X_train.T * y_train)
for _ in range(n_iter):
    gradient = np.matmul(XtX_train, beta_lasso) + l / 2 * np.sign(beta_lasso) - Xty_train
    beta_lasso = beta_lasso - alpha * gradient

# test
print(evaluate(X_test, y_test, beta_lasso))
print(beta_lasso)

58.61647887412847
[-0.23960494 -0.00675607  0.23675963  0.07961039  0.38310518]


## Elastic Net Regression

In [564]:
l = 1            # regularisation strength shared by the L1 and L2 terms
alpha = 1e-9     # gradient-descent learning rate
# Named n_iter rather than `iter` so the builtin iter() stays usable in later cells.
n_iter = 100000
beta_elastic_net = np.zeros(d)

# These two products do not depend on beta, so compute them once instead of
# on every one of the n_iter iterations.
XtX_train = np.matmul(X_train.T, X_train)
Xty_train = np.matmul(X_train.T, y_train)

# train
# beta = beta - learning_rate *(X_train.T * X_train * beta + l/2 * sign(beta) + l * beta - X_train.T * y_train)
for _ in range(n_iter):
    gradient = (
        np.matmul(XtX_train, beta_elastic_net) +
        l / 2 * np.sign(beta_elastic_net) + l * beta_elastic_net - Xty_train)
    beta_elastic_net = beta_elastic_net - alpha * gradient

# test
print(evaluate(X_test, y_test, beta_elastic_net))
print(beta_elastic_net)

58.61651177736179
[-0.23959844 -0.00675608  0.23674879  0.07961023  0.38310464]


## XGBoost Regression 

In [542]:

# Reference model: library XGBoost regressor with default hyper-parameters.
# (`beta_xgboost` holds the fitted model object, not a coefficient vector.)
beta_xgboost = xgb.XGBRegressor()
beta_xgboost.fit(X_train, y_train)

# Test-set mean squared error. Stored as `mse_xgboost` instead of `eval`
# so the builtin eval() is not shadowed for the rest of the notebook.
diff = y_test - beta_xgboost.predict(X_test)
mse_xgboost = np.sum(np.power(diff, 2))/np.size(diff)
print(mse_xgboost)

43.97082047620512


In [543]:
class XGboostReg:
    """Additive ensemble of least-squares linear learners fitted on residuals.

    Each step fits a closed-form linear regression to the current residual
    scaled by ``2 / n`` and adds it to the ensemble; the prediction is the
    sum of all learners' outputs. No XGBoost library code is involved.
    """

    def __init__(self, steps=5):
        """steps: number of learners added by each call to ``fit``."""
        self.steps = steps
        # One coefficient vector per fitted learner, in fitting order.
        self.lin_reg_params = []

    def fit(self, X, y):
        """Add ``self.steps`` learners fitted on (X, y).

        X: 2-D feature matrix, one row per sample.
        y: 1-D targets, one per row of X.
        Raises ValueError when y is empty.
        """
        y = np.asarray(y).ravel()
        n = y.size
        if n == 0:
            raise ValueError("y must contain at least one sample")

        # The first residual is taken against the mean of y.
        # NOTE(review): this mean is not stored and is not part of pred(),
        # so it only influences the first learner. Confirm this is intended.
        current = np.full(n, y.sum() / n)

        # Running ensemble output on X. Updating it incrementally gives the
        # same values as calling pred(X) after every step, without re-summing
        # every earlier learner each time.
        ensemble = self.pred(X)
        for _ in range(self.steps):
            scaled_residual = (y - current) * 2 / n
            self.lin_reg_fit(X, scaled_residual)
            ensemble = ensemble + np.matmul(X, self.lin_reg_params[-1])
            current = ensemble

    def lin_reg_fit(self, X, y):
        """Fit one least-squares learner via the normal equations and store it."""
        gram = np.matmul(X.T, X)
        self.lin_reg_params.append(np.matmul(np.linalg.inv(gram), np.matmul(X.T, y)))

    def lin_reg_pred(self, X):
        """Predict with the most recently fitted single learner.

        NOTE(review): the previous body multiplied X by the whole list of
        coefficient vectors, which raises a shape error (or yields a
        meaningless matrix when the learner count equals the feature count).
        Using the latest learner is the assumed intent, mirroring lin_reg_fit.
        Raises ValueError when no learner has been fitted.
        """
        if not self.lin_reg_params:
            raise ValueError("no linear learner has been fitted yet")
        return np.matmul(X, self.lin_reg_params[-1])

    def pred(self, X):
        """Return the ensemble prediction: the sum of every learner's output.

        With no fitted learners this is an all-zero vector with one entry
        per row of X.
        """
        total = np.zeros(np.shape(X)[0])
        for params in self.lin_reg_params:
            total = total + np.matmul(X, params)
        return total
            
        
        

In [544]:
# Hand-written boosting model, 1000 boosting steps.
# (`beta_xgboost` holds the fitted model object, not a coefficient vector.)
beta_xgboost = XGboostReg(steps=1000)
beta_xgboost.fit(X_train, y_train)

# Test-set mean squared error. Stored as `mse_boosting` instead of `eval`
# so the builtin eval() is not shadowed for the rest of the notebook.
diff = y_test - beta_xgboost.pred(X_test)
mse_boosting = np.sum(np.power(diff, 2))/np.size(diff)
print(mse_boosting)

52.341203821243774


# Association Rules

In [545]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

In [546]:
# The grocery file has no header row; each row holds one transaction.
df = pd.read_csv(filepath_or_buffer='grocery.csv', header=None)

In [547]:
# Preprocessing Data
transactions = []
for i in range(len(df)):
    l = []
    set = df.iloc[i, :][0].split(',')
    for item in set:
        l.append(item)
    transactions.append(l)

In [548]:
# Applying TransactionEncoder
t_encoder = TransactionEncoder()
t_encoded = t_encoder.fit(transactions).transform(transactions)
# To pandas dataframe
df_t_encoded = pd.DataFrame(t_encoded, columns=t_encoder.columns_)

In [549]:
# Frequent Itemsets
frequent_itemsets = fpgrowth(df_t_encoded, min_support=0.2,use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.65,(BREAD)
1,0.35,(BISCUIT)
2,0.25,(MILK)
3,0.3,(CORNFLAKES)
4,0.35,(TEA)
5,0.2,(BOURNVITA)
6,0.25,(MAGGI)
7,0.4,(COFFEE)
8,0.3,(SUGER)
9,0.2,"(BREAD, BISCUIT)"


In [550]:
# Association Rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(BISCUIT),(BREAD),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667,-0.174603
1,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75,0.25
2,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
3,(COFFEE),(CORNFLAKES),0.4,0.3,0.2,0.5,1.666667,0.08,1.4,0.666667
4,(TEA),(BREAD),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667,-0.174603
5,(TEA),(MAGGI),0.35,0.25,0.2,0.571429,2.285714,0.1125,1.75,0.865385
6,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25,0.75
7,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
8,(COFFEE),(SUGER),0.4,0.3,0.2,0.5,1.666667,0.08,1.4,0.666667
9,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05,0.035714
