
# 1. Understand the Data



In [None]:
import pandas as pd

df = pd.read_csv('data/data.csv')  # Read the CSV

print(f'shape : {df.shape}  \n')

# Strip stray whitespace from the header names so later column lookups
# (e.g. 'town', 'resale_price') match exactly.
df.columns = [name.strip() for name in df.columns]

print(f'columns : {df.columns.to_list()}  \n')

# DataFrame.info() writes its report straight to stdout and returns None, so
# embedding it in an f-string prints the report first and then "None".
# Print the heading, then let info() emit the report itself.
print('info : ')
df.info()

# Optional: uncomment for summary statistics of the numeric columns.
# print(f'Reading : \n{df.describe()}')

In [None]:
# Clean the frame: report and remove exact duplicate rows first, then report
# the per-column missing-value counts and drop every row that has any.

duplicate_count = df.duplicated().sum()
print('duplicated data points : ', duplicate_count)
df = df.drop_duplicates()

missing_per_column = df.isna().sum()
print('null data points : \n', missing_per_column)
df = df.dropna()

# Shape after both clean-up steps.
print('shape : ', df.shape)

# 2. Data Range Selection

In [None]:
##function for data selection such as tail, head, random and number of data points
def choose_data_order(d_frame):
  """Interactively pick a per-town subset of ``d_frame``.

  Prompts on stdin for the number of records to keep per town (10 to 40
  inclusive) and then for how to pick them: the first rows, the last rows,
  or a random draw within each ``'town'`` group.

  Args:
    d_frame: pandas DataFrame that contains a ``'town'`` column.

  Returns:
    A DataFrame holding at most the requested number of rows per town, or
    ``None`` (after printing a message) when either answer is invalid.
  """
  raw_number = input("Enter number of records for each town \n** Note: Not less than 10, Not More Than 40\n")

  # A non-numeric answer is reported like an out-of-range one instead of
  # aborting with an uncaught ValueError.
  try:
    sample_number = int(raw_number)
  except ValueError:
    print("Invalid Sample Number")
    return None

  if not 10 <= sample_number <= 40:
    print("Invalid Sample Number")
    return None

  choice = input("Type \n 1 for Top data \n 2 for Bottom data \n 3 for Random data \n").strip()

  if choice == '1':
    return d_frame.groupby('town').head(sample_number)
  if choice == '2':
    return d_frame.groupby('town').tail(sample_number)
  if choice == '3':
    # Shuffle every row, then keep the first `sample_number` rows per town.
    # This is a random draw without replacement that, like head/tail above,
    # simply returns fewer rows for a town smaller than the request
    # (groupby(...).sample(n) raises ValueError in that case).
    return d_frame.sample(frac=1).groupby('town').head(sample_number)

  print("Invalid Choice")
  return None


trimmed_df = choose_data_order(df)


# When a selection was made, show how many rows each town contributed and
# the overall shape of the selection.
if trimmed_df is not None:
  rows_per_town = trimmed_df['town'].value_counts()
  print('\n\n', rows_per_town)
  print(trimmed_df.shape)


# 3. Data Splitting

In [None]:
# Choose the feature columns (X) and the target column (y).
#All Features: 'closest_mrt_dist', 'cbd_dist', 'floor_area_sqm', 'years_remaining','flat_model','town', 'storey_range'
numerical_features = [
    'closest_mrt_dist',
    'cbd_dist',
    'floor_area_sqm',
    'years_remaining',
]
categorical_features = ['flat_model', 'town', 'storey_range']

# Numerical columns first, followed by the categorical ones.
X = trimmed_df[[*numerical_features, *categorical_features]]
y = trimmed_df['resale_price'].values

# Import the library used to split the data, then hold out 30% of the rows
# for testing; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)


# Report the shapes of the training and testing feature sets.
print(X_train.shape, X_test.shape)

# 4. Use scikit-learn library to train the model


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


##Choosing Encoding Method
chosen_encoding_method = input('Which Encoding Method do you like to use\nEnter 1 for One-Hot Encoding\nEnter 2 for Label Encoding\n').strip()

# Only the categorical encoder differs between the two options; the numerical
# columns are standardised either way.
if chosen_encoding_method == '1':
  # One-hot: a category not seen during fit is encoded as all zeros.
  categorical_encoder = OneHotEncoder(handle_unknown='ignore')
elif chosen_encoding_method == '2':
  # Ordinal ("label") codes: a category not seen during fit is encoded as -1.
  categorical_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
else:
  # Stop here. Carrying on would hit a NameError on `preprocessor`, or
  # silently reuse one left over from an earlier run of this cell.
  raise ValueError("Invalid Input. Try again")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', categorical_encoder, categorical_features),
    ])


##Choosing Poly or Linear Regression
chosen_model = input('Which model do you like to use?\nEnter 1 for Polynomial Regression\nEnter 2 for Linear Regression\n').strip()

if chosen_model == '1':
  # The degree given here is only a placeholder: the grid search below
  # overrides it with each candidate in `parameters`.
  steps = [('preprocessor', preprocessor),
          ('poly', PolynomialFeatures(degree = 5, include_bias=False)),
          ('liReg', LinearRegression())]

  pipeline = Pipeline(steps)

  # 5-fold cross-validated search over the polynomial degree, scored by
  # negative mean squared error.
  parameters = {"poly__degree":[2,3]}
  model = GridSearchCV(pipeline, parameters,
                          cv=5,
                          scoring='neg_mean_squared_error',
                          verbose= True)

  model.fit(X_train, y_train)


  #Output best parameter for polynomial regression
  best_parameters = model.best_params_
  print ('Best order for your polynomial regression is :', best_parameters)


elif chosen_model == '2':
  # Plain linear regression on the preprocessed features.
  steps = [('preprocessor', preprocessor),
          ('liReg', LinearRegression())]

  model = Pipeline(steps)
  model.fit(X_train, y_train)

else:
  # Stop here so later cells cannot run against a missing or stale `model`.
  raise ValueError('Invalid Input. Try again')


# 5. Perform the predictions


In [6]:
  # Predicted resale prices for the rows the model was fitted on ...
  ytrain_pred = model.predict(X_train)
  # ... and for the held-out testing rows.
  ytest_pred = model.predict(X_test)

# 6. Visualize the results.


In [None]:
# import matplotlib for visualization
import matplotlib.pyplot as plt

# Actual values on the x-axis against predictions on the y-axis:
# black stars for the training rows, red stars for the testing rows.
plt.plot(y_train, ytrain_pred, 'k*', label="Training data")
plt.plot(y_test, ytest_pred, 'r*', label="Testing data")

plt.title("Predicted value vs. Actual value")
# The legend entries come from the labels attached to each series above.
plt.legend()
plt.show()

# 7. Evaluate the results using metrics


In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Evaluate on the training set: compute the three scores, then report them.
print('Training')
mae = mean_absolute_error(y_train, ytrain_pred)
mse = mean_squared_error(y_train, ytrain_pred)
r2 = r2_score(y_train, ytrain_pred)
print('Mean absolute error: ', mae)
print('Mean squared error: ', mse)
print('R2 score is: ', r2)

# Evaluate on the testing set the same way; mae/mse/r2 end up holding the
# testing scores.
print('\nTesting')
mae = mean_absolute_error(y_test, ytest_pred)
mse = mean_squared_error(y_test, ytest_pred)
r2 = r2_score(y_test, ytest_pred)
print('Mean absolute error: ', mae)
print('Mean squared error: ', mse)
print('R2 score is: ', r2)

# 8. Compare the results from One-Hot and Label Encoding Methods

In [None]:
import matplotlib.pyplot as plt
import numpy as np



categories = ['Testing', 'Training']


## Enter the results of R2 scores here, in the same order as `categories`
## (testing score first, then training score).

one_hot_encoding = [0.0, 0.0]

label_encoding = [0.0, 0.0]


x = np.arange(len(categories))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()

# Plotting One-Hot bars (left bar of each pair)
rects1 = ax.bar(x - width/2, one_hot_encoding, width, label='One-Hot', color='darkblue')

# Plotting Label-encoding bars (right bar of each pair)
rects2 = ax.bar(x + width/2, label_encoding, width, label='Label', color='orange')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('R2 Score')
# The Y axis is zoomed to 0.7-0.9 so small differences between the two
# encodings are visible; widen the limits if your scores fall outside it.
ax.set_ylim([0.7, 0.9])
ax.set_title('One-hot and Label Encoding Comparison')
ax.set_xticks(x)
ax.set_xticklabels(categories)
ax.legend()

# Display the plot
plt.show()