In [None]:
import os

import pandas as pd
from pandasai import SmartDataframe
from pandasai.llm import OpenAI

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Export the notebook file to PDF with nbconvert's webpdf exporter (shell escape).
# NOTE(review): this cell sits before any analysis cell, so a top-to-bottom run
# exports before results exist — consider moving it to the end; confirm intent.
!jupyter nbconvert --to webpdf Project_final_decision_tree.ipynb

In [None]:
# Load the raw real-estate CSV (the file name really does contain a space)
# and preview the first rows as the cell output.
house_price = pd.read_csv(filepath_or_buffer='Real_estate _1.csv')
house_price.head(n=5)

In [None]:
# Normalise the headers: trim surrounding whitespace, lower-case, and turn
# spaces into underscores so columns can be addressed with snake_case names.
house_price.columns = (
    house_price.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

In [None]:
# Give the 'no' column a descriptive name. rename() returns a new frame that is
# rebound to house_price, instead of mutating the existing object with inplace=True.
house_price = house_price.rename(columns={'no': 'transaction_id'})

In [None]:
def cap_outliers(series, upper_percentile=0.97):
    """Clip the values of ``series`` at one of its own upper quantiles.

    Parameters
    ----------
    series : pandas.Series
        Values to cap (anything exposing ``quantile`` and ``clip`` works).
    upper_percentile : float, default 0.97
        Quantile, passed to ``series.quantile``, that becomes the ceiling.

    Returns
    -------
    The result of ``series.clip`` with that quantile as the upper limit;
    values at or below the ceiling are left as they are.
    """
    return series.clip(upper=series.quantile(upper_percentile))

In [None]:
# Cap the MRT-distance column at its default (97th percentile) ceiling and
# report the resulting maximum. The column is overwritten, so re-running this
# cell recomputes the percentile on already-capped values.
house_price['x3_distance_to_the_nearest_mrt_station'] = (
    house_price['x3_distance_to_the_nearest_mrt_station'].pipe(cap_outliers)
)
print(
    "x3_distance_to_the_nearest_mrt_station: capped at 97th percentile = "
    f"{house_price['x3_distance_to_the_nearest_mrt_station'].max()}"
)

In [None]:
# Remove the rows whose unit-area price exactly equals one of these
# hand-picked values; every other row is kept.
outlier_values = [78.3, 117.5, 78.0]
house_price = house_price.loc[
    ~house_price['y_house_price_of_unit_area'].isin(outlier_values)
]

In [None]:
# Drop the transaction_id and x1_transaction_date columns, then preview
# the remaining frame as the cell output.
house_price = house_price.drop(
    labels=['transaction_id', 'x1_transaction_date'],
    axis='columns',
)
house_price.head()

In [None]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable instead of
# hardcoding it, so no secret is ever saved with the notebook. A missing
# variable raises KeyError here, before any request is made.
llm = OpenAI(api_token=os.environ["OPENAI_API_KEY"])
# Wrap the prepared frame so the following cells can query it via house_price.chat(...).
house_price = SmartDataframe(house_price, config={"llm": llm})

In [None]:
# Ask PandasAI to run stepwise selection to find the strongest predictors of the unit-area price.
house_price.chat('use stepwise selection to find the best predictors for y_house_price_of_unit_area')

In [None]:
# Tell PandasAI which columns to treat as the predictors (prompt cleaned of a stray tab and spacing).
house_price.chat('define x2_house_age, x3_distance_to_the_nearest_mrt_station, x4_number_of_convenience_stores, x5_latitude, x6_longitude as the best predictors')

In [None]:
# Tell PandasAI, in natural language, which column is the model's outcome.
# NOTE(review): whether this definition carries over to later chat() calls is not visible here — confirm.
house_price.chat('define y_house_price_of_unit_area column as an outcome for the predictive model')

In [None]:
# Request a correlation visualisation between the outcome and the predictors;
# whatever chat() returns is displayed as this cell's output.
house_price.chat('visualize correlation between outcome and best predictors')

# Splitting data

In [None]:
# Ask PandasAI to separate the frame into predictors (X) and outcome (y).
# NOTE(review): the result is not assigned to any notebook variable here, so X and y
# presumably exist only inside PandasAI's generated code — confirm.
house_price.chat('divide the original data frame house_price into two data frames with independent variable X and dependent variable y')

In [None]:
# Ask for a 60% training / 40% validation split of the outcome and predictors.
# NOTE(review): the prompt does not fix a random seed, so the split may differ between runs — confirm.
house_price.chat('split y_house_price_of_unit_area and best predictors into training dataset= 60% and validation dataset = 40%')

# Best Hyperparameters

In [None]:
# We will tune hyperparameters: settings that can be adjusted to improve the accuracy of our machine learning model.

In [None]:
# Check for the best hyperparameters: ask PandasAI for the combination selected by a grid search.
# NOTE(review): no GridSearchCV object is fitted in the visible notebook cells, so the search
# presumably runs inside the code PandasAI generates — confirm.
house_price.chat('look at the best hyperparameter combination of max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features that GridSearchCV has chosen for us')

In [None]:
# max_depth: The maximum depth of the tree. It accepts any int value or None. If None, nodes are expanded until all leaves are pure or contain fewer than min_samples_split samples.
# min_samples_split: The minimum number of samples needed to split an internal node. It accepts an int or a float value; the default is 2.
# min_samples_leaf: The minimum number of samples required at a leaf node. It accepts an int or a float value; the default is 1.
# max_features: The number of features to consider when looking for the best split. It accepts an int, a float, 'sqrt', 'log2', or None (older scikit-learn releases also accepted 'auto'). For DecisionTreeRegressor the default is None, which means all features are considered.

In [None]:
# Ask PandasAI to build a DecisionTreeRegressor with the hyperparameter values spelled out in the prompt.
house_price.chat('create an object of DecisionTreeRegressor with max_depth =30, min_samples_split=2, max_features = log2, max_leaf_nodes= None, min_samples_leaf = 1,   random_state=1')

# Training accuracy

In [None]:
# Ask PandasAI for the R2 score on the training data (prompt typo "alculate" fixed).
house_price.chat('calculate the training accuracy using the R2 score')

In [None]:
# Training accuracy is very high: 99.62%.
# Both spreads overlap almost completely, which indicates that training accuracy is quite high and that some overfitting is possible.

In [None]:
 house_price.chat('use a scatter plot to see the training accuracy')

In [None]:
# Request overall regression statistics from PandasAI.
house_price.chat('calculate regression statistic')
# This time PandasAI calculated the accuracy again and gave a different result: 98.78%.
# NOTE(review): the original note called this "testing accuracy", but this section covers
# training accuracy — confirm which one the 98.78% refers to.

In [None]:
# Ask PandasAI for the error metrics on the training data (prompt typo "thaining" fixed).
house_price.chat('calculate ME, MAE, MPE, MAPE for training dataset')

# Testing accuracy

In [None]:
# Ask PandasAI to report the model's accuracy on the testing data.
# NOTE(review): the split prompt calls the held-out part the "validation dataset" — presumably the same data; confirm.
house_price.chat('show the testing accuracy')

In [None]:
# Testing accuracy is 67.71%.
# The model is overfitting: training accuracy is between 98% and 99%, while testing accuracy is only 67.71%.

In [None]:
# Request error metrics for the testing data.
# NOTE(review): the matching training-data prompt also asks for MAE, which is not
# requested here — confirm whether the omission is intentional.
house_price.chat('calculate ME, MPE, MAPE for testing dataset ')

In [None]:
 house_price.chat('visualize the testing accuracy of decision tree regressor')

# Visualizing Regression Decision Tree with Graphviz

In [None]:
# Importing sklearn's tree module and the Graphviz package
from sklearn import tree
import graphviz

In [None]:
# Ask PandasAI to draw the decision tree using sklearn's tree module and Graphviz;
# whatever chat() returns is displayed as this cell's output.
house_price.chat('visualize the decision tree itself by using the tree module of sklearn and Graphviz package')