## Necessary Python libraries are imported into the notebook

In [2]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
from sklearn import preprocessing   
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

### The dataset is imported and rows containing null values are removed so that the data can proceed to data visualization and machine learning

In [3]:
# Read the raw sales file and immediately discard every row that has at least
# one missing value, then list the available columns.
# NOTE(review): the path is absolute and machine-specific, and the file has an
# .xls extension although it is parsed as CSV — confirm both before sharing.
chennai_data = pd.read_csv(
    "C:/Users/joelb/OneDrive/Documents/Datasets/train-chennai-sale.xls"
).dropna(how='any')
chennai_data.columns

FileNotFoundError: [Errno 2] No such file or directory: 'train-chennai-sale.csv'

In [None]:
# Summary statistics of the loaded data (rows with missing values already dropped).
chennai_data.describe()

### The data is copied to another variable using a deep copy (the default of pandas `copy()`), so that changes to the copy do not affect the original data

In [None]:
# Work on an independent (deep) copy so chennai_data itself stays untouched.
updated_cd = chennai_data.copy(deep=True)

In [None]:
# Preview the first rows of the working copy.
updated_cd.head()

### The time difference (in years) between the date of sale and the date of build is computed, as it will be helpful in our analytics

In [None]:
# Age of the property at sale time, in years (days / 365).
# The date strings are day-first (the DS_*/DB_* cells take the day from
# characters 0:2 and the month from 3:5), so dayfirst=True is required;
# without it ambiguous dates such as 04-05-2011 are read as month-first.
sale_dates = pd.to_datetime(updated_cd['DATE_SALE'], dayfirst=True)
build_dates = pd.to_datetime(updated_cd['DATE_BUILD'], dayfirst=True)
updated_cd['Time_Difference'] = (sale_dates - build_dates).dt.days / 365

In [None]:
# Display the working frame to inspect the new Time_Difference column.
updated_cd

In [None]:
# Total cost of a sale: registration fee + commission + sale price.
updated_cd['Total_Price'] = (
    updated_cd['REG_FEE']
    .add(updated_cd['COMMIS'])
    .add(updated_cd['SALES_PRICE'])
)

In [None]:
# Remove the identifier and helper columns that are not used in the analysis.
# Re-running this cell on its own raises KeyError because the columns are gone.
updated_cd = updated_cd.drop(
    columns=[
        'PRT_ID',
        'INF_PRICE',
        '1+0.04',
        'POWER(Z,Y)',
    ]
)

In [None]:
# Split the sale date string into parts by fixed character positions:
# last four characters = year, characters 3-4 = month, characters 0-1 = day.
# The parts stay strings; they are not converted to numbers here.
sale_date_text = updated_cd['DATE_SALE'].str
updated_cd['DS_Year'] = sale_date_text.slice(-4)
updated_cd['DS_Month'] = sale_date_text.slice(3, 5)
updated_cd['DS_Day'] = sale_date_text.slice(0, 2)

In [None]:
# Split the build date string into parts by fixed character positions:
# last four characters = year, characters 3-4 = month, characters 0-1 = day.
# The parts stay strings; they are not converted to numbers here.
build_date_text = updated_cd['DATE_BUILD'].str
updated_cd['DB_Year'] = build_date_text.slice(-4)
updated_cd['DB_Month'] = build_date_text.slice(3, 5)
updated_cd['DB_Day'] = build_date_text.slice(0, 2)

### Here I used a Python library called ydata_profiling, which generates an analytics report covering the overall data and each individual column. The report can be exported to an HTML file for further reference.

In [None]:
# Build the ydata_profiling report for the working frame and render it inline.
profile = ProfileReport(updated_cd, title="Profiling Report")
profile
# Optional: uncomment to export the report to a standalone HTML file.
#profile.to_file("chennai_sales_profile_report.html")

In [None]:
# Display the working frame after the derived date-part columns were added.
updated_cd

In [None]:
# Sum Total_Price per AREA and order the areas from highest to lowest total.
total_price_by_area = (
    updated_cd
    .groupby('AREA')['Total_Price']
    .sum()
    .reset_index()
    .sort_values(by='Total_Price', ascending=False)
)
print(total_price_by_area)

In [None]:
# Bar chart of the per-area totals, using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    total_price_by_area['AREA'],
    total_price_by_area['Total_Price'],
    color='skyblue',
)
ax.set_title('Total Price by Area')
ax.set_xlabel('Area')
ax.set_ylabel('Total Price')
# Tilt the area names so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# The five rows with the largest gap between build date and sale date.
time_difference = (
    updated_cd
    .sort_values(by='Time_Difference', ascending=False)
    .iloc[:5]
)
time_difference

# Machine Learning(Linear Regression)

In [None]:
# Separate (deep) copy for modelling so the exploration frame stays unchanged.
ml_data = updated_cd.copy(deep=True)

In [None]:
# List the columns available before feature selection.
ml_data.columns

### The columns below are removed from the data because they are not suitable for machine learning; the features are selected manually based on an understanding of the data

In [None]:
# Drop the raw dates, the derived date parts and the three price components
# (REG_FEE, COMMIS, SALES_PRICE) whose sum is the Total_Price target.
ml_data = ml_data.drop(
    [
        'DATE_SALE', 'DATE_BUILD',
        'REG_FEE', 'COMMIS', 'SALES_PRICE',
        'DS_Month', 'DS_Year', 'DS_Day',
        'DB_Year', 'DB_Month', 'DB_Day',
    ],
    axis=1,
)

In [None]:
# Display the modelling frame after the unused columns were dropped.
ml_data

In [None]:
# Encoder that maps each distinct category label to an integer code.
le = preprocessing.LabelEncoder() 

In [None]:
# Replace each categorical column with integer codes. The same encoder object
# is re-fit for every column, so afterwards `le` only remembers the classes of
# the last column in this tuple.
for column in (
    'AREA',
    'SALE_COND',
    'PARK_FACIL',
    'BUILDTYPE',
    'UTILITY_AVAIL',
    'STREET',
    'MZZONE',
):
    ml_data[column] = le.fit_transform(ml_data[column])

In [None]:
# Display the modelling frame after label encoding.
ml_data

In [None]:
# Pairwise correlation between all columns of the encoded modelling frame.
ml_data.corr()

In [None]:
# Summary statistics, used below to compare the value ranges of the columns.
ml_data.describe()

### In the describe output above, only INT_SQFT and Total_Price show a large difference between their min and max. Before proceeding with standard scaling, we look at the data distribution using box plots and a histogram. From this, scaling needs to be applied to Total_Price because its range is large (the code below also scales INT_SQFT), and for a linear regression model it is best to scale before fitting the model.

In [None]:
# Box plot of the INT_SQFT column to inspect its spread and outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=ml_data['INT_SQFT'], ax=ax)
ax.set_title('Box Plot of Square Feet')
plt.show()

In [None]:
# Box plot of the Total_Price column to inspect its spread and outliers.
# The title now names the plotted column; it previously read
# 'Box Plot of Features', which did not describe this single-column plot.
plt.figure(figsize=(10, 6))
sns.boxplot(data=ml_data['Total_Price'])
plt.title('Box Plot of Total_Price')
plt.show()

In [None]:
# Distribution of Total_Price in 20 bins, drawn through the Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(ml_data['Total_Price'], bins=20, edgecolor='black')
ax.set_title('Histogram of Total_Price')
ax.set_xlabel('Total_Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Count missing values per column before scaling.
ml_data.isna().sum()

In [None]:
# Standardise the two wide-ranged columns, one scaler per column.
# `soft_scaler` stays fitted on Total_Price only, because it is used later to
# inverse-transform predicted prices. Re-fitting the same object on INT_SQFT
# (as before) overwrote its mean/scale with square-foot statistics, so the
# un-scaled predictions were not prices.
soft_scaler = StandardScaler()
ml_data['Total_Price'] = soft_scaler.fit_transform(ml_data[['Total_Price']])

sqft_scaler = StandardScaler()
ml_data['INT_SQFT'] = sqft_scaler.fit_transform(ml_data[['INT_SQFT']])

### Here the final preprocessed ml_data is split into two separate variables: about 95% of the data is used to build the ML model and the remaining data is used for ML prediction

In [None]:
# The first 6700 rows are used to build and evaluate the model; every remaining
# row is held back for the final prediction demo.
# The two slices are contiguous: the previous bounds (1:6700 and 6701:)
# silently discarded row 0 and row 6700.
n_model_rows = 6700
ml_data_updated = ml_data.iloc[:n_model_rows, :]
test_data = ml_data.iloc[n_model_rows:, :]

In [None]:
# Display the portion of the data reserved for model building.
ml_data_updated

In [None]:
# Select the target and the features by column name instead of by position.
# Positional slicing (0:16 / 16) breaks whenever the column count or order
# changes. Selecting by name guarantees that y is Total_Price and that x has
# exactly the columns left in test_data once 'Total_Price' is dropped from it,
# which is what model.predict(test_data) requires.
y = ml_data_updated['Total_Price']
x = ml_data_updated.drop(columns=['Total_Price'])

In [None]:
# Display the feature matrix.
x

In [None]:
# Hold out 25% of the rows for evaluation; random_state=42 fixes the shuffle
# so the split is reproducible.
x_train, x_test, y_train , y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [None]:
# Shapes of the four pieces, in the order x_train, x_test, y_train, y_test.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# Plain-text dump of the evaluation features.
print(x_test)

In [None]:
# Ordinary least-squares linear regression fitted on the training split.
model = LinearRegression()
model.fit(x_train, y_train)

In [None]:
# Predictions for the evaluation split, in the same units as y_train.
y_pred = model.predict(x_test)

In [None]:
# Display the predicted values.
y_pred

In [None]:
# Number of predictions; should match the number of rows in x_test.
y_pred.shape

### For regression, the classification accuracy score from scikit-learn does not work, so we proceed with the R2 score, which plays a role similar to an accuracy score. The R2 score can also be calculated manually from the mean squared error: R² = 1 − MSE / Var(y). Because Total_Price was standardized (its variance is approximately 1), subtracting the mean squared error from 1 gives approximately the same value. In the output below we can see that the R2 score and this manually calculated "accuracy" agree (the R2 score is printed as a percentage), since both measure how well the model fits.

### Here we got an accuracy value of roughly 79%, which is a good score for predicting the total price of a house

In [None]:
# R2 of the predictions on the evaluation split; printed multiplied by 100,
# i.e. as a percentage.
r2 = r2_score(y_test, y_pred)
print(f'The R2 Score for the Linear Regression for the Given Chennai House Price Dataset is {r2*100}')

In [None]:
# Manual R2: 1 - MSE / Var(y_test), using the population variance (ddof=0).
# Plain `1 - MSE` only matches R2 when the variance of y_test is exactly 1;
# dividing by the variance makes the identity hold for any target scale.
# The value is printed as a fraction (not multiplied by 100).
acc = 1 - mean_squared_error(y_test, y_pred) / np.var(np.asarray(y_test))
print(f'The Accuracy Score for the Linear Regression for the Given Chennai House Price Dataset is {acc}')

### The test data is used for model prediction to see how well the model predicts the output from the given input features

In [None]:
# Display the held-back rows (still including the Total_Price column).
test_data

In [None]:
# Remove the target so only input features are passed to the model.
# Re-running this cell on its own raises KeyError because the column is gone.
test_data = test_data.drop(columns=['Total_Price'])

In [None]:
# Predict on the held-back rows, then map the predictions back to original
# units. inverse_transform expects a 2-D input, hence the reshape to one column.
# NOTE(review): inverse_transform uses the statistics of whichever column
# soft_scaler was last fitted on; that must be Total_Price for these values to
# be prices — confirm against the scaling cell.
scaled_predictions = model.predict(test_data)
unscaled_predictions = soft_scaler.inverse_transform(
    scaled_predictions.reshape(-1, 1)
)
unscaled_predictions