# Simple GLM Template

## Data Preprocessing

### Importing necessary libraries

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Importing and viewing the dataseet

In [55]:
# open up dataset using panda and view it 
df = pd.read_csv('out.csv')
df.head()

Unnamed: 0,furnishingstatus,area,bedrooms,bathrooms,stories,parking,guestroom,basement,hotwaterheating,airconditioning,mainroad,prefarea,price
0,furnished,7420.0,4.0,2.0,3.0,2.0,no,no,no,yes,yes,yes,
1,furnished,,4.0,4.0,4.0,3.0,no,no,no,yes,yes,no,12250000.0
2,semi-furnished,9960.0,3.0,2.0,2.0,2.0,no,yes,no,no,yes,yes,12250000.0
3,furnished,7500.0,4.0,2.0,2.0,3.0,no,,no,yes,yes,yes,12215000.0
4,furnished,7420.0,4.0,1.0,2.0,2.0,yes,yes,no,yes,yes,no,11410000.0


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   furnishingstatus  533 non-null    object 
 1   area              533 non-null    float64
 2   bedrooms          539 non-null    float64
 3   bathrooms         536 non-null    float64
 4   stories           539 non-null    float64
 5   parking           540 non-null    float64
 6   guestroom         540 non-null    object 
 7   basement          536 non-null    object 
 8   hotwaterheating   537 non-null    object 
 9   airconditioning   541 non-null    object 
 10  mainroad          536 non-null    object 
 11  prefarea          534 non-null    object 
 12  price             530 non-null    float64
dtypes: float64(6), object(7)
memory usage: 55.5+ KB


### Cleaning the dataset

In [57]:
cols=df.columns
df.columns

Index(['furnishingstatus', 'area', 'bedrooms', 'bathrooms', 'stories',
       'parking', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'mainroad', 'prefarea', 'price'],
      dtype='object')

#### Handling duplicate values

In [58]:
boolean = False  # Tracks if any duplicates exist across columns
for col in cols:
    is_duplicate = df.duplicated(subset=[col]).any()  # Check for duplicates in the current column
    print(f"{col} : {is_duplicate}")  # Print the result for the current column
    if is_duplicate:
        boolean = True  # Update the boolean if any duplicates are found

if not boolean:  # If boolean is still False, no duplicates were found
    print("There are no duplicate entries in the dataset.")
else:
    print("Duplicates were found in the dataset.")


furnishingstatus : True
area : True
bedrooms : True
bathrooms : True
stories : True
parking : True
guestroom : True
basement : True
hotwaterheating : True
airconditioning : True
mainroad : True
prefarea : True
price : True
Duplicates were found in the dataset.


#### Handling missing data

In [59]:
# df = data.copy()
missing_values = df.isnull().sum()
total_missing_values = df.isnull().sum().sum()
print(f'Missing values per column:\n{missing_values}')
print('Total missing data:', total_missing_values)
missing_val_percent=(total_missing_values/df.shape[0])*100
print(f'total percentage of missing value is {missing_val_percent:.2f}%')

Missing values per column:
furnishingstatus    12
area                12
bedrooms             6
bathrooms            9
stories              6
parking              5
guestroom            5
basement             9
hotwaterheating      8
airconditioning      4
mainroad             9
prefarea            11
price               15
dtype: int64
Total missing data: 111
total percentage of missing value is 20.37%


#### missing_val_percent < 5% ? dropna() : Impute (depends on size of dataset)

##### Option-1: Remove all missing data columns

In [60]:
if not missing_val_percent < 5:
    df = df.dropna().reset_index(drop=True)

##### Option-2: Use SkLearn Library

In [61]:
# Categorical columns
cat_col = [col for col in df.columns if df[col].dtype == 'object']
# Numerical columns
num_col = [col for col in df.columns if df[col].dtype != 'object']

print('Categorical columns :',cat_col)
print('Numerical columns :',num_col)

Categorical columns : ['furnishingstatus', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'mainroad', 'prefarea']
Numerical columns : ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']


In [62]:
yes_no_cols = [col for col in cat_col if df[col].nunique() == 2]

for col in yes_no_cols:
    if col in df.columns:
        df[col] = LabelEncoder().fit_transform(df[col])

In [26]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# OneHotEncode multi-category columns (e.g., 'furnishingstatus')
multi_cat_cols = [col for col in cat_col if df[col].nunique() > 2]  # Multi-category columns
if multi_cat_cols:
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), multi_cat_cols)], remainder='passthrough'
    )
    df = pd.DataFrame(ct.fit_transform(df), columns=ct.get_feature_names_out())
df.columns

Index(['encoder__furnishingstatus_furnished',
       'encoder__furnishingstatus_semi-furnished',
       'encoder__furnishingstatus_unfurnished', 'remainder__area',
       'remainder__bedrooms', 'remainder__bathrooms', 'remainder__stories',
       'remainder__parking', 'remainder__guestroom', 'remainder__basement',
       'remainder__hotwaterheating', 'remainder__airconditioning',
       'remainder__mainroad', 'remainder__prefarea', 'remainder__price'],
      dtype='object')

In [63]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# OneHotEncode multi-category columns without changing column names
multi_cat_cols = [col for col in cat_col if df[col].nunique() > 2]
if multi_cat_cols:
    onehot = OneHotEncoder(sparse_output=False)
    onehot_encoded = onehot.fit_transform(df[multi_cat_cols])
    
    # Retrieve new column names for encoded categories
    onehot_columns = onehot.get_feature_names_out(multi_cat_cols)
    
    # Combine the rest of the data with encoded categories
    df = pd.concat(
        [pd.DataFrame(onehot_encoded, columns=onehot_columns, index=df.index), 
         df.drop(columns=multi_cat_cols)], axis=1
    )

##### Split data into features X and target y

In [64]:
X= df.iloc[:, :-1].values #dependent variables
y= df.iloc[:, -1].values  #independent variable

##### Spliting into training and test data

In [65]:
# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
print(X_train)

[[0. 0. 1. ... 1. 1. 0.]
 [0. 0. 1. ... 1. 1. 1.]
 [0. 1. 0. ... 1. 1. 0.]
 ...
 [0. 0. 1. ... 0. 1. 0.]
 [0. 0. 1. ... 1. 0. 1.]
 [0. 1. 0. ... 1. 1. 0.]]


In [67]:
# Categorical columns
cat_col = [col for col in df.columns if df[col].dtype == 'object']
# Numerical columns
num_col = [col for col in df.columns if df[col].dtype != 'object']

print('Categorical columns :',cat_col)
print('Numerical columns :',num_col)

Categorical columns : []
Numerical columns : ['furnishingstatus_furnished', 'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'mainroad', 'prefarea', 'price']


In [68]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  # For numerical columns
df[num_col] = imputer.fit_transform(df[num_col])

#### Feature Scaling

In [69]:
df.head()

Unnamed: 0,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,area,bedrooms,bathrooms,stories,parking,guestroom,basement,hotwaterheating,airconditioning,mainroad,prefarea,price
0,0.0,1.0,0.0,9960.0,3.0,2.0,2.0,2.0,0.0,1.0,0.0,0.0,1.0,1.0,12250000.0
1,1.0,0.0,0.0,7420.0,4.0,1.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,11410000.0
2,0.0,1.0,0.0,7500.0,3.0,3.0,1.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,10850000.0
3,0.0,1.0,0.0,8580.0,4.0,3.0,4.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,10150000.0
4,1.0,0.0,0.0,8100.0,4.0,1.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,9870000.0


### Modelling

In [70]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Train the model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predictions
y_pred = regressor.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R² Score:", r2)


Mean Squared Error: 1420924886086.5803
R² Score: 0.6495833965840456


### Visualizing

In [78]:
# Visualization of actual vs. predicted values
# plt.figure(figsize=(8, 6))
# plt.scatter(X_train, y_train, color='blue', alpha=0.6)
# # plt.plot(X_train, regressor.predict(X_train),
#     # color='red', linestyle='--', linewidth=2, label='Perfect Fit'
# # )
# plt.title('Actual vs. Predicted Values')
# plt.xlabel('Actual Target Value')
# plt.ylabel('Predicted Target Value')
# plt.legend()
print(X_train.shape,y_train.shape)

(357, 14) (357,)
