#
# Train Test Split and Model Training
#

In [44]:
import pandas as pd

from sklearn.metrics import mean_absolute_error, mean_squared_error


###
## Importing Dataset
###

In [45]:
# Load the 'california500' CSV (path is relative to the working directory)
# into `a`, the frame every later cell reads from.
a = pd.read_csv(filepath_or_buffer='california500')
a

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
495,5.3794,52.0,6.861017,1.123729,1316.0,2.230508,37.86,-122.25,3.769
496,3.2619,52.0,4.792053,1.017219,1508.0,1.997351,37.85,-122.26,3.096
497,2.7054,52.0,5.001227,1.128834,1800.0,2.208589,37.85,-122.27,1.823
498,2.2431,47.0,6.371166,1.226994,719.0,2.205521,37.85,-122.27,1.727


###
## Correlation
###

In [46]:
# Pairwise correlation between every pair of columns; 'pearson' is the
# pandas default, written out here so the choice is visible.
a.corr(method='pearson')

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
MedInc,1.0,0.000833,0.747765,-0.232999,-0.096629,-0.161258,0.27534,0.172878,0.740523
HouseAge,0.000833,1.0,0.085889,-0.04239,-0.13092,0.033556,0.242748,-0.234744,0.046072
AveRooms,0.747765,0.085889,1.0,0.147446,-0.201921,-0.096355,0.108352,0.225048,0.472232
AveBedrms,-0.232999,-0.04239,0.147446,1.0,0.011412,-0.029488,-0.090304,-0.091184,-0.257147
Population,-0.096629,-0.13092,-0.201921,0.011412,1.0,0.048615,0.021011,0.004222,0.057044
AveOccup,-0.161258,0.033556,-0.096355,-0.029488,0.048615,1.0,-0.324972,0.216899,-0.238548
Latitude,0.27534,0.242748,0.108352,-0.090304,0.021011,-0.324972,1.0,-0.79737,0.567696
Longitude,0.172878,-0.234744,0.225048,-0.091184,0.004222,0.216899,-0.79737,1.0,-0.141148
Price,0.740523,0.046072,0.472232,-0.257147,0.057044,-0.238548,0.567696,-0.141148,1.0


###
## Separating Variables
###

In [57]:
# Feature matrix: the eight predictor columns, in the order listed.
# 'Price' is deliberately left out because it is the target.
x = a.loc[
    :,
    [
        'MedInc',
        'HouseAge',
        'AveRooms',
        'AveBedrms',
        'Population',
        'AveOccup',
        'Latitude',
        'Longitude',
    ],
]
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
495,5.3794,52.0,6.861017,1.123729,1316.0,2.230508,37.86,-122.25
496,3.2619,52.0,4.792053,1.017219,1508.0,1.997351,37.85,-122.26
497,2.7054,52.0,5.001227,1.128834,1800.0,2.208589,37.85,-122.27
498,2.2431,47.0,6.371166,1.226994,719.0,2.205521,37.85,-122.27


In [65]:
# Target vector: the 'Price' column as a Series that keeps the frame's row labels.
y = a.loc[:, 'Price']
y

0      4.526
1      3.585
2      3.521
3      3.413
4      3.422
       ...  
495    3.769
496    3.096
497    1.823
498    1.727
499    1.508
Name: Price, Length: 500, dtype: float64

###
## Importing train_test_split
###

In [66]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the
# shuffle (and therefore the split) identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

###### train_test_split: This is a function from scikit-learn's model_selection module used to split arrays or matrices into random train and test subsets. It's commonly used in machine learning for evaluating models by splitting the data into training and testing sets.

###### x: This variable represents the features or independent variables. In this context, it's typically a DataFrame or numpy array containing the input features of your dataset.

###### y: This variable represents the target variable or dependent variable. It's usually a pandas Series or numpy array containing the labels or target values associated with the input features in x.

###### test_size = 0.2: This parameter specifies the proportion of the dataset to include in the testing set. In this case, it's set to 0.2, which means 20% of the data will be used for testing, and the remaining 80% will be used for training.

###### random_state = 0: This parameter controls the randomness of the data splitting process. Setting it to a specific value (e.g., 0) ensures reproducibility. If you use the same random_state value across different runs of the code, you'll get the same split each time, which is useful for reproducibility and debugging.

###### x_train, x_test, y_train, y_test: These variables store the resulting training and testing sets after splitting. x_train and y_train contain the training features and target values, respectively, while x_test and y_test contain the testing features and target values, respectively.

###
## Shapes
###

In [67]:
# Shapes of the full data followed by each piece of the split, in the order
# x, y, x_train, x_test, y_train, y_test.
tuple(part.shape for part in (x, y, x_train, x_test, y_train, y_test))

((500, 8), (500,), (400, 8), (100, 8), (400,), (100,))

###
## Importing LinearRegression
###

In [68]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training portion only. fit() returns the
# estimator itself, so the fitted object can be bound to `model` directly.
model = LinearRegression().fit(x_train, y_train)
model

###
## Predicting Values for the Unseen Part of the Dataset, i.e. x_test (Independent Variables of the Test Dataset)
###

In [69]:
# Predictions for the held-out feature rows; the model was not fitted on x_test.
y_predict = model.predict(X=x_test)
y_predict


array([1.31554746, 1.36198892, 3.3994476 , 2.27189296, 2.08456962,
       1.8826626 , 0.73296091, 2.4390206 , 1.63335271, 2.17987091,
       1.08750684, 1.32933017, 2.44943574, 1.94687046, 1.22232643,
       0.79509683, 2.2725105 , 1.97395068, 3.29328326, 1.39644392,
       1.24462193, 2.05021784, 1.62641474, 2.22343635, 1.7651393 ,
       1.99597452, 1.78481229, 2.02893906, 1.93033722, 2.2639445 ,
       2.79776793, 2.97566026, 3.48690335, 1.31954017, 1.45543977,
       2.06485011, 1.00100819, 2.12695797, 1.2401365 , 1.52750584,
       0.30880715, 1.60806376, 4.2548607 , 1.90115675, 1.36343686,
       4.40568731, 1.56342487, 0.9345887 , 3.58693044, 1.72269181,
       1.71023072, 1.4046597 , 0.32038823, 1.81091811, 3.71902693,
       2.21003067, 0.92902463, 1.35161451, 2.06616552, 1.08429559,
       0.79974395, 1.95477802, 2.58381574, 2.92720122, 1.60091751,
       0.99078202, 2.17758115, 2.18534102, 2.23044411, 1.04582611,
       3.0608505 , 0.73657688, 0.67104215, 1.94332734, 1.36516

###
## Comparison of Original y_test Values vs Predicted y_test Values
###

In [84]:
# Side-by-side table of the held-out target values and the model's predictions.
# `y_predict` (computed in the prediction cell) is reused instead of calling
# model.predict(x_test) a second time, so this table always shows exactly the
# predictions displayed earlier and no inference work is repeated.
# y_test keeps its original row labels, so the table is indexed by them; the
# prediction array is aligned to those rows by position.
print('DataFrame of orignal and predicted House Prices(y_test) of Test DataSet')
pd.DataFrame({'Orignal y_test Values' : y_test, 'Predicted y_test Values' : y_predict})

DataFrame of orignal and predicted House Prices(y_test) of Test DataSet


Unnamed: 0,Orignal y_test Values,Predicted y_test Values
90,1.625,1.315547
254,1.024,1.361989
283,3.710,3.399448
445,1.856,2.271893
461,2.417,2.084570
...,...,...
372,1.079,1.256723
56,0.875,1.120878
440,1.448,2.329012
60,0.757,1.699519
