In [1]:
from sklearn import linear_model
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings

warnings.filterwarnings(action='ignore')

In [2]:
df = pd.read_csv("data/housing.csv")
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [3]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [4]:
df = df.fillna(0)
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [5]:
# Categorical Data change to Numeric Data
encoder = OrdinalEncoder()
X = pd.DataFrame(df['ocean_proximity'])
encoder.fit(X)
# Deep copy
house_ordinal = df.copy()
house_ordinal['ocean_proximity'] = pd.DataFrame(encoder.transform(X))

house_ordinal

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,3.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,3.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,3.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,3.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,3.0
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,1.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,1.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,1.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,1.0


In [6]:
# Scaling using standardScalr
scaler = StandardScaler()
house_scaled = scaler.fit_transform(house_ordinal)
house_scaled = pd.DataFrame(house_scaled, columns=house_ordinal.columns)

house_scaled

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-1.327835,1.052548,0.982143,-0.804819,-0.954593,-0.974429,-0.977033,2.344766,2.129631,1.291089
1,-1.322844,1.043185,-0.607019,2.045890,1.356913,0.861439,1.669961,2.332238,1.314156,1.291089
2,-1.332827,1.038503,1.856182,-0.535746,-0.810272,-0.820777,-0.843637,1.782699,1.258693,1.291089
3,-1.337818,1.038503,1.856182,-0.624215,-0.703806,-0.766028,-0.733781,0.932968,1.165100,1.291089
4,-1.337818,1.038503,1.856182,-0.462404,-0.597339,-0.759847,-0.629157,-0.012881,1.172900,1.291089
...,...,...,...,...,...,...,...,...,...,...
20635,-0.758826,1.801647,-0.289187,-0.444985,-0.374942,-0.512592,-0.443449,-1.216128,-1.115804,-0.116739
20636,-0.818722,1.806329,-0.845393,-0.888704,-0.904909,-0.944405,-1.008420,-0.691593,-1.124470,-0.116739
20637,-0.823713,1.778237,-0.924851,-0.174995,-0.112325,-0.369537,-0.174042,-1.142593,-0.992746,-0.116739
20638,-0.873626,1.778237,-0.845393,-0.355600,-0.292135,-0.604429,-0.393753,-1.054583,-1.058608,-0.116739


In [7]:
# Spit the data from target
x, y = house_scaled.drop(['median_house_value'], axis=1), house_scaled['median_house_value']

In [8]:
x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-1.327835,1.052548,0.982143,-0.804819,-0.954593,-0.974429,-0.977033,2.344766,1.291089
1,-1.322844,1.043185,-0.607019,2.045890,1.356913,0.861439,1.669961,2.332238,1.291089
2,-1.332827,1.038503,1.856182,-0.535746,-0.810272,-0.820777,-0.843637,1.782699,1.291089
3,-1.337818,1.038503,1.856182,-0.624215,-0.703806,-0.766028,-0.733781,0.932968,1.291089
4,-1.337818,1.038503,1.856182,-0.462404,-0.597339,-0.759847,-0.629157,-0.012881,1.291089
...,...,...,...,...,...,...,...,...,...
20635,-0.758826,1.801647,-0.289187,-0.444985,-0.374942,-0.512592,-0.443449,-1.216128,-0.116739
20636,-0.818722,1.806329,-0.845393,-0.888704,-0.904909,-0.944405,-1.008420,-0.691593,-0.116739
20637,-0.823713,1.778237,-0.924851,-0.174995,-0.112325,-0.369537,-0.174042,-1.142593,-0.116739
20638,-0.873626,1.778237,-0.845393,-0.355600,-0.292135,-0.604429,-0.393753,-1.054583,-0.116739


In [9]:
y

0        2.129631
1        1.314156
2        1.258693
3        1.165100
4        1.172900
           ...   
20635   -1.115804
20636   -1.124470
20637   -0.992746
20638   -1.058608
20639   -1.017878
Name: median_house_value, Length: 20640, dtype: float64

In [10]:
def linearRegression(dataX, dataY, testSize):
    reg = linear_model.LinearRegression()

    # Split the dataset into training and testing
    x_train, x_test, y_train, y_test = train_test_split(dataX, dataY, test_size=testSize, random_state=42, shuffle=True)
    reg.fit(x_train, y_train)

    # Compute data and find result
    resultCoef = reg.coef_
    resultInter = reg.intercept_
    resultTrainsoc = reg.score(x_train, y_train)
    resultTestSoc = reg.score(x_test, y_test)

    # data collect
    return resultCoef, resultInter, resultTrainsoc,resultTestSoc


In [11]:
# Split the dataset train:test = 4:1
result1Coef, result1Inter, result1Trainsoc, result1TestSoc = linearRegression(x, y, 0.2)

print("[Train : test = 4 : 1]\n")
print('Coefficients: ', result1Coef)
print('Intercept: ', result1Inter)
print('TrainSet score: %.2f' % result1Trainsoc)
print('TestSet score: %.4f' % result1TestSoc)

[Train : test = 4 : 1]

Coefficients:  [-0.74324069 -0.78788497  0.12916171 -0.15382273  0.42632804 -0.37876811
  0.15308005  0.66696867 -0.00392516]
Intercept:  -0.006291849996386151
TrainSet score: 0.64
TestSet score: 0.5993


In [12]:
# Split the dataset train:test = 3:2
result2Coef, result2Inter, result2Trainsoc, result2TestSoc = linearRegression(x, y, 0.4)

print("[Train : Test = 3 : 2]\n")
print('Coefficients: ', result2Coef)
print('Intercept: ', result2Inter)
print('TrainSet score: %.2f' % result2Trainsoc)
print('TestSet score: %.4f' % result2TestSoc)

[Train : Test = 3 : 2]

Coefficients:  [-0.74699474 -0.78876111  0.12496187 -0.16206356  0.45970442 -0.37468376
  0.12105963  0.66613177 -0.00494608]
Intercept:  -0.007866047041059233
TrainSet score: 0.64
TestSet score: 0.6196
