In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

from statistics import mean, stdev

In [2]:
# Load the cleaned training data, using the 'pid' column as the row index.
train = pd.read_csv('datasets/train_cleaned.csv', index_col='pid')

In [3]:
# Name of the response column; every other column is treated as a feature.
target = 'saleprice'
cols = [col for col in train.columns if col != target]

# The scaler is fit on the whole frame (target column included) because the
# test frame is later pushed through this same scaler with the same columns.
ss = StandardScaler()
ss.fit(train)

# Carry the original row labels over to the scaled frame. Without `index=`
# the result gets a fresh RangeIndex, so the scaled features would no longer
# share labels with `train[target]` and any label-aligned operation between
# the two would silently mismatch rows.
train_ss = pd.DataFrame(ss.transform(train), columns = train.columns, index = train.index)

In [4]:
# Features are taken from the standardised frame; the response is read from
# the unscaled training frame.
X = train_ss.loc[:, cols]
y = train.loc[:, target]

# Ordinary least squares fit; the fitted estimator is the cell's output.
lr = LinearRegression(n_jobs=-1)
lr.fit(X, y)

LinearRegression(n_jobs=-1)

In [5]:
# In-sample predictions: the model is applied to the same rows it was fit on.
lr.predict(X)

array([185051.42082605, 222281.20666939, 137242.3113485 , ...,
       177611.24743304, 115652.90308129, 192394.14645635])

In [6]:
# Score of the fitted model on the training rows themselves (not held-out data).
lr.score(X,y)

0.9162064109051988

In [7]:
# 10-fold cross-validation of the linear model, run on all available cores.
scores = cross_val_score(estimator=lr, X=X, y=y, cv=10, n_jobs=-1)

# Summarise the fold scores as "mean ± 2 sample standard deviations",
# each rounded to two decimals.
f'{round(mean(scores), 2)} \u00B1 {round(2 * stdev(scores), 2)}'

'0.91 ± 0.02'

In [8]:
# Load the cleaned test data, using the 'pid' column as the row index.
test = pd.read_csv('datasets/test_cleaned.csv', index_col='pid')

In [12]:
# One column of zeros with a row for every test observation.
zero_data = np.zeros((len(test), 1))

In [13]:
# Zero-filled single-column frame labelled like the test rows.
# Approach taken from:
# https://stackoverflow.com/questions/22963263/creating-a-zero-filled-pandas-data-frame
d = pd.DataFrame(
    zero_data,
    index = test.index,
    columns = ['saleprice'],
)

In [14]:
# Give the test frame the same column layout as the training frame so the
# scaler fitted on `train` can transform it.
#
# The placeholder column's position is taken from `train.columns` instead of
# a hard-coded offset, so it cannot drift out of sync with the training data.
# Any existing placeholder column is dropped first, so re-running the cell does
# not fail on an overlapping column in `join`. Selecting by `train.columns`
# raises a KeyError if the test frame lacks a training column.
test = (
    test
    .drop(columns=d.columns, errors='ignore')
    .join(d)
    .loc[:, train.columns]
)

In [15]:
# Standardise the test frame with the scaler fitted on the training data.
# `index=` keeps the original row labels; without it the scaled frame gets a
# fresh RangeIndex and can no longer be matched back to `test` by label.
test_ss = pd.DataFrame(ss.transform(test), columns=test.columns, index=test.index)

In [16]:
# Keep only the feature columns (everything except the target) for prediction.
X_test = test_ss.loc[:, cols]

In [21]:
# Predict the target for every test row with the model fitted on the training data.
predictions = lr.predict(X_test)

In [22]:
# Wrap the predictions in a Series named after the submission column.
# Note that this rebinds `predictions` from the model output to a Series.
predictions = pd.Series(data=predictions, name='SalePrice')

In [24]:
# Two-column submission table: the test row identifiers next to their
# predicted values, in the same row order as the test frame.
submission = pd.DataFrame({
    'Id': test.index,
    'SalePrice': predictions,
})

In [25]:
# Display the submission table to eyeball the ids and predicted values.
submission

Unnamed: 0,Id,SalePrice
0,902301120,153149.578553
1,905108090,156676.047784
2,528218130,230503.260924
3,902207150,103131.425250
4,535105100,161848.438302
...,...,...
873,527377110,187838.117889
874,535126140,218420.016384
875,904100040,122154.580823
876,527425140,113915.878322


In [26]:
# Write the submission to disk; index = False keeps the row counter out of the
# file so it contains only the 'Id' and 'SalePrice' columns.
submission.to_csv('datasets/05_22_submission.csv', index = False)

In [28]:
# Read the written file back and show its first five rows as a sanity check.
sub_test = pd.read_csv('datasets/05_22_submission.csv')
sub_test.head(n=5)

Unnamed: 0,Id,SalePrice
0,902301120,153149.578553
1,905108090,156676.047784
2,528218130,230503.260924
3,902207150,103131.42525
4,535105100,161848.438302
