In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import collections
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
X = data.drop(['SalePrice','Id'],axis=1)
Y = data[['SalePrice']]

## Handle Missing Data

In [5]:
#check if some column only have NaN values
columns = X.columns
for col in columns:
    if X[col].isnull().all():
        X.drop(col,axis=1, inplace=True)

In [6]:
num_columns = X.select_dtypes(exclude=['object'])
categ_columns = X.select_dtypes(['object'])

In [7]:
#Replace NAN in numerical column data by the mean of the column
X[num_columns.columns] = X[num_columns.columns].groupby(num_columns.columns, axis = 1).transform(lambda x: x.fillna(x.mean()))

In [8]:
def most_frequent_word(col):
    col = [x for x in col if str(x) != 'nan']
    counter = collections.Counter(col)
    return counter.most_common()[0][0]

In [9]:
for col in categ_columns:
    X[col].fillna(most_frequent_word(col),inplace=True)

In [10]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,l,Reg,Lvl,AllPub,Inside,...,0,0,o,e,e,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,l,Reg,Lvl,AllPub,FR2,...,0,0,o,e,e,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,l,IR1,Lvl,AllPub,Inside,...,0,0,o,e,e,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,l,IR1,Lvl,AllPub,Corner,...,0,0,o,e,e,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,l,IR1,Lvl,AllPub,FR2,...,0,0,o,e,e,0,12,2008,WD,Normal


## Using One-Hot encoding

In [11]:
#use one-hot encoding
one_hot_encoded_X = pd.get_dummies(X)

In [12]:
one_hot_encoded_X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [30]:
X_scaled = preprocessing.scale(one_hot_encoded_X)

  """Entry point for launching an IPython kernel.


## Split train and test

In [32]:
#Split train x test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, shuffle=True, random_state=42)

## Train

In [33]:
#train
reg = LinearRegression().fit(X_train, y_train)

## Test

In [34]:
#test
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
score = reg.score(X_test, y_test)

In [35]:
print(mse)

4.269605451769342e+34
