In [1]:
from sklearn import linear_model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## LOAD DATA:

In [2]:
# Load Data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [3]:
# Show first 5 entries of Train Data
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Show first 5 entries of Test Data
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
# Store Ids of test data for later in the csv creation to kaggle submition
test_ids = test_data["Id"]

In [6]:
# Load Targets 
y = train_data["SalePrice"]
# Convert to numpy array
y = np.array(y)

print(y.shape)

(1460,)


## Prepare Data for Machine Learning:

**Don't look much the next piece of code** basicly do:
    - Numerical Data:
        Change NaN for the median of the column
    - Categorical Data:
        Change to Int
        Hot One encode Categorical Data

In [7]:
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

imputer = Imputer(strategy="median")
scaler = StandardScaler()
hot_enc = OneHotEncoder()

def prepare_data(dataframe, feature_list=list(), training=True):

    if not feature_list:
        feature_list = list(dataframe)
        
    dataframe = dataframe[feature_list].copy()
    
    # Split dataframe in numerical and categorial data
    num_data = dataframe.select_dtypes(include=[np.number])
    cat_data = dataframe.select_dtypes(include=[object])

    if not num_data.empty:
        
        # Replace all NaN with the median in numerial data
        if training:
            imputer.fit(num_data)
        X_num = imputer.transform(num_data)
        
        # Scale between -1, 1
        if training:
            scaler.fit(X_num)
        
        X_num = scaler.transform(X_num)

        # Check if have categorical data 
        if cat_data.empty:  
            return X_num
    
    if not cat_data.empty:

        # Replace all NaN with "None" as other category in categorical data 
        cat_data.fillna('None', inplace=True)

        facto_cat_data = pd.DataFrame()
        # Factorize each categorical column (string -> int)
        for feature in list(cat_data):
            facto_cat_data[feature], _ = pd.factorize(cat_data[feature],)
            
        # Hot encode
        if training:
            hot_enc.fit(facto_cat_data.values)
        X_cat_1hot = hot_enc.transform(facto_cat_data.values).todense()

        # Check if have numerical data 
        if num_data.empty:
            return X_cat_1hot

    # Merge Numerical Data with One Hot encoded categorical data
    X = np.append(X_num, X_cat_1hot, axis=1)
    
    return X


## Extract our Training Features

 ***TODO: Select Better Features***

In [8]:
# Load Selected features
# X = np.array([  # TODO: Select better features
#                 train_data["YearBuilt"], 
#                 train_data["YrSold"], 
#                 train_data["LotArea"]
#             ])

# T = np.array([  # TODO: Select better features
#                 test_data["YearBuilt"], 
#                 test_data["YrSold"], 
#                 test_data["LotArea"]
#             ])

training_features = ["YearBuilt", "YrSold", "LotArea", "Street"] # <- TODO


X = prepare_data(train_data, training_features, True) # Training Data need to be True
T = prepare_data(test_data, training_features, False) # Test Data need to be False

X.shape, T.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


((1460, 5), (1459, 5))

In [None]:
#Training Features
#             #TamaNo    #Cuartos
# X = np.array([[1000,   2],
#               [2000,   2], 
#               [3000,   3], 
#               [10000,  5]])
# # Training Tagets
# y = np.array([2500, 5800, 7800, 18000])
# Test Data
#T = [[4000, 3], [5000, 5], [6000, 2], [7000, 8]]

# Initialize Regression Object
reg = linear_model.LinearRegression()

# Training
reg.fit(X, y)


In [None]:
# Predictions

# Predict
pred = reg.predict(T)

In [None]:
# For this problem Kaggle do not accept negatives values
# NOTE: We know negatives value are wrong

# TODO Select one

# Option 1: Saturate
if False:
    pred[pred < 0] = 0

# Option 2: Absolute Value
if True:
    pred = np.abs(pred)

In [None]:
# Create a "table" each index name is column name
# Kaggle Format
df_dict = {"SalePrice" : pred,
           "Id" : test_ids }

# Convert to Pandas DataFrame
df = pd.DataFrame(df_dict)

# Show Some data
df.head()

In [None]:
# Save in to csv file
pred_filename = "predictions-with-" + "+".join(training_features) + ".csv"  # Formatting
df.to_csv(pred_filename, index=False)
print("Output file: " + pred_filename)