### Notebook Report

In [211]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from category_encoders.one_hot import OneHotEncoder
from sklearn.linear_model import LinearRegression,\
                                 Lasso, Ridge

from scipy import stats

In [212]:
def print_results(scores_list):
    """Summarise a collection of scores as a printable multi-line string.

    Parameters
    ----------
    scores_list : sequence of numbers
        Scores to summarise. At least two values are required, because the
        sample standard deviation is undefined for fewer.

    Returns
    -------
    str
        Text showing ``[min, max]`` and ``mean ± 2 * stdev``, with every
        number rounded to two decimals.

    Raises
    ------
    ValueError
        If ``scores_list`` is empty (raised by ``min``).
    statistics.StatisticsError
        If ``scores_list`` holds a single value (raised by ``stdev``).
    """
    # `mean` and `stdev` are not brought into scope by the notebook's visible
    # import cell, so import them here; without this the call raises NameError.
    from statistics import mean, stdev

    return f'''
    [min, max] = 
                        {[round(min(scores_list), 2), round(max(scores_list), 2)]}
    
    confidence interval: 
                         {round(mean(scores_list), 2)} \u00B1 {round(2 * stdev(scores_list), 2)}
    '''

#### Loading the datasets

In [213]:
# keep_default_na=False stops pandas from converting strings such as 'NA' to NaN:
# 'NA' is a real category code in this data, and blank fields are read as ''.
train = pd.read_csv('../datasets/train.csv', keep_default_na=False)

In [214]:
# Same parsing rule as for train: keep 'NA' as a literal category code and read
# blank fields as '' instead of NaN.
test = pd.read_csv('../datasets/test.csv', keep_default_na=False)

#### Cleaning column names

In [215]:
# Normalise the headers of both frames the same way:
# lower-case everything and turn spaces into underscores.
train.columns = [header.lower().replace(' ', '_') for header in train.columns]
test.columns = [header.lower().replace(' ', '_') for header in test.columns]

In [216]:
#column names look good
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               2051 non-null   int64 
 1   pid              2051 non-null   int64 
 2   ms_subclass      2051 non-null   int64 
 3   ms_zoning        2051 non-null   object
 4   lot_frontage     2051 non-null   object
 5   lot_area         2051 non-null   int64 
 6   street           2051 non-null   object
 7   alley            2051 non-null   object
 8   lot_shape        2051 non-null   object
 9   land_contour     2051 non-null   object
 10  utilities        2051 non-null   object
 11  lot_config       2051 non-null   object
 12  land_slope       2051 non-null   object
 13  neighborhood     2051 non-null   object
 14  condition_1      2051 non-null   object
 15  condition_2      2051 non-null   object
 16  bldg_type        2051 non-null   object
 17  house_style      2051 non-null   

A few of these columns do not have the right dtype when compared with the data dictionary; I will convert those columns below.

#### Make columns the right dtype

**Train fix dtypes:**

In [217]:
train['ms_subclass'] = train['ms_subclass'].map(str)

In [218]:
train['sale_type'].dtype

dtype('O')

In [219]:
make_int = ['bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf',
            'total_bsmt_sf', 'bsmt_full_bath', 'bsmt_half_bath',
            'garage_yr_blt', 'garage_cars', 'garage_area']

Below, we assume that an empty string means the feature is missing for that property.

In [220]:
for col in make_int:
    train[col] = train[col].replace('', 0)

In [221]:
for col in make_int: 
    print(col, sum(train[col]==''))

bsmtfin_sf_1 0
bsmtfin_sf_2 0
bsmt_unf_sf 0
total_bsmt_sf 0
bsmt_full_bath 0
bsmt_half_bath 0
garage_yr_blt 0
garage_cars 0
garage_area 0


In [222]:
for col in make_int:
    train[col] = train[col].map(int)

In [223]:
train['bsmtfin_sf_1'].dtype

dtype('int64')

**Test dtype fix:**

In [224]:
test['ms_subclass'] = test['ms_subclass'].map(str)

In [225]:
for col in make_int:
    test[col] = test[col].replace('', 0)

In [226]:
for col in make_int:
    test[col] = test[col].map(int)

In [227]:
test[make_int].dtypes

bsmtfin_sf_1      int64
bsmtfin_sf_2      int64
bsmt_unf_sf       int64
total_bsmt_sf     int64
bsmt_full_bath    int64
bsmt_half_bath    int64
garage_yr_blt     int64
garage_cars       int64
garage_area       int64
dtype: object

Great, the columns have been made into ints, so they are now properly numerical.

#### Check for nulls

In [228]:
train.isnull().sum().sum()

0

In [229]:
test.isnull().sum().sum()

0

### Make Ordinal Columns

While I do not anticipate using all the columns I am about to transform into ordinal in my project statement model, I cannot anticipate which of these columns may become useful in the predictive model for Kaggle. So I will go ahead and make sure the dataframe has the right ordinal columns.

For Categorical data, I will use OneHotEncoder

In [230]:
# Ex, Gd, TA, Fa, Po
make_ordinal_5 = ['exter_qual', 'exter_cond', 'heating_qc', 
                  'kitchen_qual']

In [231]:
for col in make_ordinal_5: 
    print(col, sum(train[col]==''))

exter_qual 0
exter_cond 0
heating_qc 0
kitchen_qual 0


Great - no empty strings, so we move on to encode these ordinal columns

In [232]:
for col in make_ordinal_5:
    print(col, train[col].unique())

exter_qual ['Gd' 'TA' 'Ex' 'Fa']
exter_cond ['TA' 'Gd' 'Fa' 'Ex' 'Po']
heating_qc ['Ex' 'TA' 'Gd' 'Fa' 'Po']
kitchen_qual ['Gd' 'TA' 'Fa' 'Ex']


In [233]:
ordinal_5 = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1
}

for col in make_ordinal_5:
    train[col] = train[col].map(ordinal_5)

train[make_ordinal_5].tail()

Unnamed: 0,exter_qual,exter_cond,heating_qc,kitchen_qual
2046,4,3,5,4
2047,3,3,5,3
2048,3,3,4,3
2049,3,3,3,3
2050,3,3,4,4


My values look good, now they are ordinal :)

In [234]:
# Ex, Gd, TA, Fa, Po, NA
make_ordinal_0_5 = ['bsmt_qual', 'bsmt_cond', 'fireplace_qu',
                    'garage_qual', 'garage_cond', 'pool_qc']

In [235]:
for col in make_ordinal_0_5: 
    print(col, sum(train[col]==''))

bsmt_qual 1
bsmt_cond 1
fireplace_qu 0
garage_qual 1
garage_cond 1
pool_qc 0


In [236]:
for col in make_ordinal_0_5:
    train[col] = train[col].replace('', train[col].value_counts().idxmax())

I have imputed the columns in make_ordinal_0_5 with their mode.

In [237]:
ordinal_0_5 = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1,
    'NA': 0
}

In [238]:
for col in make_ordinal_0_5:
    train[col] = train[col].map(ordinal_0_5)

In [239]:
train[make_ordinal_0_5].tail()

Unnamed: 0,bsmt_qual,bsmt_cond,fireplace_qu,garage_qual,garage_cond,pool_qc
2046,4,3,4,3,3,0
2047,3,3,0,3,3,0
2048,3,3,3,2,2,0
2049,3,3,4,3,3,0
2050,3,3,3,3,3,0


Great, those columns look properly ordinal 

In [240]:
# Levels from highest to lowest code: Gd, Av, Mn, No, NA
# (the last level is spelled 'NA' in the data).
make_ordinal_0_4 = ['bsmt_exposure']

In [241]:
for col in make_ordinal_0_4: 
    print(col, sum(train[col]==''))

bsmt_exposure 4


In [242]:
for col in make_ordinal_0_4:
    train[col] = train[col].replace('', train[col].value_counts().idxmax())

In [243]:
for col in make_ordinal_0_4:
    print(col, train[col].unique())

bsmt_exposure ['No' 'Gd' 'Av' 'NA' 'Mn']


In [244]:
ordinal_0_4 = {
    'Gd': 4,
    'Av': 3,
    'Mn': 2,
    'No': 1,
    'NA': 0
}

In [245]:
for col in make_ordinal_0_4:
    train[col] = train[col].map(ordinal_0_4)

In [246]:
train[make_ordinal_0_4].tail()

Unnamed: 0,bsmt_exposure
2046,3
2047,1
2048,1
2049,1
2050,1


In [247]:
# GdPrv, MnPrv, GdWo, MnWw, NA
make_ordinal_fence = ['fence']

In [248]:
for col in make_ordinal_fence: 
    print(col, sum(train[col]==''))

fence 0


In [249]:
for col in make_ordinal_fence:
    print(col, train[col].unique())

fence ['NA' 'MnPrv' 'GdPrv' 'GdWo' 'MnWw']


In [250]:
ordinal_fence = {
    'GdPrv': 4,
    'MnPrv': 3,
    'GdWo': 2,
    'MnWw': 1,
    'NA': 0
}

In [251]:
for col in make_ordinal_fence:
    train[col] = train[col].map(ordinal_fence)

In [252]:
train[make_ordinal_fence].tail()

Unnamed: 0,fence
2046,0
2047,0
2048,0
2049,0
2050,0


Fence is now ordinal

In [253]:
# GLQ, ALQ, BLQ, Rec, LwQ, Unf, NA 
make_ordinal_0_6 = ['bsmtfin_type_1', 'bsmtfin_type_2']

In [254]:
for col in make_ordinal_0_6: 
    print(col, sum(train[col]==''))

bsmtfin_type_1 1
bsmtfin_type_2 2


In [255]:
for col in make_ordinal_0_6:
    train[col] = train[col].replace('', train[col].value_counts().idxmax())

In [256]:
for col in make_ordinal_0_6:
    print(col, train[col].unique())

bsmtfin_type_1 ['GLQ' 'Unf' 'ALQ' 'Rec' 'NA' 'BLQ' 'LwQ']
bsmtfin_type_2 ['Unf' 'Rec' 'NA' 'BLQ' 'GLQ' 'LwQ' 'ALQ']


In [257]:
ordinal_0_6 = {
    'GLQ': 6,
    'ALQ': 5,
    'BLQ': 4,
    'Rec': 3,
    'LwQ': 2,
    'Unf': 1,
    'NA': 0
}

In [258]:
for col in make_ordinal_0_6:
    train[col] = train[col].map(ordinal_0_6)

In [259]:
train[make_ordinal_0_6].tail()

Unnamed: 0,bsmtfin_type_1,bsmtfin_type_2
2046,6,1
2047,4,1
2048,1,1
2049,3,2
2050,1,1


The columns look good!

#### Make a function to do this for test or train data

In [260]:
def make_ordinal_train(col_list, ord_dict):
    """Impute and ordinal-encode columns of the global ``train`` frame in place.

    Works in two passes over ``col_list``: first every empty string is replaced
    by that column's most frequent value, then every column is translated
    through ``ord_dict``.

    Parameters
    ----------
    col_list : list of str
        Names of the ``train`` columns to encode.
    ord_dict : dict
        Mapping from category label to ordinal code. Labels that are not keys
        of this mapping come out as NaN.
    """
    # Pass 1: fill blanks with each column's mode.
    for name in col_list:
        column = train[name]
        most_common = column.value_counts().idxmax()
        train[name] = column.replace('', most_common)

    # Pass 2: swap the labels for their ordinal codes.
    for name in col_list:
        train[name] = train[name].map(ord_dict)

In [261]:
def make_ordinal_test(col_list, ord_dict):
    """Impute and ordinal-encode columns of the global ``test`` frame in place.

    Empty strings in ``test[col]`` are filled from the most frequent value of
    the same column in the global ``train`` frame (train's mode is used so that
    nothing is learned from test); all other labels are translated through
    ``ord_dict``.

    Parameters
    ----------
    col_list : list of str
        Columns of ``test`` (also present in ``train``) to encode.
    ord_dict : dict
        Mapping from category label to ordinal code.

    Notes
    -----
    Labels that are neither ``''`` nor a key of ``ord_dict`` become NaN, as
    with any ``Series.map`` lookup.
    """
    for col in col_list:
        train_mode = train[col].value_counts().idxmax()
        # In this notebook `train` is encoded before this function runs, so its
        # mode is normally already a code (e.g. 3) rather than a label ('TA').
        # Writing that code into `test` and then mapping through `ord_dict`
        # would turn every imputed cell into NaN. Resolve the fill value to a
        # code first: translate it if it is still a label, keep it otherwise.
        fill_code = ord_dict.get(train_mode, train_mode)
        test[col] = test[col].map({**ord_dict, '': fill_code})

In [262]:
# Typ, Min1, Min2, Mod, Maj1, Maj2, Sev, Sal
make_ordinal_8 = ['functional']

In [263]:
# Home-functionality scale, highest code = best, following the order in which
# the levels are listed: Typ, Min1, Min2, Mod, Maj1, Maj2, Sev, Sal.
# Min1 therefore outranks Min2, just as Maj1 outranks Maj2.
# NOTE: despite the `0_8` in the name there is no 0 level; codes run from 1 to 8.
ordinal_0_8 = {
    'Typ': 8,
    'Min1': 7,
    'Min2': 6,
    'Mod': 5,
    'Maj1': 4,
    'Maj2': 3,
    'Sev': 2,
    'Sal': 1
}

In [264]:
make_ordinal_train(make_ordinal_8, ordinal_0_8)

In [265]:
train[make_ordinal_8].head(3)

Unnamed: 0,functional
0,8
1,8
2,8


Function works and the column looks good!

In [266]:
# Levels from highest to lowest code: Fin, RFn, Unf, NA
make_ordinal_0_3 = ['garage_finish']

In [267]:
ordinal_0_3 = {
    'Fin': 3,
    'RFn': 2,
    'Unf': 1,
    'NA': 0   
}

In [268]:
make_ordinal_train(make_ordinal_0_3, ordinal_0_3)

In [269]:
train['garage_finish'].unique()

array([2, 1, 3, 0])

#### Make Ordinals for test

In [270]:
dictionaries = [ordinal_5, ordinal_0_5, 
                ordinal_0_4, ordinal_fence,
                ordinal_0_6, ordinal_0_8, 
                ordinal_0_3
               ]

In [271]:
make_ord_list = [make_ordinal_5, make_ordinal_0_5,
                 make_ordinal_0_4, make_ordinal_fence,
                 make_ordinal_0_6, make_ordinal_8,
                 make_ordinal_0_3
                ]

In [272]:
# `make_ord_list` and `dictionaries` are index-aligned: pair every column
# group with its encoding dictionary and encode those columns in `test`.
for cols, mapping in zip(make_ord_list, dictionaries):
    make_ordinal_test(cols, mapping)

In [273]:
test[make_ordinal_0_3].head()

Unnamed: 0,garage_finish
0,Unf
1,Fin
2,RFn
3,Unf
4,RFn


The columns in test have now been made ordinal!

#### Looking at shapes, and nulls

In [274]:
train.shape

(2051, 81)

In [275]:
test.shape

(878, 80)

In [276]:
train.isnull().sum().sum()

0

In [279]:
test.isnull().sum().sum()

0

No apparent nulls for now.

#### EDA

EDA on numericals

In [9]:
# Headers were normalised to lower-case snake_case when the data was loaded, so
# the raw CSV names ('Gr Liv Area', 'SalePrice') no longer exist in `train`.
X = train[['gr_liv_area']]
y = train['saleprice']

In [15]:
lr = LinearRegression() #instantiate the linear model
lr.fit(X=X, y=y)

LinearRegression()

In [17]:
# `test` headers were lower-cased / snake_cased too, so the feature column is
# 'gr_liv_area', not 'Gr Liv Area'.
X_test = test[['gr_liv_area']]

preds = lr.predict(X_test)

In [19]:
test['SalePrice'] = preds  

In [20]:
# The identifier column is called 'id' after header normalisation; select it
# under that name and rename it back to 'Id' so the exported file keeps the
# 'Id' / 'SalePrice' headers this cell originally asked for.
submission = test[['id', 'SalePrice']].rename(columns={'id': 'Id'})

In [21]:
submission.shape

(878, 2)

In [22]:
pwd

'/Users/luisagonzalez/Library/CloudStorage/OneDrive-Personal/Documents/General_Assembly/Projects/project-2'

In [23]:
# Write next to the input CSVs: this notebook reads its data from
# '../datasets/', so the export uses the same directory.
submission.to_csv('../datasets/basic_model.csv', index=False)