# Play around with different models

I have 4 initial clean dataframes for 'big' models that till have many columns:
1. df1 = missing values dropped, and category 1 columns dropped
2. df2 = missing values dropped, and category 1 & 2 columns dropped
3. df3 = missing values dropped, and cat1 dropped
4. df4 = missing values dropped, and cat 1 & 2 dropped

Imports, read in data, start with overfit models and then simplify. Afterwards, start with simpler model and work my way up. Look back at heatmaps to eliminate more columns/ examine coefficients and p-values within df to eliminate more columns.

Basic Steps for each model:
1. Define X and y (Remove ID variables) 
2. Train-test split
3. Pre-processing
    - One hot encoding
    - Simple Imputer (for df3 & df4)
    - Interaction Variables/ Polynomial Features (try with and without)
    - Manually create some interaction variables
4. Instantiate Linear Regression Model
5. Analyze scores & submit to Kaggle

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

In [2]:
# Model 1 with df1 (simplest clean dataset- missings dropped, some columns dropped)

df1 = pd.read_csv('datasets/df1.csv')

In [3]:
df1.head()

Unnamed: 0.1,Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,...,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Fence,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,1,544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,...,0,0,0,0,,0,4,2009,WD,220000
1,2,153,535304180,20,RL,68.0,7922,Pave,Reg,Lvl,...,0,0,0,0,,0,1,2010,WD,109000
2,3,318,916386060,60,RL,73.0,9802,Pave,Reg,Lvl,...,0,0,0,0,,0,4,2010,WD,174000
3,4,255,906425045,50,RL,82.0,14235,Pave,IR1,Lvl,...,0,0,0,0,,0,3,2010,WD,138500
4,5,138,535126040,20,RL,137.0,16492,Pave,IR1,Lvl,...,0,0,0,0,,0,6,2010,WD,190000


In [4]:
df1.shape

(1598, 75)

In [5]:
df1_X = df1.drop(columns=['Unnamed: 0', 'Id', 'PID', 'SalePrice'])
df1_y = df1['SalePrice']

In [6]:
X1_train, X1_test, y1_train, y1_test = train_test_split(df1_X, df1_y, test_size= 0.2, random_state=24)

In [7]:
#Make lists of the numeric and string columns:
df1_str = list(df1_X.select_dtypes(include=['object']).columns)
df1_num = list(df1_X.select_dtypes(include=['int', 'float']).columns)


In [8]:
ct = ColumnTransformer([
    ('poly', PolynomialFeatures(include_bias=False), df1_num),
    ('scaler', StandardScaler(), df1_num), 
    ('oh', OneHotEncoder(sparse_output=False, drop='first'), df1_str)
], remainder='passthrough')  # 'passthrough' allows non-transformed columns to pass through

# Fit and transform the training data using the ColumnTransformer
X1_train_transformed = ct.fit_transform(X1_train)

# Transform the test data using the fitted ColumnTransformer
X1_test_transformed = ct.transform(X1_test)

ValueError: Found unknown categories ['Mansard'] in column 10 during transform

#Pre-processing:
#Referred to lesson 305 Feature Engineering
ct = ColumnTransformer([
    ('poly', PolynomialFeatures(include_bias=False), df1_num),
    ('oh', OneHotEncoder(sparse_output=False, drop='first'), df1_str)
], remainder='drop')

In [None]:
#lr = LinearRegression()
#lr.fit(df1_X, df1_y)

In [9]:
df5 = pd.read_csv('datasets/df5.csv')

In [10]:
df5.head()

Unnamed: 0.1,Unnamed: 0,SalePrice,Id,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,1st Flr SF,Year Built,...,BsmtFin SF 1,Wood Deck SF,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Cond,Functional,Sale Type
0,0,130500,109,6,1479,475.0,2.0,725.0,725,1976,...,533.0,0,Sawyer,RRAe,Norm,1Fam,2Story,8,Typ,WD
1,1,220000,544,7,2122,559.0,2.0,913.0,913,1996,...,637.0,0,SawyerW,Norm,Norm,1Fam,2Story,5,Typ,WD
2,2,109000,153,5,1057,246.0,1.0,1057.0,1057,1953,...,731.0,0,NAmes,Norm,Norm,1Fam,1Story,7,Typ,WD
3,3,174000,318,5,1444,400.0,2.0,384.0,744,2006,...,0.0,100,Timber,Norm,Norm,1Fam,2Story,5,Typ,WD
4,4,138500,255,6,1445,484.0,2.0,676.0,831,1900,...,0.0,0,SawyerW,Norm,Norm,1Fam,1.5Fin,8,Typ,WD


In [11]:
df5_X = df5.drop(columns=['Unnamed: 0', 'Id', 'SalePrice'])
df5_y = df5['SalePrice']

In [12]:
X5_train, X5_test, y5_train, y5_test = train_test_split(df5_X, df5_y, test_size= 0.2, random_state=24)

In [16]:
df5_train_str = list(X5_train.select_dtypes(include=['object']).columns)
df5_test_str = list(X5_test.select_dtypes(include=['object']).columns)
df5_train_num = list(X5_train.select_dtypes(include=['int', 'float']).columns)
df5_test_num = list(X5_test.select_dtypes(include=['int', 'float']).columns)

In [17]:
imputer = SimpleImputer(strategy='mean')
X5_train_transformed = imputer.fit_transform(X5_train[df5_train_num])


ct = ColumnTransformer([
    ('poly', PolynomialFeatures(include_bias=False), X5_train_transformed),
    ('scaler', StandardScaler(), X5_train_transformed),
    ('oh', OneHotEncoder(sparse_output=False, drop='first'), df5_train_str)
], remainder='passthrough')

# Fit and transform the training data using the ColumnTransformer
X5_train_transformed = ct.fit_transform(X5_train)

# Transform the test data using the fitted ColumnTransformer
X5_test_transformed = ct.transform(X5_test)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

In [19]:
df6 = pd.read_csv('datasets/df5.csv')

df6 = df6.dropna()

df6_X = df6.drop(columns=['Unnamed: 0', 'Id', 'SalePrice'])
df6_y = df6['SalePrice']

X6_train, X6_test, y6_train, y6_test = train_test_split(df6_X, df6_y, test_size= 0.2, random_state=24)

df6_train_str = list(X6_train.select_dtypes(include=['object']).columns)
df6_test_str = list(X6_test.select_dtypes(include=['object']).columns)
df6_train_num = list(X6_train.select_dtypes(include=['int', 'float']).columns)
df6_test_num = list(X6_test.select_dtypes(include=['int', 'float']).columns)

ct = ColumnTransformer([
    ('poly', PolynomialFeatures(include_bias=False), df6_train_num),
    ('scaler', StandardScaler(), df6_train_num),
    ('oh', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'), df6_train_str)
], remainder='passthrough')

# Fit and transform the training data using the ColumnTransformer
X6_train_transformed = ct.fit_transform(X6_train)

# Transform the test data using the fitted ColumnTransformer
X6_test_transformed = ct.transform(X6_test)

lr6 = LinearRegression()

lr6.fit(X6_train_transformed, y6_train)

r2_train = lr6.score(X6_train_transformed, y6_train)

r2_test = lr6.score(X6_test_transformed, y6_test)

y6_pred = lr6.predict(X6_test_transformed)

print(r2_train)
print(r2_test)
print(metrics.mean_squared_error(y6_test, y6_pred))

0.9419309322015498
0.8988941760085423
679339772.2771369




In [None]:
df6 = pd.read_csv('datasets/df5.csv')
df6 = df6.dropna()
df6_X = df6.drop(columns=['Unnamed: 0', 'Id', 'SalePrice'])
df6_y = df6['SalePrice']
X6_train, X6_test, y6_train, y6_test = train_test_split(df6_X, df6_y, test_size=0.2, random_state=24)
df6_train_str = list(X6_train.select_dtypes(include=['object']).columns)
df6_test_str = list(X6_test.select_dtypes(include=['object']).columns)
df6_train_num = list(X6_train.select_dtypes(include=['int', 'float']).columns)
df6_test_num = list(X6_test.select_dtypes(include=['int', 'float']).columns)

# Create a ColumnTransformer with PolynomialFeatures, StandardScaler, and OneHotEncoder
ct = ColumnTransformer([
    ('poly', PolynomialFeatures(include_bias=False), df6_train_num),
    ('scaler', StandardScaler(), df6_train_num),
    ('oh', OneHotEncoder(sparse_output=False, drop='first'), df6_train_str)
], remainder='passthrough')

# Fit and transform the training data using the ColumnTransformer
X6_train_transformed = ct.fit_transform(X6_train)

# Transform the test data using the fitted ColumnTransformer
X6_test_transformed = ct.transform(X6_test)

# Create an ElasticNet model
elastic_net = ElasticNet()
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0],      # Values of alpha to test
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]  # Values of l1_ratio to test
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=elastic_net, param_grid=param_grid, cv=5)

# Fit the GridSearchCV object to the training data
grid_search.fit(X6_train_transformed, y6_train)

# Get the best hyperparameters from the search
best_alpha = grid_search.best_params_['alpha']
best_l1_ratio = grid_search.best_params_['l1_ratio']

print(f"Best alpha: {best_alpha}")
print(f"Best l1_ratio: {best_l1_ratio}")

In [None]:
df6 = pd.read_csv('datasets/df5.csv')
df6 = df6.dropna()
df6_X = df6.drop(columns=['Unnamed: 0', 'Id', 'SalePrice'])
df6_y = df6['SalePrice']
X6_train, X6_test, y6_train, y6_test = train_test_split(df6_X, df6_y, test_size=0.2, random_state=4)
df6_train_str = list(X6_train.select_dtypes(include=['object']).columns)
df6_test_str = list(X6_test.select_dtypes(include=['object']).columns)
df6_train_num = list(X6_train.select_dtypes(include=['int', 'float']).columns)
df6_test_num = list(X6_test.select_dtypes(include=['int', 'float']).columns)

# Create a ColumnTransformer with PolynomialFeatures, StandardScaler, and OneHotEncoder
ct = ColumnTransformer([
    ('poly', PolynomialFeatures(include_bias=False), df6_train_num),
    ('scaler', StandardScaler(), df6_train_num),
    ('oh', OneHotEncoder(sparse_output=False, drop='first'), df6_train_str)
], remainder='passthrough')

# Fit and transform the training data using the ColumnTransformer
X6_train_transformed = ct.fit_transform(X6_train)

# Transform the test data using the fitted ColumnTransformer
X6_test_transformed = ct.transform(X6_test)

# Create an ElasticNet model with regularization
elastic_net = ElasticNet(alpha=.01, l1_ratio=0.5)  # You can adjust alpha and l1_ratio as needed

# Fit the ElasticNet model to the training data
elastic_net.fit(X6_train_transformed, y6_train)

# Calculate the R-squared score on the training and test data
train_score = elastic_net.score(X6_train_transformed, y6_train)
test_score = elastic_net.score(X6_test_transformed, y6_test)

# Make predictions on the test data
y6_pred = elastic_net.predict(X6_test_transformed)

# Calculate the mean squared error
mse = metrics.mean_squared_error(y6_test, y6_pred)

print(f"Training R-squared: {train_score}")
print(f"Test R-squared: {test_score}")
print(f"Mean Squared Error: {mse}")

In [None]:
df6 = pd.read_csv('datasets/df5.csv')
df6 = df6.dropna()
df6_X = df6.drop(columns=['Unnamed: 0', 'Id', 'SalePrice'])
df6_y = df6['SalePrice']
X6_train, X6_test, y6_train, y6_test = train_test_split(df6_X, df6_y, test_size=0.2, random_state=4)
df6_train_str = list(X6_train.select_dtypes(include=['object']).columns)
df6_test_str = list(X6_test.select_dtypes(include=['object']).columns)
df6_train_num = list(X6_train.select_dtypes(include=['int', 'float']).columns)
df6_test_num = list(X6_test.select_dtypes(include=['int', 'float']).columns)

# Create a ColumnTransformer with PolynomialFeatures, StandardScaler, and OneHotEncoder
ct = ColumnTransformer([
    ('poly', PolynomialFeatures(include_bias=False), df6_train_num),
    ('scaler', StandardScaler(), df6_train_num),
    ('oh', OneHotEncoder(sparse_output=False, drop='first'), df6_train_str)
], remainder='passthrough')

# Fit and transform the training data using the ColumnTransformer
X6_train_transformed = ct.fit_transform(X6_train)

# Transform the test data using the fitted ColumnTransformer
X6_test_transformed = ct.transform(X6_test)

# Create a Lasso model with regularization
lasso = Lasso(alpha=.01)  # You can adjust the alpha parameter as needed

# Fit the Lasso model to the training data
lasso.fit(X6_train_transformed, y6_train)

# Calculate the R-squared score on the training and test data
train_score = lasso.score(X6_train_transformed, y6_train)
test_score = lasso.score(X6_test_transformed, y6_test)

# Make predictions on the test data
y6_pred = lasso.predict(X6_test_transformed)

# Calculate the mean squared error
mse = metrics.mean_squared_error(y6_test, y6_pred)

print(f"Training R-squared: {train_score}")
print(f"Test R-squared: {test_score}")
print(f"Mean Squared Error: {mse}")