In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:


# Load the training data
# Both CSV paths are relative, so the files must sit in the kernel's working directory.
train_data = pd.read_csv('train.csv')
# NOTE(review): only the last expression of a cell is rendered, so this preview is
# evaluated but not shown; the recorded output is the test preview alone.
train_data.head()
# Load the test data
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:

# Report (rows, columns) for each raw frame
for label, frame in (
    ('Training data shape:', train_data),
    ('Test data shape:', test_data),
):
    print(label, frame.shape)


Training data shape: (1460, 81)
Test data shape: (1459, 80)


In [6]:

# Check the data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line
# after each report.
print('\nTraining data info:')
train_data.info()
print('\nTest data info:')
test_data.info()


Training data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   

In [7]:

# Fill numerical missing values with the median.
# The median is computed on the training data only and reused for the test
# data, so both frames are imputed with the same statistic and nothing is
# estimated from the test set.
# The filled column is assigned back explicitly: `fillna(..., inplace=True)`
# on a column selection can act on a temporary copy and leave the frame
# unchanged (pandas warns about this and Copy-on-Write makes it a no-op).
for column in train_data.select_dtypes(exclude='object').columns:
    column_median = train_data[column].median()
    train_data[column] = train_data[column].fillna(column_median)

    # The training frame has columns the test frame lacks, so guard the lookup.
    if column in test_data.columns:
        test_data[column] = test_data[column].fillna(column_median)


In [8]:

# Fill categorical missing values with the mode.
# The most frequent value is taken from the training data only and reused for
# the test data, so both frames receive the same replacement category.
# The filled column is assigned back explicitly: `fillna(..., inplace=True)`
# on a column selection can act on a temporary copy and leave the frame
# unchanged (pandas warns about this and Copy-on-Write makes it a no-op).
for column in train_data.select_dtypes(include='object').columns:
    # mode() returns its values sorted; [0] picks the first one when there are ties.
    most_frequent = train_data[column].mode()[0]
    train_data[column] = train_data[column].fillna(most_frequent)

    if column in test_data.columns:
        test_data[column] = test_data[column].fillna(most_frequent)


In [9]:

# Count the cells that are still null in each frame after imputation
for label, frame in (
    ('Missing values in training data:', train_data),
    ('Missing values in test data:', test_data),
):
    print(label, frame.isnull().sum().sum())


# Replace every categorical column with one indicator column per category.
# Each frame is encoded on its own, so the resulting column sets can differ.
train_data, test_data = (
    pd.get_dummies(frame) for frame in (train_data, test_data)
)


Missing values in training data: 0
Missing values in test data: 0


In [10]:

# Restrict both frames to the columns they share (inner join on the column axis).
# Any column that exists in only one frame is dropped by this step.
train_data, test_data = train_data.align(
    test_data,
    join='inner',
    axis=1,
)

# Report the shapes after alignment; the column counts should now match
for label, frame in (
    ('Training data shape:', train_data),
    ('Test data shape:', test_data),
):
    print(label, frame.shape)


Training data shape: (1460, 271)
Test data shape: (1459, 271)


In [32]:
 # Load the data again
# Start over from the raw files: the frames built above were already encoded and
# aligned, and this second pass separates the target before encoding.
train_data, test_data = (
    pd.read_csv(path) for path in ('train.csv', 'test.csv')
)


In [33]:

# Fill numerical missing values with the median.
# The median is computed on the training data only and reused for the test
# data, so both frames are imputed with the same statistic and nothing is
# estimated from the test set.
# The filled column is assigned back explicitly: `fillna(..., inplace=True)`
# on a column selection can act on a temporary copy and leave the frame
# unchanged (pandas warns about this and Copy-on-Write makes it a no-op).
for column in train_data.select_dtypes(exclude='object').columns:
    column_median = train_data[column].median()
    train_data[column] = train_data[column].fillna(column_median)

    # The training frame has columns the test frame lacks, so guard the lookup.
    if column in test_data.columns:
        test_data[column] = test_data[column].fillna(column_median)


In [34]:

# Fill categorical missing values with the mode.
# The most frequent value is taken from the training data only and reused for
# the test data, so both frames receive the same replacement category.
# The filled column is assigned back explicitly: `fillna(..., inplace=True)`
# on a column selection can act on a temporary copy and leave the frame
# unchanged (pandas warns about this and Copy-on-Write makes it a no-op).
for column in train_data.select_dtypes(include='object').columns:
    # mode() returns its values sorted; [0] picks the first one when there are ties.
    most_frequent = train_data[column].mode()[0]
    train_data[column] = train_data[column].fillna(most_frequent)

    if column in test_data.columns:
        test_data[column] = test_data[column].fillna(most_frequent)


In [35]:

# Split the target off the feature frame: pop() hands back the 'SalePrice'
# column and removes it from train_data in the same step.
y = train_data.pop('SalePrice')


In [36]:

# Expand each categorical column into one indicator column per category.
# The frames are encoded independently, so their column sets may differ
# until they are aligned in a later cell.
train_data, test_data = (
    pd.get_dummies(frame) for frame in (train_data, test_data)
)

train_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [37]:

# Preview the encoded test frame so its columns can be compared with the training preview
test_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [38]:

# Align the training and test data, keep only columns present in both dataframes
# join='inner' with axis=1 intersects the column labels; rows are left untouched.
# Indicator columns produced for a category that occurs in only one frame are dropped here.
train_data, test_data = train_data.align(test_data, join='inner', axis=1)


In [39]:
# Preview the training features after alignment with the test columns
train_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [40]:

# After alignment both frames should report the same number of columns
for label, frame in (
    ('Training data shape:', train_data),
    ('Test data shape:', test_data),
):
    print(label, frame.shape)


Training data shape: (1460, 271)
Test data shape: (1459, 271)


In [51]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,r2_score



In [52]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# partition repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data,
    y,
    test_size=0.2,
    random_state=0,
)



In [53]:
# Baseline: linear regression fitted on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Score the model on the held-out validation split
predictions = model.predict(X_valid)

mae = mean_absolute_error(y_valid, predictions)
r2s = r2_score(y_valid, predictions)

print('Mean Absolute Error:', mae)
# r2_score is the coefficient of determination, not an error, so it gets its
# own label instead of being printed as a second "Mean Absolute Error".
print('R2 Score:', r2s)

Mean Absolute Error: 23203.88045270068
Mean Absolute Error: 0.5026496089353767


In [54]:
# Random forest of 100 trees; the fixed seed keeps the fit repeatable
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

# Score the model on the held-out validation split
predictions = model.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
r2s = r2_score(y_valid, predictions)

print('Mean Absolute Error:', mae)
# r2_score is the coefficient of determination, not an error, so it gets its
# own label instead of being printed as a second "Mean Absolute Error".
print('R2 Score:', r2s)

Mean Absolute Error: 17428.12150684932
Mean Absolute Error: 0.8356180904626607


In [57]:
# Single decision tree limited to depth 8.
# random_state is pinned like the other models in this notebook so the
# validation scores are repeatable from run to run.
model = DecisionTreeRegressor(max_depth=8, random_state=0)
model.fit(X_train, y_train)

# Score the model on the held-out validation split
predictions = model.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
r2s = r2_score(y_valid, predictions)

print('Mean Absolute Error:', mae)
# r2_score is the coefficient of determination, not an error, so it gets its
# own label instead of being printed as a second "Mean Absolute Error".
print('R2 Score:', r2s)


Mean Absolute Error: 25724.85449756529
Mean Absolute Error: 0.7330828224232979


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with the library's default hyperparameters and a fixed seed;
# fit() returns the estimator itself, so construction and fitting are chained.
model = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

# Mean absolute error on the held-out validation split
predictions = model.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
print('Mean Absolute Error:', mae)


In [None]:
# Make predictions on the test data
# `model` is whatever estimator was assigned most recently; when the notebook is run
# top to bottom that is the GradientBoostingRegressor fitted in the preceding cell.
# test_data holds the aligned, encoded test features (same columns as the training frame).
test_predictions = model.predict(test_data)


In [None]:
# These three modules are already imported in the notebook's first cell;
# repeating them here is harmless.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Calculate the feature importances
# Read from the currently bound `model`; a later cell indexes X_train.columns with
# positions derived from this array, so it is treated as one value per training column.
importances = model.feature_importances_



In [None]:
# Order feature positions from most to least important (ascending argsort, reversed),
# then keep the names and scores of the first ten.
indices = np.argsort(importances)[::-1]
columns = X_train.columns.to_numpy()[indices[:10]]
values = importances[indices[:10]]


In [None]:
# Bar chart of the ten highest importances, drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=columns, y=values, palette='Blues_r', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
# Feature names are long, so turn the x tick labels vertical
ax.tick_params(axis='x', labelrotation=90)
plt.show()


In [None]:
# NOTE(review): this cell repeats the earlier importance cell; no cell in between
# rebinds `model`, so when run in order it recomputes the same array.
# Both modules are already imported in the notebook's first cell.
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the feature importances
importances = model.feature_importances_



In [None]:
# Sort the feature importances in descending order and take the top 10
# NOTE(review): same statements as the earlier top-10 cell.
# argsort is ascending, so the reversed view lists positions from largest to smallest.
indices = np.argsort(importances)[::-1]
# Names of the ten highest-ranked columns
columns = X_train.columns.values[indices[:10]]
# Their importance scores, in the same order as `columns`
values = importances[indices][:10]


In [None]:

# Create a bar plot of the feature importances
# NOTE(review): same plotting statements as the earlier bar-plot cell.
plt.figure(figsize=(10, 5))
# x carries the feature names, y their importance scores
sns.barplot(x=columns, y=values, palette='Blues_r')
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
# Rotate the feature names so they do not overlap
plt.xticks(rotation=90)
plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values for the search: three options for each of
# three settings, i.e. 3 x 3 x 3 = 27 combinations in total.
param_grid = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)


In [None]:

# Initialize a Gradient Boosting Regressor
# Unfitted estimator that a later cell passes to GridSearchCV as `estimator`.
# Rebinding `model` here replaces the fitted model from the earlier cells.
model = GradientBoostingRegressor(random_state=0)


In [None]:

# Configure an exhaustive search over param_grid, scored by negated mean absolute
# error with 3-fold cross-validation; n_jobs=-1 requests all available cores and
# verbose=2 enables progress messages.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)


In [None]:

# Fit the Grid Search object to the data
# The grid has 3 x 3 x 3 = 27 combinations and cv=3, i.e. 81 cross-validation fits,
# some with 1000 estimators - expect this cell to be slow.
# Only the training split is searched; X_valid / y_valid stay untouched for the final check.
grid_search.fit(X_train, y_train)


In [2]:

# Get the best parameters
# NOTE(review): the saved output of this cell is "NameError: name 'grid_search' is not
# defined" and its execution count is 2, so it appears to have been run in a kernel where
# the cells that build and fit grid_search had not been executed. Run those cells first.
best_params = grid_search.best_params_
print('Best parameters:', best_params)



NameError: name 'grid_search' is not defined

In [None]:

# Build a fresh regressor with the fixed seed, apply the hyperparameters chosen
# by the grid search, and train it on the training split.
model = GradientBoostingRegressor(random_state=0)
model.set_params(**best_params)
model.fit(X_train, y_train)



In [None]:

# Make predictions on the validation set and evaluate the model
# `model` is expected to be the regressor refitted with best_params in the preceding cell.
predictions = model.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
print('Mean Absolute Error:', mae)
# Predict the unlabelled test rows; this overwrites the test_predictions computed earlier
# with the default gradient-boosting model.
test_predictions = model.predict(test_data)

In [None]:

# Build the two-column submission table: the test rows' 'Id' values next to
# the predicted 'SalePrice' for each row.
submission = test_data[['Id']].assign(SalePrice=test_predictions)



In [None]:
# Write the submission file to csv.
# The export used to target '/kaggle/working' unconditionally, which raises an
# OSError wherever that folder does not exist, even though the input CSVs are
# read from the working directory. Keep the original location when it is
# present and otherwise write next to the inputs.
output_dir = Path('/kaggle/working')
if not output_dir.is_dir():
    output_dir = Path.cwd()
submission.to_csv(output_dir / 'submission.csv', index=False)