In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score

In [8]:
## EDA --------
# **Read the data dictionary.**
# Determine _what_ missing values mean.
# Figure out what each categorical value represents.
# Identify outliers.
# Consider whether discrete values are better represented as categorical or continuous. (Are relationships to the target linear?)

In [4]:
# Load the training data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
train_csv_path = 'C:/Users/james/Documents/GA/Projects/project-2/datasets/train.csv'
train = pd.read_csv(train_csv_path)

In [5]:
# First five rows of the raw training data.
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [12]:
# Null mask for the first two rows (True marks a missing value).
train.head(2).isnull()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,False,False,False,False,True,False,False,True,False,False,...,False,False,True,True,True,False,False,False,False,False
1,False,False,False,False,False,False,False,True,False,False,...,False,False,True,True,True,False,False,False,False,False


In [13]:
# Display settings so wide frames and long per-column summaries are not truncated.
# max_rows is tied to the number of columns in `train` (presumably so that
# one-row-per-column summaries such as isnull().sum() print in full).
pd.set_option('display.max_rows', train.shape[1])
# Show up to 300 columns when displaying a DataFrame.
pd.set_option('display.max_columns', 300)

In [14]:
# Columns singled out for a closer look at their missing values.
# Keep every row: later cells call .unique() on these columns, and a frame truncated
# to its first 10 rows would hide most of the levels that actually occur.
null_columns = train[['Misc Feature', 'Fence', 'Fireplace Qu', 'Mas Vnr Type', 'Alley', 'Lot Frontage']]
# Preview only the first 10 rows.
null_columns.head(10)

Unnamed: 0,Misc Feature,Fence,Fireplace Qu,Mas Vnr Type,Alley,Lot Frontage
0,,,,BrkFace,,
1,,,TA,BrkFace,,43.0
2,,,,,,68.0
3,,,,,,73.0
4,,,,,,82.0
5,,,Gd,,,137.0
6,,,,BrkFace,,35.0
7,,MnPrv,,BrkFace,,
8,Shed,MnPrv,,,,
9,,,TA,BrkFace,,70.0


In [None]:
# One-hot encode 'Misc Feature'.
# The prefix names the source column so each indicator reads 'Misc Feature_<level>'.
# drop_first=True removes one level as the baseline; rows with a missing
# 'Misc Feature' get 0 in every indicator (get_dummies ignores NaN by default).
Misc_Feature_dummies = pd.get_dummies(train['Misc Feature'], prefix='Misc Feature', drop_first=True)

# Remove indicator columns left over from a previous run of this cell before
# concatenating, so re-running does not append duplicate columns to `train`.
train = pd.concat(
    [train.drop(columns=Misc_Feature_dummies.columns, errors='ignore'), Misc_Feature_dummies],
    axis=1,
)
Misc_Feature_dummies.head()

In [None]:
# Check the frame after the encoding step.
train.head(5)

In [None]:
# Distinct values (including NaN) present in 'Misc Feature'.
null_columns.loc[:, 'Misc Feature'].unique()

In [6]:
# Drop every row that is missing any of these model inputs:
#   Garage Area, Garage Cars, Total Bsmt SF
required_columns = ['Garage Area', 'Garage Cars', 'Total Bsmt SF']
train = train.dropna(subset=required_columns)

In [None]:
# All missing values from highest to lowest.
# Computed on `train`: the feature matrix `X` is only created further down the
# notebook, so referencing it here fails on a fresh Restart & Run All.
train.isnull().sum().sort_values(ascending=False).head(27)

In [None]:
# Plan: one-hot encode. Distinct values present in 'Fence':
null_columns.loc[:, 'Fence'].unique()

In [None]:
# Drop 'Pool QC'.
# NOTE(review): the original note says the column has no values — confirm from the
# printed levels below before relying on that.
# 'Pool QC' is not one of the columns copied into `null_columns`, so it is inspected
# on `train` directly. The guard keeps the cell re-runnable once the column is gone.
if 'Pool QC' in train.columns:
    print(train['Pool QC'].unique())
    train = train.drop(columns='Pool QC')

In [None]:
# Plan: encode as 0 when there is no fireplace, 1 when there is one.
null_columns.loc[:, 'Fireplace Qu'].unique()

In [None]:
# Storage dtype of 'Garage Yr Blt'.
train['Garage Yr Blt'].dtype

In [None]:
# Plan: one-hot encode. Distinct values present in 'Mas Vnr Type':
null_columns.loc[:, 'Mas Vnr Type'].unique()

In [None]:
# Drop 'Alley'.
# Print the levels before dropping (a bare expression in the middle of a cell is
# never displayed). The guard keeps the cell re-runnable once the column is gone.
if 'Alley' in train.columns:
    print(train['Alley'].unique())
    train = train.drop(columns='Alley')

In [None]:
# Plan noted by the author: OneHotEncode 0, 1, 2, 3, 4, 5, 6
# NOTE(review): 'Lot Frontage' holds numeric measurements in the preview — confirm
# that binning/encoding is really intended.
null_columns.loc[:, 'Lot Frontage'].unique()

In [None]:
# dtypes of the inspected columns.
null_columns.dtypes.iloc[:10]

In [None]:
# Horizontal bar chart: number of missing values in each column.
missing_counts = train.isnull().sum()
ax = missing_counts.plot.barh(figsize=(8, 14))
ax.set_title('Count of Missing Values');

In [None]:
# Correlation of every numeric column with SalePrice, strongest first.
# Ideas: start with the top 10 for the model, then retest with more or fewer
# variables. Possibly avoid using both 'Garage Area' and 'Garage Cars'.
saleprice_corr = (
    train.corr(numeric_only=True)[['SalePrice']]
    .sort_values(by='SalePrice', ascending=False)
)
plt.figure(figsize=(8, 10))
sns.heatmap(saleprice_corr, annot=True);

In [None]:
# Fraction of missing values per column.
train.isnull().mean()

In [None]:
# Summary statistics for the numeric columns of the training data.
train.describe()

In [None]:
# dtypes of the first five columns.
train.dtypes.head()

In [None]:
## Data Cleaning--------
# Decide how to impute null values.
# Decide how to handle outliers.
# Do you want to combine any features?
# Do you want to have interaction terms?
# Do you want to manually drop collinear features?

In [None]:
## Exploratory Visualizations--------
# Look at distributions.
# Look at correlations.
# Look at relationships to target (scatter plots for continuous, box plots for categorical).

In [None]:
# Preview the candidate predictors.
# Selecting several columns needs a list inside the indexer (double brackets);
# a bare comma-separated sequence is treated as one tuple key and raises KeyError.
train[['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Garage Cars', 'Total Bsmt SF', 'Year Built', 'Year Remod/Add', 'Full Bath', 'Fireplaces', 'Wood Deck SF']].head()

In [None]:
## Pre-processing----------
# One-hot encode categorical variables.
# Train/test split your data.
# Scale your data.
# Consider using automated feature selection.

In [None]:
# Correlation of every numeric column with SalePrice, strongest first.
corr_with_price = train.corr(numeric_only=True)[['SalePrice']]
corr_with_price = corr_with_price.sort_values(by='SalePrice', ascending=False)
plt.figure(figsize=(8, 11))
sns.heatmap(corr_with_price, annot=True);

In [None]:
# Current state of the training frame before building the feature matrix.
train.head(5)

In [7]:
# Predictor columns used by the model.
xvars = [
    'Overall Qual',
    'Gr Liv Area',
    'Garage Area',
    'Garage Cars',
    'Total Bsmt SF',
    'Year Built',
    'Year Remod/Add',
    'Full Bath',
    'Fireplaces',
    'Wood Deck SF',
]
# Feature matrix and target vector.
X = train.loc[:, xvars]
y = train.loc[:, 'SalePrice']

In [8]:
# Hold out part of the data for evaluation; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Display the selected predictor columns.
train.loc[:, xvars]

In [5]:
# Display the target (SalePrice); requires the cell that defines `y` to have run.
y

NameError: name 'y' is not defined

In [9]:
# Fit ordinary least squares on the training split (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)
model

In [1]:
# R^2 on the training split.
model.score(X=X_train, y=y_train)

NameError: name 'model' is not defined

In [2]:
# R^2 on the held-out split.
model.score(X=X_test, y=y_test)

NameError: name 'model' is not defined

In [12]:
# Fitted coefficients, one per predictor in the order of `xvars`.
model.coef_

array([18687.46415569,    44.89041931,    35.50005662,  4562.67615196,
          21.62855808,   284.80838982,   321.16921564, -3210.27421035,
        9839.56988341,    27.91733725])

In [13]:
# Fitted intercept term.
model.intercept_

-1250101.274140429

In [20]:
# First rows of the feature matrix.
X.head(5)

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,Year Built,Year Remod/Add,Full Bath,Fireplaces,Wood Deck SF
0,6,1479,475.0,2.0,725.0,1976,2005,2,0,0
1,7,2122,559.0,2.0,913.0,1996,1997,2,1,0
2,5,1057,246.0,1.0,1057.0,1953,2007,1,0,0
3,5,1444,400.0,2.0,384.0,2006,2007,2,0,100
4,6,1445,484.0,2.0,676.0,1900,1993,2,0,0


In [15]:
# Predictions for every row of X. X contains both the rows the model was fit on
# (X_train) and the held-out rows (X_test), so metrics computed from y_pred are
# not a pure out-of-sample estimate.
y_pred = model.predict(X)

In [16]:
# One prediction per row of X.
np.shape(y_pred)

(2049,)

In [17]:
# R^2 over all rows of X (training and held-out rows together).
metrics.r2_score(y_true=y, y_pred=y_pred)

0.795722849884253

In [18]:
# Mean squared error over all rows of X.
# TODO: outliers still need to be handled.
mse = metrics.mean_squared_error(y_true=y, y_pred=y_pred)
mse

1282741294.7587163

In [19]:
# RMSE: square root of the MSE, which puts the error back in the units of SalePrice.
np.sqrt(mse)

35815.377908919465

In [None]:
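# LINEM assumptions of linear regression: Linearity, Independence of errors, Normality of errors, Equal variance of errors, no Multicollinearity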

In [None]:
# L — Linearity: one scatter panel per predictor against the target.
# plt.scatter needs x and y of the same size, so the multi-column X cannot be
# passed in one call; each column is plotted against y separately.
n_cols = 5
n_rows = int(np.ceil(len(X.columns) / n_cols))
fig, axes = plt.subplots(
    n_rows, n_cols,
    figsize=(4 * n_cols, 3.5 * n_rows),
    sharey=True,
    squeeze=False,  # always a 2-D array of axes, even with a single row
)
for ax, col in zip(axes.ravel(), X.columns):
    ax.scatter(X[col], y, s=2)
    ax.set_xlabel(col)
for ax in axes[:, 0]:
    ax.set_ylabel(y.name)
# Hide unused panels when the number of predictors is not a multiple of n_cols.
for ax in axes.ravel()[len(X.columns):]:
    ax.set_visible(False)
fig.tight_layout();

In [None]:
# I

In [None]:
# N — Normality of errors: histogram of the residuals (actual minus predicted).
resids = y - y_pred
fig, ax = plt.subplots()
ax.hist(resids, bins=50);

In [None]:
# E — Equal variance of errors: residuals against predictions, with a zero reference line.
fig, ax = plt.subplots()
ax.scatter(y_pred, resids, s=1)
ax.axhline(0, color="orange");

In [None]:
# Labelled residual plot: residuals against predicted values with a dashed zero line.
fig, ax = plt.subplots()
ax.scatter(y_pred, resids)
ax.axhline(y=0, color='r', linestyle='--')
ax.set(xlabel='Predicted values', ylabel='Residuals', title='Residual Plot')
plt.show()

In [None]:
# Standardise the residuals (population standard deviation, ddof=0) and flag
# those more than three standard deviations from the mean.
resid_mean = resids.mean()
resid_std = resids.std(ddof=0)
z_scores = (resids - resid_mean) / resid_std
outliers = z_scores.abs() > 3

In [None]:
# Standardised residual for every row.
z_scores

In [None]:
# Boolean outlier flag for the first five rows.
outliers.head()

In [None]:
# Box plot of the residuals.
fig, ax = plt.subplots()
ax.boxplot(resids)
ax.set_title('Residuals Boxplot')
plt.show()

In [None]:
# Predictor values for the rows flagged as residual outliers (first 30).
X.loc[outliers].head(30)

### SUBMISSION # 1

In [None]:
# Load the competition test set.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
test_csv_path = 'C:/Users/james/Documents/GA/Projects/project-2/datasets/test.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# First rows of the test set.
test.head(5)

In [None]:
# Compute the predictions first, so `y_pred_submission` exists before it is
# attached to `test` and before the submission columns are selected.
# Missing predictor values are filled with the training-split medians because
# LinearRegression.predict rejects NaN input; this is a no-op when nothing is missing.
test_features = test[xvars].fillna(X_train.median())
y_pred_submission = model.predict(test_features)

# Attach the predictions and keep only the columns needed for the submission file.
test['SalePrice'] = y_pred_submission
james_submission = test[['Id', 'SalePrice']]

In [None]:
# One prediction per row of the test set.
np.shape(y_pred_submission)

In [None]:
# Index the submission by 'Id'.
# Rebuilt from `test` rather than mutated with inplace=True: modifying a column
# slice in place can raise SettingWithCopyWarning, and an in-place set_index fails
# on re-run because 'Id' is no longer a column.
james_submission = test[['Id', 'SalePrice']].set_index('Id')

In [None]:
# Sanity-check the submission frame before writing it out.
james_submission.head(5)

In [None]:
# Write the submission file (the index is written as the first column).
# NOTE(review): absolute, machine-specific path, and a different folder ('data')
# from the one the inputs are read from ('datasets') — confirm this is intended.
submission_csv_path = 'C:/Users/james/Documents/GA/Projects/project-2/data/james_submission.csv'
james_submission.to_csv(submission_csv_path)