# Kaggle Competition: House Prices 
## Author: Justine Schabel 

### Data Exploration

First we need to import the required libraries. 

In [245]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Libraries for reading, cleaning and plotting the dataa
import numpy as np 
import pandas as pd 
import csv
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.linear_model import LinearRegression

In [246]:
# Force numpy not to truncate output 
np.set_printoptions(threshold=100)

In [247]:
# Force pandas not to truncate output 
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

Read the data from the csv data files we downloaded from Kaggle

In [253]:
# Read in training data 
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

Split the training data into 80% training data and 20% dev data. We have 1168 training examples, 292 dev examples, 1495 test examples and 81 features (including the label).

In [254]:
# Split training data (labeled) into 80% training and 20% dev) and randomly sample 
training_data = train_df.sample(frac=0.8)
dev_data_df = train_df.drop(training_data.index)

# Examine shape of data sets
print("Training Set Size: ", training_data.shape)
print("Dev Set Size: ", dev_data_df.shape)
print("Test Set Size: ", test_df.shape)

Training Set Size:  (1168, 81)
Dev Set Size:  (292, 81)
Test Set Size:  (1459, 80)


We can also breifly look at a summary of the features. 
- The count attribute shows us that there are columns with missing data - we will need to manage NAs and look for outliers (ex. 9999 for NA)
- The mean attribute shows us that the average value of the features varies widely - we might consider standardizing 

In [255]:
# Briefly examine feature attributes for the training data 
training_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1168.0,1168.0,968.0,1168.0,1168.0,1168.0,1168.0,1168.0,1162.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1101.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,724.639555,56.386986,70.875,10679.182363,6.089897,5.57363,1971.156678,1984.919521,105.061102,449.204623,44.385274,561.706336,1055.296233,1163.205479,345.805651,6.263699,1515.274829,0.428938,0.059075,1.565925,0.380993,2.884418,1.045377,6.526541,0.61387,1978.805631,1.773116,475.05137,94.616438,46.597603,21.907534,3.707192,14.65839,2.400685,51.005137,6.374144,2007.837329,181388.657534
std,423.057188,42.03941,24.809218,10813.892622,1.398236,1.127542,30.414282,20.650402,183.66229,463.700588,156.507668,440.164957,445.747345,386.33164,440.239308,50.129624,537.062248,0.520449,0.243024,0.549927,0.501461,0.813024,0.216292,1.637494,0.651184,24.536622,0.758203,215.636387,125.451582,65.944583,61.605609,28.93831,55.79351,37.11815,552.972598,2.719932,1.327674,80670.705735
min,1.0,20.0,21.0,1477.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,355.75,20.0,60.0,7578.75,5.0,5.0,1954.0,1966.0,0.0,0.0,0.0,223.75,796.0,882.0,0.0,0.0,1121.5,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1962.0,1.0,336.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,720.0,50.0,70.0,9478.5,6.0,5.0,1972.0,1994.0,0.0,390.0,0.0,461.5,992.0,1081.0,0.0,0.0,1457.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1980.0,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1096.25,70.0,80.0,11657.25,7.0,6.0,2001.0,2004.0,167.75,728.75,0.0,797.25,1306.0,1391.25,728.0,0.0,1776.0,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2002.0,2.0,576.25,168.0,69.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214125.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,572.0,5642.0,3.0,2.0,3.0,2.0,8.0,3.0,14.0,3.0,2010.0,4.0,1418.0,736.0,547.0,552.0,407.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


I will also consolodate our findings from the data_description file. Most features are either alphabetic (categorical) or numeric.

- Categorical Features:
    - Numeric:
        - MSSubCass: Identifies the type of dwelling involved in the sale
        - OverallQual: Rates the overall material and finish of the house (Likert Scale)
        - OverallCond: Rates the overall condition of the house (Likert Scale)
    - Alphabetic:   
        - MSZoning: Identifies the general zoning classification of the sale.
        - Street: Type of road access to property
        - Alley: Type of alley access to property
        - LotShape: General shape of property
        - LandContour: Flatness of the property
        - Utilities: Type of utilities available
        - LotConfig: Lot configuration
        - LandSlope: Slope of property
        - Neighborhood: Physical locations within Ames city limits
        - Condition1: Proximity to various conditions
        - Condition2: Proximity to various conditions (if more than one is present)
        - BldgType: Type of dwelling
        - HouseStyle: Style of dwelling
        - RoofStyle: Type of roof
        - RoofMatl: Roof material
        - Exterior1st: Exterior covering on house
        - Exterior2nd: Exterior covering on house (if more than one material)
        - MasVnrType: Masonry veneer type
        - ExterQual: Evaluates the quality of the material on the exterior 
        - ExterCond: Evaluates the present condition of the material on the exterior
        - Foundation: Type of foundation
        - BsmtQual: Evaluates the height of the basement
        - BsmtCond: Evaluates the general condition of the basement
        - BsmtExposure: Refers to walkout or garden level walls
        - BsmtFinType1: Rating of basement finished area
        - BsmtFinType2: Rating of basement finished area (if multiple types)
        - Heating: Type of heating
        - HeatingQC: Heating quality and condition
        - CentralAir: Central air conditioning (Y/N - This could be changed to 0/1 bool)
        - Electrical: Electrical system
        - KitchenQual: Kitchen quality
        - Functional: Home functionality (Assume typical unless deductions are warranted)
        - FireplaceQu: Fireplace quality
        - GarageType: Garage location
        - GarageFinish: Interior finish of the garage
        - GarageQual: Garage quality
        - GarageCond: Garage condition
        - PavedDrive: Paved driveway
        - PoolQC: Pool quality
        - Fence: Fence quality
        - MiscFeature: Miscellaneous feature not covered in other categories
        - SaleType: Type of sale
        - SaleCondition: Condition of sale    
- Numerical Features:
    - LotFrontage: Linear feet of street connected to property
    - LotArea: Lot size in square feet
    - YearBuilt: Original construction date
    - YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
    - MasVnrArea: Masonry veneer area in square feet
    - BsmtFinSF1: Type 1 finished square feet
    - BsmtFinSF2: Type 2 finished square feet
    - BsmtUnfSF: Unfinished square feet of basement area
    - TotalBsmtSF: Total square feet of basement area
    - 1stFlrSF: First Floor square feet
    - 2ndFlrSF: Second floor square feet
    - LowQualFinSF: Low quality finished square feet (all floors)
    - GrLivArea: Above grade (ground) living area square feet
    - BsmtFullBath: Basement full bathrooms
    - BsmtHalfBath: Basement half bathrooms
    - FullBath: Full bathrooms above grade
    - HalfBath: Half baths above grade
    - Bedroom: Bedrooms above grade (does NOT include basement bedrooms)
    - Kitchen: Kitchens above grade
    - TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
    - Fireplaces: Number of fireplaces
    - GarageYrBlt: Year garage was built
    - GarageCars: Size of garage in car capacity
    - GarageArea: Size of garage in square feet
    - WoodDeckSF: Wood deck area in square feet
    - OpenPorchSF: Open porch area in square feet
    - EnclosedPorch: Enclosed porch area in square feet
    - 3SsnPorch: Three season porch area in square feet
    - ScreenPorch: Screen porch area in square feet
    - PoolArea: Pool area in square feet
    - MiscVal: Value of miscellaneous feature
    - MoSold: Month Sold (MM)
    - YrSold: Year Sold (YYYY)

I would like to examine all of the different types (specifically what numeric types there are).

In [256]:
training_data.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
BsmtQual          object
BsmtCond          object
BsmtExposure      object
BsmtFinType1      object
BsmtFinSF1         int64
BsmtFinType2      object
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
Heating           object


In order to do some distribution analysis, I will filter out the non-numeric features 

In [257]:
numeric_training_data = training_data.select_dtypes(include=['int64','float64'])
numeric_dev_data = dev_data_df.select_dtypes(include=['int64','float64'])
print("Numeric Training Data Shape: ", numeric_training_data.shape)
print("Numeric Dev Data Shape: ", numeric_dev_data.shape)

Numeric Training Data Shape:  (1168, 38)
Numeric Dev Data Shape:  (292, 38)


It appears that the only (numeric) features with missing data are LotFrontage, Masonry veneer area in square feet (MasVnrArea), Garage Year Built (GarageYrBlt). Since lot frontage is the length of the property that is connected to the street, I will drop it. I believe that this value would hold similar information as the lot area. I'll also drop the Id (as it has no real numerical meaning), as well as the numeric categorical data.

In [258]:
numeric_training_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1168.0,1168.0,968.0,1168.0,1168.0,1168.0,1168.0,1168.0,1162.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1101.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,724.639555,56.386986,70.875,10679.182363,6.089897,5.57363,1971.156678,1984.919521,105.061102,449.204623,44.385274,561.706336,1055.296233,1163.205479,345.805651,6.263699,1515.274829,0.428938,0.059075,1.565925,0.380993,2.884418,1.045377,6.526541,0.61387,1978.805631,1.773116,475.05137,94.616438,46.597603,21.907534,3.707192,14.65839,2.400685,51.005137,6.374144,2007.837329,181388.657534
std,423.057188,42.03941,24.809218,10813.892622,1.398236,1.127542,30.414282,20.650402,183.66229,463.700588,156.507668,440.164957,445.747345,386.33164,440.239308,50.129624,537.062248,0.520449,0.243024,0.549927,0.501461,0.813024,0.216292,1.637494,0.651184,24.536622,0.758203,215.636387,125.451582,65.944583,61.605609,28.93831,55.79351,37.11815,552.972598,2.719932,1.327674,80670.705735
min,1.0,20.0,21.0,1477.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,355.75,20.0,60.0,7578.75,5.0,5.0,1954.0,1966.0,0.0,0.0,0.0,223.75,796.0,882.0,0.0,0.0,1121.5,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1962.0,1.0,336.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,720.0,50.0,70.0,9478.5,6.0,5.0,1972.0,1994.0,0.0,390.0,0.0,461.5,992.0,1081.0,0.0,0.0,1457.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1980.0,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1096.25,70.0,80.0,11657.25,7.0,6.0,2001.0,2004.0,167.75,728.75,0.0,797.25,1306.0,1391.25,728.0,0.0,1776.0,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2002.0,2.0,576.25,168.0,69.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214125.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,572.0,5642.0,3.0,2.0,3.0,2.0,8.0,3.0,14.0,3.0,2010.0,4.0,1418.0,736.0,547.0,552.0,407.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [259]:
numeric_dev_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,292.0,292.0,233.0,292.0,292.0,292.0,292.0,292.0,290.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,278.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
mean,753.941781,58.938356,66.622318,9867.410959,6.136986,5.582192,1971.712329,1984.650685,98.172414,421.380137,55.205479,589.376712,1065.962329,1160.311644,351.739726,4.167808,1516.219178,0.410959,0.05137,1.561644,0.390411,2.794521,1.05137,6.482877,0.609589,1977.320144,1.743151,464.695205,92.756849,46.910959,22.140411,2.219178,16.671233,4.191781,13.424658,6.113014,2007.729452,179051.349315
std,415.661457,43.342752,21.687113,5477.409855,1.321928,1.053594,29.389074,20.659435,170.45956,424.353257,179.310206,448.690399,409.944966,388.266332,422.06138,42.097121,477.206505,0.513344,0.22113,0.555789,0.509339,0.824186,0.23616,1.578327,0.618981,25.297116,0.702808,206.468472,125.090314,67.601665,59.236182,30.809579,55.6792,50.650384,81.687009,2.631602,1.328548,74429.699532
min,9.0,20.0,21.0,1300.0,3.0,3.0,1880.0,1950.0,0.0,0.0,0.0,0.0,0.0,483.0,0.0,0.0,605.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,1914.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,40000.0
25%,405.75,20.0,55.0,7426.0,5.0,5.0,1952.75,1969.0,0.0,0.0,0.0,218.75,788.75,889.5,0.0,0.0,1165.5,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1959.0,1.0,326.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129975.0
50%,786.5,50.0,65.0,9477.5,6.0,5.0,1973.5,1993.5,0.0,348.5,0.0,511.0,980.5,1094.0,0.0,0.0,1472.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1980.0,2.0,471.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163945.0
75%,1094.25,70.0,80.0,11381.5,7.0,6.0,2000.0,2003.0,153.25,663.75,0.0,878.5,1276.5,1369.5,725.75,0.0,1784.75,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2001.0,2.0,565.5,168.0,64.25,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,207700.0
max,1456.0,190.0,149.0,57200.0,10.0,9.0,2009.0,2010.0,1115.0,1904.0,1085.0,1969.0,3200.0,3228.0,1611.0,514.0,3447.0,2.0,1.0,3.0,2.0,6.0,3.0,12.0,2.0,2010.0,3.0,1166.0,857.0,418.0,301.0,508.0,312.0,648.0,620.0,12.0,2010.0,501837.0


### Feature Engineering

In [260]:
numeric_training_data = numeric_training_data.drop(columns=['Id','MasVnrArea', 'LotFrontage', 'MSSubClass', 'OverallQual', 'OverallCond', 'GarageYrBlt'])
numeric_dev_data = numeric_dev_data.drop(columns=['Id','MasVnrArea', 'LotFrontage', 'MSSubClass', 'OverallQual', 'OverallCond', 'GarageYrBlt'])

In [261]:
# numeric_training_data.columns

In [262]:
drop_columns = ['BsmtUnfSF','MiscVal','BsmtFinSF1','BsmtFinSF1','YearRemodAdd','LowQualFinSF','GrLivArea', 'KitchenAbvGr',
                'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','TotRmsAbvGrd']
numeric_training_data = numeric_training_data.drop(columns=drop_columns)
numeric_dev_data = numeric_dev_data.drop(columns=drop_columns)

In [263]:
# numeric_training_data['TotalBedrooms'] = np.sum(numeric_training_data['Bedroom']+numeric_training_data[])
numeric_training_data['TotalSQFT'] = numeric_training_data['1stFlrSF'] + numeric_training_data['2ndFlrSF'] + numeric_training_data['GarageArea']
numeric_training_data['Bathrooms'] = numeric_training_data['BsmtFullBath'] + numeric_training_data['BsmtHalfBath'] + numeric_training_data['FullBath'] + numeric_training_data['HalfBath']

In [264]:
# numeric_training_data['TotalBedrooms'] = np.sum(numeric_training_data['Bedroom']+numeric_training_data[])
numeric_dev_data['TotalSQFT'] = numeric_dev_data['1stFlrSF'] + numeric_dev_data['2ndFlrSF'] + numeric_dev_data['GarageArea']
numeric_dev_data['Bathrooms'] = numeric_dev_data['BsmtFullBath'] + numeric_dev_data['BsmtHalfBath'] + numeric_dev_data['FullBath'] + numeric_dev_data['HalfBath']


LotFrontage: Linear feet of street connected to property
- KEEP: LotArea: Lot size in square feet
- KEEP: YearBuilt: Original construction date
- DROP: YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
- DROP: BsmtFinSF1: Type 1 finished square feet
- DROP: BsmtFinSF2: Type 2 finished square feet
- DROP: BsmtUnfSF: Unfinished square feet of basement area
- KEEP: TotalBsmtSF: Total square feet of basement area
- COMBINE: 1stFlrSF: First Floor square feet
- COMBINE: 2ndFlrSF: Second floor square feet
- DROP: LowQualFinSF: Low quality finished square feet (all floors)
- DROP: GrLivArea: Above grade (ground) living area square feet
- COMBINE: BsmtFullBath: Basement full bathrooms
- COMBINE: BsmtHalfBath: Basement half bathrooms
- COMBINE: FullBath: Full bathrooms above grade
- COMBINE: HalfBath: Half baths above grade
- DROP: Bedroom: Bedrooms above grade (does NOT include basement bedrooms)
- DROP: Kitchen: Kitchens above grade (Assume there's a kitchen?) 
- DROP: TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
- KEEP: Fireplaces: Number of fireplaces
- KEEP: GarageCars: Size of garage in car capacity
- COMBINE: GarageArea: Size of garage in square feet
- DROP: WoodDeckSF: Wood deck area in square feet
- DROP: OpenPorchSF: Open porch area in square feet
- DROP: EnclosedPorch: Enclosed porch area in square feet
- DROP: 3SsnPorch: Three season porch area in square feet
- DROP: ScreenPorch: Screen porch area in square feet
- DROP: PoolArea: Pool area in square feet (Assume its captured in LotArea)
- KEEP: MoSold: Month Sold (MM)
- KEEP: YrSold: Year Sold (Y

In [265]:
# Split into data and labels
train_data = numeric_training_data.drop(columns=["SalePrice"])
train_labels = numeric_training_data["SalePrice"]
dev_data = numeric_dev_data.drop(columns=["SalePrice"])
dev_labels = numeric_dev_data["SalePrice"]
test_data = test_df
# Double check the shape
print("Training Data Shape: ", train_data.shape)
print("Dev Data Shape: ", dev_data.shape)
print("Training Labels Shape: ", train_labels.shape)
print("Dev Labels Shape: ", dev_labels.shape)
print("Test Data Shape: ", test_data.shape)

Training Data Shape:  (1168, 18)
Dev Data Shape:  (292, 18)
Training Labels Shape:  (1168,)
Dev Labels Shape:  (292,)
Test Data Shape:  (1459, 80)


In [266]:
# Scale the Data
scaler = StandardScaler()
norm = scaler.fit(train_data)
train_data = norm.transform(train_data)
print(train_data.shape)
# Normalize features using the standard scaler [dev data]
dev_data = norm.transform(dev_data)
print(dev_data.shape)

(1168, 18)
(292, 18)


### Models

#### Random Forest

In [267]:
# Try a random forest 
def RandomForest(num_trees):
    model = RandomForestClassifier(num_trees,max_depth=8)
    model.fit(train_data, train_labels)
    predictions = model.predict(dev_data)
    score = model.score(dev_data, dev_labels)
    print("Random Forest Performance for {0} trees: {1}".format(num_trees,score))

    
num_trees_list = [1,3,5,10,100]
for num_trees in num_trees_list:
    RandomForest(num_trees)

Random Forest Performance for 1 trees: 0.023972602739726026
Random Forest Performance for 3 trees: 0.010273972602739725
Random Forest Performance for 5 trees: 0.003424657534246575
Random Forest Performance for 10 trees: 0.0136986301369863
Random Forest Performance for 100 trees: 0.010273972602739725


#### Linear Regression

In [268]:
def LinearRegressionModel():
    model = LinearRegression()
    model.fit(train_data, train_labels)
    dev_predicted = model.predict(dev_data)
    score = model.score(dev_data, dev_labels)
    print("Mean Accuracy: ", score)
    
LinearRegressionModel()

Mean Accuracy:  0.7691809784608337


#### Logistic Regression

In [269]:
def LogRegressionModel():
#     model = LogisticRegression(penalty='l2', C=0.5, solver="liblinear", multi_class="auto")
    model = LogisticRegression()
    model.fit(train_data, train_labels)
    dev_predicted = model.predict(dev_data)
    score = model.score(dev_data, dev_labels)
    f1_score = metrics.f1_score(dev_labels, dev_predicted, average='weighted')
    print("F1 Score: {0} Mean Accuracy: {1}".format(f1_score, score))

LogRegressionModel()

F1 Score: 0.006107305936073061 Mean Accuracy: 0.010273972602739725


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### Multi-Layer Perceptron

In [None]:
def MLP():
    # Default activation is 'relu', random state lets us get the same result every time (so we can tune other parameters)
    # max_iter is 200 by default, but more helps. alpha is the regularization parameter. solver is 'adam' by default
    model = MLPClassifier(alpha=1e-3, hidden_layer_sizes=(100,), random_state=0, max_iter=300) 
    model.fit(train_data, train_labels) 
    score = model.score(dev_data, dev_labels)
    print("MLP accuracy = ",score)
MLP()

#### K-Nearest Neighbors

In [None]:
# Try K Nearest Neighbors - before any data cleaning 
def KNN(kn):
    model = KNeighborsClassifier(n_neighbors = kn)
    model.fit(train_data, train_labels)
    predictions = model.predict(dev_data)
    score = model.score(dev_data, dev_labels)
    print("KNN {0} neighbors : accuracy = {1}".format(kn,score))
    
# The alpha isn't actually making a difference 
neigh_list = [1,2,4, 7, 10]
for neigh in neigh_list:
    KNN(neigh)