In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error

In [2]:
# Read the training CSV from the local ./all directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='./all/train.csv')
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Print each column's dtype and non-null count to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [4]:
def apply_preprocessing_to_data(df, drop_threshold, drop_first, dummy):
    """Clean a training frame and record how to repeat the cleaning on new data.

    Steps, in order:
      1. Drop every column whose share of missing values (in percent) is
         strictly greater than ``drop_threshold``.
      2. If ``drop_first`` is true, also drop the first column of ``df``.
      3. Fill the remaining gaps: most frequent value for object-dtype
         columns, column mean for every other column.
      4. If ``dummy`` is true, one-hot encode the object-dtype columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data to clean. All work happens on the frame returned by
        ``DataFrame.drop``, not on the caller's object.
    drop_threshold : float
        Missing-value percentage (0-100) above which a column is dropped.
    drop_first : bool
        Whether to drop the first column of ``df``.
    dummy : bool
        Whether to one-hot encode object-dtype columns with ``pd.get_dummies``.

    Returns
    -------
    tuple
        ``(cleaned_df, test_time_operations)``. The dict has three keys:
        ``'columns_dropped'`` (list of the columns actually removed),
        ``'default_values'`` (column -> fill value used for imputation) and
        ``'cat_variables'`` (object-dtype columns that remain after dropping).

    Side effects
    ------------
    Prints the operations dict and the shape of the resulting frame.
    """
    # Columns with at least one missing value, in their original order.
    empty_columns = df.columns[df.isna().any()].tolist()

    # Percentage of missing values for each of those columns.
    missing_pct = df[empty_columns].isna().mean() * 100
    columns_to_be_dropped = [
        column for column, pct in missing_pct.items() if pct > drop_threshold
    ]

    # Record the first column as dropped only when it really is dropped, so
    # the test-time replay removes exactly what was removed here. Skipping a
    # duplicate entry also avoids dropping the same label twice.
    if drop_first and len(df.columns) > 0:
        first_column = df.columns[0]
        if first_column not in columns_to_be_dropped:
            columns_to_be_dropped.append(first_column)

    df = df.drop(columns=columns_to_be_dropped)

    test_time_operations = {
        'columns_dropped': columns_to_be_dropped,
        'default_values': {},
    }
    default_values = test_time_operations['default_values']

    # Object dtype is treated as categorical, everything else as numeric.
    cat_variables = [column for column in df.columns if df[column].dtype == 'O']
    num_variables = [column for column in df.columns if df[column].dtype != 'O']

    # Categorical defaults are registered before numeric ones, which keeps
    # the key order of 'default_values' stable.
    columns_with_gaps = set(empty_columns)
    for column in cat_variables:
        if column in columns_with_gaps:
            default_values[column] = df[column].describe().top
    for column in num_variables:
        if column in columns_with_gaps:
            default_values[column] = df[column].mean()

    for column, fill_value in default_values.items():
        df[column] = df[column].fillna(fill_value)

    test_time_operations['cat_variables'] = cat_variables
    print(test_time_operations)
    if dummy:
        df = pd.get_dummies(df, columns=cat_variables)

    print(df.shape)
    return df, test_time_operations


In [5]:
def apply_preprocessing_to_test_data(df, test_data_ops, expected_columns=None):
    """Replay recorded training-time preprocessing on a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to transform. The work is done on the frame returned by
        ``DataFrame.drop``, not on the caller's object.
    test_data_ops : dict
        Must contain ``'columns_dropped'`` (columns to remove),
        ``'default_values'`` (column -> value used to fill missing entries)
        and ``'cat_variables'`` (columns to one-hot encode).
    expected_columns : sequence of str, optional
        If given, the encoded frame is reindexed to exactly these columns:
        columns missing from ``df`` (for example a category absent from this
        data) are added and filled with 0, and columns not listed are
        removed. When omitted, the columns are whatever ``pd.get_dummies``
        produces for ``df``.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.

    Raises
    ------
    KeyError
        If a column named in ``test_data_ops`` is missing from ``df``.
    """
    # Use the operations passed in, not a notebook-level global.
    df = df.drop(columns=test_data_ops['columns_dropped'])

    for column, fill_value in test_data_ops['default_values'].items():
        df[column] = df[column].fillna(fill_value)

    df = pd.get_dummies(df, columns=test_data_ops['cat_variables'])

    if expected_columns is not None:
        df = df.reindex(columns=list(expected_columns), fill_value=0)

    return df

In [6]:
# Drop columns with more than 50% missing values, drop the first column,
# impute the rest and one-hot encode; keep the recorded operations for test time.
df, test_time_ops = apply_preprocessing_to_data(
    df, drop_threshold=50, drop_first=True, dummy=True
)

{'columns_dropped': ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id'], 'default_values': {'MasVnrType': 'None', 'BsmtQual': 'TA', 'BsmtCond': 'TA', 'BsmtExposure': 'No', 'BsmtFinType1': 'Unf', 'BsmtFinType2': 'Unf', 'Electrical': 'SBrkr', 'FireplaceQu': 'Gd', 'GarageType': 'Attchd', 'GarageFinish': 'Unf', 'GarageQual': 'TA', 'GarageCond': 'TA', 'LotFrontage': 70.04995836802665, 'MasVnrArea': 103.68526170798899, 'GarageYrBlt': 1978.5061638868744}, 'cat_variables': ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleConditi

In [None]:
# Preview the frame returned by apply_preprocessing_to_data.
df.head()

In [None]:
def Get_X_y(df, target_variable):
    """Split a frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the features and the target column.
    target_variable : str
        Name of the target column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: every column except the target, and the target column.
    """
    target = np.array(df[target_variable])
    feature_frame = df.drop(columns=[target_variable])
    return np.array(feature_frame), target

In [None]:
# Separate the 'SalePrice' target from the feature columns and show both shapes.
X, y = Get_X_y(df, target_variable='SalePrice')
(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The fixed random_state makes the
# shuffle, and every score computed downstream, reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=42
)

In [None]:
def apply_advanced_preprocessing( pca, pca_variance , X_train, X_valid, scale=True):
    """Optionally min-max scale and PCA-project a train/validation pair.

    Both transformers are fitted on ``X_train`` only and then applied to
    ``X_train`` and ``X_valid``.

    Parameters
    ----------
    pca : bool
        Whether to apply PCA after the optional scaling.
    pca_variance : float or int
        Passed to ``PCA`` as its first argument; only used when ``pca`` is true.
    X_train, X_valid : array-like
        Training and validation feature matrices.
    scale : bool, default True
        Whether to apply ``MinMaxScaler`` before the optional PCA.

    Returns
    -------
    tuple
        ``(X_train, X_valid, min_max, pca)``. ``min_max`` is the fitted
        scaler, or ``None`` when ``scale`` is false. ``pca`` is the fitted
        ``PCA`` object, or the falsy ``pca`` argument unchanged when PCA was
        skipped.
    """
    # Bind the name up front so the return statement is valid when scale=False.
    min_max = None
    if scale:
        min_max = MinMaxScaler()
        min_max.fit(X_train)
        X_train = min_max.transform(X_train)
        X_valid = min_max.transform(X_valid)

    if pca:
        # From here on `pca` holds the fitted transformer rather than the flag.
        pca = PCA(pca_variance).fit(X_train)
        X_train = pca.transform(X_train)
        X_valid = pca.transform(X_valid)

    return X_train, X_valid, min_max, pca

In [None]:
# Scale both splits, then apply PCA with 0.95 as the PCA argument;
# keep the fitted scaler and PCA objects for later use.
X_train, X_valid, min_max, pca = apply_advanced_preprocessing(
    pca=True,
    pca_variance=0.95,
    X_train=X_train,
    X_valid=X_valid,
    scale=True,
)


In [None]:
# Shape of the training matrix after apply_advanced_preprocessing.
X_train.shape

In [None]:
def err(y_test, y_pred):
    """Return the mean squared logarithmic error of ``y_pred`` against ``y_test``.

    Thin wrapper around ``sklearn.metrics.mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return msle

In [None]:
def GridSearch(classifier, X=None, y=None, param_grid=None):
    """Run a cross-validated grid search and return the best parameters.

    Parameters
    ----------
    classifier : estimator
        Estimator handed to ``GridSearchCV``; it must accept the parameter
        names used in ``param_grid``.
    X, y : array-like, optional
        Data to fit on. When omitted, the notebook-level ``X_train`` and
        ``y_train`` are used, so existing ``GridSearch(estimator)`` calls
        behave as before.
    param_grid : dict, optional
        Grid to search. Defaults to the ``C`` / ``gamma`` grid below.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV`` (its default
        cross-validation and scoring settings are used).
    """
    if param_grid is None:
        param_grid = {
            'C': [0.001, 0.01, 0.1, 1, 10, 1000],
            'gamma': [0.001, 0.01, 0.1, 1, 10],
        }

    # Fall back to the notebook globals only when no data is passed in.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    grid_search = GridSearchCV(classifier, param_grid)
    grid_search.fit(X, y)

    return grid_search.best_params_


In [None]:
# Search the C / gamma grid for a support vector regressor; the cell shows the best parameters.
GridSearch(classifier=SVR())

In [None]:
# Support vector regressor with fixed hyper-parameters. Both values are
# candidates in the grid defined in GridSearch — presumably the best
# parameters it reported; TODO confirm against the grid-search output.
svr = SVR(C=1000, gamma=0.01)

In [None]:
# Fit the regressor on the preprocessed training split.
svr.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out validation split.
y_pred = svr.predict(X=X_valid)

In [None]:
# Read the test CSV from the local ./all directory and preview its first rows.
test = pd.read_csv(filepath_or_buffer='./all/test.csv')
test.head(n=5)

In [None]:
# Replay the operations recorded during training preprocessing on the test frame.
test = apply_preprocessing_to_test_data(df=test, test_data_ops=test_time_ops)

In [None]:
# Shape of the test frame after apply_preprocessing_to_test_data.
test.shape

In [None]:
# Names of the test columns that still contain at least one missing value.
empty_columns = test.columns[test.isnull().any(axis=0)].tolist()
empty_columns

In [None]:
# Fill the gaps that remain in the test frame: most frequent value for
# object-dtype columns, column mean otherwise.
# NOTE(review): the fill values come from the test frame itself, not from
# the training data — confirm this is intended.
for column in empty_columns:
    series = test[column]
    if series.dtype == 'O':
        fill_value = series.describe().top
    else:
        fill_value = series.mean()
    test[column] = series.fillna(fill_value)

In [None]:
# Convert the preprocessed test frame to a NumPy array.
X_test = np.array(test)

In [None]:
# Shape of the test matrix.
# NOTE(review): its column count presumably has to match the matrix min_max was fitted on — verify.
X_test.shape

In [None]:
# Scale the test matrix with the scaler returned by apply_advanced_preprocessing
# (fitted on the training split only).
# NOTE(review): X_train was also PCA-projected there; no matching
# pca.transform(X_test) is visible in this part of the notebook — confirm it
# runs before svr.predict is called on X_test.
X_test = min_max.transform(X_test)