In [253]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sn

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
import matplotlib.pyplot as plt
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [254]:
train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
df = train.append(test).reset_index()


In [255]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O" and col not in "SalePrice"]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, cat_but_car, num_cols, num_but_cat


cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)

Observations: 2919
Variables: 82
cat_cols: 52
num_cols: 28
cat_but_car: 1
num_but_cat: 10


In [256]:

def outlier_thresholds(dataframe, variable, low_quantile=0.01, up_quantile=0.99):
    quantile_one = dataframe[variable].quantile(low_quantile)
    quantile_three = dataframe[variable].quantile(up_quantile)
    interquantile_range = quantile_three - quantile_one
    up_limit = quantile_three + 1.5 * interquantile_range
    low_limit = quantile_one - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

def replace_with_thresholds(dataframe, variable):
        low_limit, up_limit = outlier_thresholds(dataframe, variable)
        dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
        dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit


In [257]:
def quick_missing_imp(data, num_method="median", cat_length=20, target="SalePrice"):
    variables_with_na = [col for col in data.columns if
                         data[col].isnull().sum() > 0]  # Eksik değere sahip olan değişkenler listelenir

    temp_target = data[target]

    print("# BEFORE")
    print(data[variables_with_na].isnull().sum(), "\n\n")  # Uygulama öncesi değişkenlerin eksik değerlerinin sayısı

    # değişken object ve sınıf sayısı cat_lengthe eşit veya altındaysa boş değerleri mode ile doldur
    data = data.apply(lambda x: x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= cat_length) else x,
                      axis=0)

    # num_method mean ise tipi object olmayan değişkenlerin boş değerleri ortalama ile dolduruluyor
    if num_method == "mean":
        data = data.apply(lambda x: x.fillna(x.mean()) if x.dtype != "O" else x, axis=0)
    # num_method median ise tipi object olmayan değişkenlerin boş değerleri ortalama ile dolduruluyor
    elif num_method == "median":
        data = data.apply(lambda x: x.fillna(x.median()) if x.dtype != "O" else x, axis=0)

    data[target] = temp_target

    print("# AFTER \n Imputation method is 'MODE' for categorical variables!")
    print(" Imputation method is '" + num_method.upper() + "' for numeric variables! \n")
    print(data[variables_with_na].isnull().sum(), "\n\n")

    return data

In [258]:
def missing_values_table(dataframe, na_name=False):
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]

    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)

    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])

    print(missing_df, end="\n")

    if na_name:
        return na_columns

missing_values_table(df)

              n_miss  ratio
PoolQC          2909 99.660
MiscFeature     2814 96.400
Alley           2721 93.220
Fence           2348 80.440
SalePrice       1459 49.980
FireplaceQu     1420 48.650
LotFrontage      486 16.650
GarageCond       159  5.450
GarageYrBlt      159  5.450
GarageFinish     159  5.450
GarageQual       159  5.450
GarageType       157  5.380
BsmtExposure      82  2.810
BsmtCond          82  2.810
BsmtQual          81  2.770
BsmtFinType2      80  2.740
BsmtFinType1      79  2.710
MasVnrType        24  0.820
MasVnrArea        23  0.790
MSZoning           4  0.140
Functional         2  0.070
BsmtHalfBath       2  0.070
BsmtFullBath       2  0.070
Utilities          2  0.070
KitchenQual        1  0.030
TotalBsmtSF        1  0.030
BsmtUnfSF          1  0.030
GarageCars         1  0.030
GarageArea         1  0.030
BsmtFinSF2         1  0.030
BsmtFinSF1         1  0.030
Exterior2nd        1  0.030
Exterior1st        1  0.030
SaleType           1  0.030
Electrical         1

In [259]:
df["MSZoning"] = np.where(df.LotShape.isin(["C (all)", "RH"]), "CRH", df["MSZoning"])
df["Fireplaces"] = np.where(df.LotShape.isin(["4", "3"]), "43", df["Fireplaces"])
df["GarageCars"] = np.where(df.LotShape.isin(["4.000", "5.000"]), "45", df["GarageCars"])
df["Condition1"] = np.where(df.LotShape.isin(["RRNn", "RRNe","RRAn"]), "RRR", df["Condition1"])
df["Exterior2nd"] = np.where(df.LotShape.isin(["AsbShng", "AsphShn"]), "AA", df["Exterior2nd"])
df["Exterior2nd"] = np.where(df.LotShape.isin(["Brk Cmn", "BrkFace"]), "BB", df["Exterior2nd"])
df["Exterior2nd"] = np.where(df.LotShape.isin(["Other", "Stone","CBlock"]), "OSC", df["Exterior2nd"])

In [260]:
drop_list = ["MiscFeature","Fence","PoolQC","GarageCond","GarageQual",
             "FireplaceQu"]
df.drop("Alley", axis=1, inplace=True)
# drop_list'teki değişkenlerin düşürülmesi
df.drop(drop_list, axis=1, inplace=True)

In [261]:

del df["index"]

In [264]:
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)

Observations: 2919
Variables: 74
cat_cols: 46
num_cols: 26
cat_but_car: 1
num_but_cat: 9


In [263]:
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)
for col in num_cols:
    if col != "SalePrice":
        replace_with_thresholds(df,col)

Observations: 2919
Variables: 74
cat_cols: 45
num_cols: 27
cat_but_car: 1
num_but_cat: 8


In [265]:
Year_col = [col for col in df[num_cols] if "Year" in col]
Year_col

['YearBuilt', 'YearRemodAdd']

In [249]:
df.isnull().sum()

Id                                0
MSSubClass                        0
LotFrontage                     486
LotArea                           0
OverallQual                       0
OverallCond                       0
YearBuilt                         0
YearRemodAdd                      0
MasVnrArea                       23
BsmtFinSF1                        1
BsmtFinSF2                        1
BsmtUnfSF                         1
1stFlrSF                          0
2ndFlrSF                          0
LowQualFinSF                      0
GrLivArea                         0
BsmtFullBath                      2
BsmtHalfBath                      2
FullBath                          0
HalfBath                          0
BedroomAbvGr                      0
KitchenAbvGr                      0
GarageYrBlt                     159
EnclosedPorch                     0
3SsnPorch                         0
ScreenPorch                       0
PoolArea                          0
MiscVal                     

In [266]:
df["GarageYrBlt"].fillna(df["GarageYrBlt"].mode()[0]).isnull().sum()


0

In [202]:
df["LandSlope"].value_counts()

Gtl    2778
Mod     125
Sev      16
Name: LandSlope, dtype: int64

In [271]:
aggregations_MS = {"LotFrontage":["sum","mean"],
                    "LotArea": ["mean","sum","max"],
                    "Street":["nunique"],
                     "LandContour": ["nunique"],
                    "LandSlope": ["nunique"]}

 

In [272]:
 MSZoning = df.groupby('MSZoning').agg(aggregations_MS)

In [273]:
 MSZoning.columns = pd.Index(['MSZoning' + e[0] + "_" + e[1].upper() for e in MSZoning.columns.tolist()])
 
 df.drop(["LotFrontage", "LotArea", "Street", "LandContour", "LandSlope" ],axis=1, inplace=True)
    
 
 df = df.join(MSZoning, how='left', on='MSZoning')   

In [274]:
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)

Observations: 2919
Variables: 77
cat_cols: 51
num_cols: 24
cat_but_car: 1
num_but_cat: 17


In [275]:
aggregations_BC = {"GarageArea":["sum","mean"],
                    "WoodDeckSF": ["mean","sum","max"],
                    "OpenPorchSF":["min", "mean","sum","max"],
                     "TotRmsAbvGrd": ["mean"],
                    "TotalBsmtSF": ["mean","sum","max"]}

In [276]:
MSSubClass = df.groupby('MSSubClass').agg(aggregations_BC)

In [277]:
 MSSubClass.columns = pd.Index(['MSSubClass' + e[0] + "_" + e[1].upper() for e in MSSubClass.columns.tolist()])

In [278]:
 df.drop(["GarageArea", "WoodDeckSF", "OpenPorchSF", "TotRmsAbvGrd", "TotalBsmtSF" ],axis=1, inplace=True)

In [279]:
df = df.join(MSSubClass, how='left', on='MSSubClass')   

In [280]:
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)

Observations: 2919
Variables: 85
cat_cols: 52
num_cols: 31
cat_but_car: 1
num_but_cat: 18


In [281]:
def one_hot_encoder(df, drop_first=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, drop_first=drop_first)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [282]:
df, cat_colls = one_hot_encoder(df)

In [283]:
train_df = df[df['SalePrice'].notnull()]
test_df = df[df['SalePrice'].isnull()]

In [284]:
train_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [287]:
X.shape, test_df.shape

((1371, 240), (1459, 242))

In [286]:
X =train_df.drop(["SalePrice", "Id"], axis=1)
y = np.log1p(train_df['SalePrice'])

In [288]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=17)

In [290]:
from lightgbm import LGBMRegressor

lgbm_model = LGBMRegressor(random_state=46)

rmse = np.mean(np.sqrt(-cross_val_score(lgbm_model,
                                        X, y, cv=5, scoring="neg_mean_squared_error")))


In [None]:
lgbm_params = {"learning_rate": [0.01, 0.1],
               "n_estimators": [500, 1500, 5000, 6000, 7000, 10000, 15000], 
                   }
 
lgbm_gs_best = GridSearchCV(lgbm_model,
                            lgbm_params,
                            cv=10,
                            n_jobs=-1,
                            verbose=True).fit(X_train, y_train)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


In [244]:

final_model = lgbm_model.set_params(**lgbm_gs_best.best_params_).fit(X, y)

rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y, cv=10, scoring="neg_mean_squared_error")))



Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 418, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/base.py", line 706, in score
    return r2_score(y, y_pred, sample_weight=sample_weight)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_regression.py", line 790, in r2_score
    y_true, y_pred, multioutput
  File "/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_regression.py", line 95, in _check_reg_targets
    y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py", line 800, in check_array
    _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
  File "/opt/conda/lib/python3

KeyboardInterrupt: 

In [220]:
rmse

nan

In [221]:
lgbm_tuned = LGBMRegressor(**lgbm_gs_best.best_params_).fit(X_train, y_train)
y_pred = lgbm_tuned.predict(X_test)

In [222]:
## Log dönüşüm tersi
new_y= np.expm1(y_pred)
new_y_test= np.expm1(y_test)

In [223]:
test_dff = test_df.drop(["SalePrice", "Id"], axis=1)

In [224]:
test_df.shape

(1459, 241)

In [225]:
y_predd = lgbm_tuned.predict(test_dff)

ValueError: Number of features of the model must match the input. Model n_features_ is 240 and input n_features is 241

In [None]:
new_y= np.expm1(y_predd)
new_y_test_df= np.expm1(tahminlenen)

In [None]:
my_submission = pd.DataFrame({'Id': test_df["Id"], 'SalePrice': tahminlenen})
# you could use any filename. We choose submission here
my_submission.to_csv('submission5.csv', index=False)