In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are expected in the "./data" directory.
# The end of this cell lists the files found in that directory.

from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import os
import warnings

# Show which files are present in the local data directory.
print(os.listdir("./data"))

# Install a global 'ignore' filter so warnings do not clutter cell output.
warnings.filterwarnings('ignore')

In [None]:
# Read the training and test CSVs into two DataFrames.
train, test = map(pd.read_csv, (
    './data/kaggle_house_pred_train.csv',
    './data/kaggle_house_pred_test.csv',
))
# Report the (rows, columns) shape of each frame, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
#train.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]]
# Preview the first five rows of the training frame.
train.head(n=5)

In [14]:
# Split the columns of `train` by dtype in a single pass: object-dtype columns
# go to `qualitative`, every other column goes to `quantitative`.
quantitative, qualitative = [], []
for column, dtype in train.dtypes.items():
    (qualitative if dtype == 'object' else quantitative).append(column)

# SalePrice and Id are taken out of the numeric feature list.
for non_feature in ('SalePrice', 'Id'):
    quantitative.remove(non_feature)

# quantitative
# qualitative

In [None]:
# missing_value

sns.set_style("whitegrid")
# Number of null entries per column of the training frame.
null_counts = train.isnull().sum()
# Keep only the columns that have at least one null, sorted ascending by count.
missing = null_counts[null_counts > 0].sort_values()
missing.plot.bar()

In [None]:
# Compare three candidate distributions against the SalePrice histogram,
# one figure per candidate.
y = train['SalePrice']
candidate_fits = [
    ('Johnson SU', stats.johnsonsu),
    ('Normal', stats.norm),
    ('Log Normal', stats.lognorm),
]
for fig_num, (title, dist) in enumerate(candidate_fits, start=1):
    plt.figure(fig_num)
    plt.title(title)
    # kde=False turns off the kernel density estimate; `fit` passes the scipy
    # distribution to be fitted to the data.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases —
    # confirm the installed version still provides it.
    sns.distplot(y, kde=False, fit=dist)

In [None]:
test_normality = lambda x: stats.shapiro(x.fillna(0))[1] < 0.01
# Apply the check to every quantitative column. Despite the variable name,
# a True entry marks a column for which normality was rejected (p < 0.01).
normal = train[quantitative].apply(test_normality)

# Prints True only when no column was rejected, so False means at least one
# quantitative feature departs from a normal distribution.
print(not normal.any())

# Author's observation for this dataset: none of the quantitative variables has
# a normal distribution (that stronger statement corresponds to normal.all()).

In [None]:
# Ordinal (ordered) encoding
# Idea: order the categories of a feature by their relationship with the target
# variable (SalePrice) and assign each category a number.
# Effect: the codes keep the order/size relationship between categories, so the
# encoded value is meaningful: if category A is associated with a higher
# SalePrice than B, A's code will typically be larger than B's.

# Difference from one-hot encoding
# One-hot encoding suits categories with no obvious order or magnitude
# (e.g. colours, city names).
# Ordinal encoding suits categories with a clear order or relationship to the
# target variable (e.g. ratings, education level).

def encode(frame, feature, target='SalePrice'):
    """Add a target-ordered ordinal encoding of ``feature`` to ``frame`` in place.

    Categories of ``feature`` are ranked 1..k by ascending mean of ``target``
    and the rank is written to a new column named ``feature + '_E'``.
    Rows whose ``feature`` value is missing get NaN in the new column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing both ``feature`` and ``target``; it is mutated.
    feature : str
        Name of the categorical column to encode.
    target : str, default 'SalePrice'
        Name of the column whose per-category mean defines the ordering.

    Returns
    -------
    None
    """
    # groupby drops missing keys, so NaN categories never receive a rank.
    # mergesort is stable: categories with equal means keep groupby key order.
    mean_by_category = (
        frame.groupby(feature)[target].mean().sort_values(kind='mergesort')
    )
    rank_of = pd.Series(
        range(1, len(mean_by_category) + 1), index=mean_by_category.index
    )
    # A single vectorised lookup replaces one boolean-mask assignment per
    # category; float dtype keeps NaN representable for missing categories.
    frame[feature + '_E'] = frame[feature].map(rank_of).astype(float)
    
# Encode every qualitative column of `train` in place; each call adds a
# '<column>_E' column to the frame.
for q in qualitative:
    encode(train, q)

# Names of the encoded columns, in the same order as `qualitative`.
qual_encoded = [q + '_E' for q in qualitative]
print(qual_encoded)

In [None]:
def spearman(frame, features):
    """Plot the Spearman correlation of each feature with SalePrice.

    For every name in ``features`` the Spearman correlation between that
    column of ``frame`` and ``frame['SalePrice']`` is computed; the results
    are sorted ascending and drawn as a horizontal bar chart whose height
    grows with the number of features.
    """
    target = frame['SalePrice']
    spr = pd.DataFrame({
        'feature': features,
        'spearman': [frame[name].corr(target, 'spearman') for name in features],
    }).sort_values('spearman')

    # 0.25 inch of figure height per feature.
    plt.figure(figsize=(6, 0.25 * len(features)))
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')
    
# Numeric features followed by the encoded categorical features.
features = [*quantitative, *qual_encoded]
spearman(train, features)

In [None]:
# Three correlation heatmaps, each including SalePrice:
#   1. numeric features among themselves,
#   2. encoded categorical features among themselves,
#   3. numeric features (rows) against encoded categorical features (columns).
numeric_cols = quantitative + ['SalePrice']
encoded_cols = qual_encoded + ['SalePrice']

plt.figure(1)
corr = train[numeric_cols].corr()
sns.heatmap(corr)

plt.figure(2)
corr = train[encoded_cols].corr()
sns.heatmap(corr)

plt.figure(3)
# One vectorised corr() over the union of columns replaces a Python loop of
# pairwise Series.corr calls; the .loc slice keeps only the cross block.
corr = train[quantitative + encoded_cols].corr().loc[numeric_cols, encoded_cols]
sns.heatmap(corr)

In [None]:
# t-SNE; PCA+k-means;
# A 2-D t-SNE embedding is used for plotting, while the clusters come from
# k-means run on a 30-component PCA of the standardised features.

features = quantitative + qual_encoded
model = TSNE(n_components=2, random_state=0, perplexity=50)
# Missing values are replaced with 0 before the features are turned into an array.
X = train[features].fillna(0.).values
tsne = model.fit_transform(X)

std = StandardScaler()
s = std.fit_transform(X)
# random_state is fixed here and for KMeans below (TSNE above was already
# seeded) so the cluster labels and colours are the same on every run.
pca = PCA(n_components=30, random_state=0)
pc = pca.fit(s).transform(s)
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(pc)

# Colour the t-SNE points by their k-means cluster label.
fr = pd.DataFrame({'tsne1': tsne[:,0], 'tsne2': tsne[:, 1], 'cluster': kmeans.labels_})
sns.lmplot(data=fr, x='tsne1', y='tsne2', hue='cluster', fit_reg=False)
# Fraction of total variance retained by the 30 principal components.
print(np.sum(pca.explained_variance_ratio_))

In [None]:
# TODO: collect the various data-inspection helpers below so they are easy to reuse
#print(features_encoded['MSSubClass'].value_counts())

#train.info()

#pd.set_option('display.max_rows', None)
#features_encoded.dtypes

# Check whether the columns are of string type
# columns_to_check = ['ExterQual_E', 'BsmtCond_E', 'BsmtFinType1_E', 'BsmtFinType2_E']
# str_columns = features_encoded[columns_to_check].select_dtypes(include=['object']).columns

# # Print all columns that are of string type
# if not str_columns.empty:
#     print(f"以下列是字符串类型: {list(str_columns)}")
# else:
#     print("没有列是字符串类型")


# # Check whether any object-dtype columns remain
# if (features_encoded.dtypes == 'object').any():
#     print("仍然有 object 类型的列")
# else:
#     print("没有 object 类型的列")

#str_columns = features_encoded.select_dtypes(include=['object']).columns
#print(str_columns)

In [None]:
# Check for NaN

# missing_values = features.isnull().sum()
# print(missing_values[missing_values > 0])  # print the columns that still have missing values

In [None]:
# Data inspection
# train.shape
# train.info()
# if "Id" in train.columns.tolist():
#     dataset_df = train.drop('Id', axis=1)
# dataset_df.head(3)

# print(dataset_df['SalePrice'].describe())
# plt.figure(figsize=(9, 8))
# sns.histplot(dataset_df['SalePrice'], color='g', bins=100, kde=True, alpha=0.4)

# list(set(dataset_df.dtypes.tolist()))
# df_num = dataset_df.select_dtypes(include = ['float64', 'int64'])
# df_num.head()