In [182]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import os
print(os.listdir("./data"))

import warnings
warnings.filterwarnings('ignore')

['kaggle_house_pred_test.csv', 'kaggle_house_pred_train.csv']


In [183]:
train = pd.read_csv('./data/kaggle_house_pred_train.csv')
test = pd.read_csv('./data/kaggle_house_pred_test.csv')

## Feature Engineering

In [184]:
train.drop(['Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# Dropping outliers after detecting them by eye(删除异常值并重置索引)
train = train.drop(train[(train['OverallQual'] < 5)
                                  & (train['SalePrice'] > 200000)].index)
train = train.drop(train[(train['GrLivArea'] > 4000)
                                  & (train['SalePrice'] < 200000)].index)
train = train.drop(train[(train['GarageArea'] > 1200)
                                  & (train['SalePrice'] < 200000)].index)
train = train.drop(train[(train['TotalBsmtSF'] > 3000)
                                  & (train['SalePrice'] > 320000)].index)
train = train.drop(train[(train['1stFlrSF'] < 3000)
                                  & (train['SalePrice'] > 600000)].index)
train = train.drop(train[(train['1stFlrSF'] > 3000)
                                  & (train['SalePrice'] < 200000)].index)
train = train[train.GrLivArea < 4500]
train.reset_index(drop=True, inplace=True)

#feature merge(合并训练集和测试集的特征)
train_features = train.drop(['SalePrice'], axis=1)
test_features = test
features = pd.concat([train_features, test_features]).reset_index(drop=True)

# target保存训练集的目标变量
y = train['SalePrice'].reset_index(drop=True)

In [185]:
# Ordinal Encoding
def encode(frame, feature):
    '''
    修改frame
    '''
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    ordering = ordering['ordering'].to_dict()

    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o

quantitative = [f for f in features.columns if features.dtypes[f] != 'object']
qualitative = [f for f in features.columns if features.dtypes[f] == 'object']

qual_encoded = []
for q in qualitative:  
    encode(train, q)
    qual_encoded.append(q+'_E')

quantitative_data = features[quantitative]
qual_encoded_data = train[qual_encoded]

# 将两部分数据拼接成 DataFrame
features_encoded = pd.concat([quantitative_data, qual_encoded_data], axis=1)

# TODO
#str_columns = train.select_dtypes(include=['object']).columns
#train = train.drop(columns=str_columns)

In [186]:
# Data inspection
# train.shape
# train.info()
# if "Id" in train.columns.tolist():
#     dataset_df = train.drop('Id', axis=1)
# dataset_df.head(3)

# print(dataset_df['SalePrice'].describe())
# plt.figure(figsize=(9, 8))
# sns.histplot(dataset_df['SalePrice'], color='g', bins=100, kde=True, alpha=0.4)

# list(set(dataset_df.dtypes.tolist()))
# df_num = dataset_df.select_dtypes(include = ['float64', 'int64'])
# df_num.head()

In [187]:
# fill out NAN(填充缺失值)

## List of NaN including columns where NaN's mean none. 
## exp. there is no pool in the house.
none_cols = ['Alley_E', 'PoolQC_E', 'MiscFeature_E', 'Fence_E', 'FireplaceQu_E', 'GarageType_E', 
             'GarageFinish_E', 'GarageQual_E', 'GarageCond_E', 'BsmtQual_E', 'BsmtCond_E', 
             'BsmtExposure_E', 'BsmtFinType1_E', 'BsmtFinType2_E', 'MasVnrType_E']

## List of NaN including columns where NaN's mean 0.
## exp. area 
zero_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 
             'BsmtHalfBath', 'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea']

## List of NaN including columns where NaN's actually missing gonna replaced with mode.
## categorical variables so use the number with the most frequecy.
freq_cols = ['Electrical_E', 'Exterior1st_E', 'Exterior2nd_E', 'Functional_E', 'KitchenQual_E', 
             'SaleType_E', 'Utilities_E']

# Filling the list of columns above:

for col in none_cols:
    features_encoded[col].replace(np.nan, 'None', inplace=True)

for col in zero_cols:
    features_encoded[col].replace(np.nan, 0, inplace=True)

for col in freq_cols:
    features_encoded[col].replace(np.nan,features_encoded[col].mode()[0], inplace=True)
    
# Filling MSZoning according to MSSubClass
# since different Mssubclass have different MSZoning
# features_encoded['MSZoning_E'] = features_encoded.groupby('MSSubClass')['MSZoning_E'].transform(
#     lambda x: x.fillna(x.mode()[0]))

features_encoded['MSZoning_E'] = features_encoded.groupby('MSSubClass')['MSZoning_E'].transform(
    lambda x: x.fillna(x.mode()[0]) if not x.mode().empty else x.fillna(x.mean()))


# Filling LotFrontage according to Neighborhood
# the house nearby tend to have the same area of lot frontage.
features_encoded['LotFrontage'] = features_encoded.groupby(
    ['Neighborhood_E'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))

#fill according to dtypes automatically
objects = []
for i in features_encoded.columns:
    if features_encoded[i].dtype == object:
        objects.append(i)
features_encoded.update(features_encoded[objects].fillna('None'))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in features_encoded.columns:
    if features_encoded[i].dtype in numeric_dtypes:
        numerics.append(i)
features_encoded.update(features_encoded[numerics].fillna(0))

In [188]:
# features_encoded which numerical on data but should be treated as category.
# 确定分类变量
#features_encoded['MSSubClass'] = features_encoded['MSSubClass'].astype(str)

# Transforming rare values(less than 10) into one group - dimensionality reduction 
# 稀有值处理，将频率小于10的值归为"Other"类
rare_cols = ['Condition1_E', 'Condition2_E', 'RoofMatl_E', 'Exterior1st_E', 'Exterior2nd_E', 
             'Heating_E', 'Electrical_E', 'Functional_E', 'SaleType_E']

for col in rare_cols:
    rare_values = features_encoded[col].value_counts()[features_encoded[col].value_counts() < 10].index
    features_encoded[col].replace(rare_values, 'Other', inplace=True)

In [189]:
missing_values = features_encoded.isnull().sum()
print(missing_values[missing_values > 0])  # 打印仍然存在缺失值的列


Series([], dtype: int64)


In [190]:
# Converting some of the categorical values to numeric ones.
# 特征映射，将分类变量转换为数值型变量
neigh_map = {'MeadowV': 1, 'IDOTRR': 1, 'BrDale': 1, 'BrkSide': 2, 'OldTown': 2, 
             'Edwards': 2, 'Sawyer': 3, 'Blueste': 3, 'SWISU': 3, 'NPkVill': 3, 
             'NAmes': 3, 'Mitchel': 4, 'SawyerW': 5, 'NWAmes': 5, 'Gilbert': 5, 
             'Blmngtn': 5, 'CollgCr': 5, 'ClearCr': 6, 'Crawfor': 6, 'Veenker': 7, 
             'Somerst': 7, 'Timber': 8, 'StoneBr': 9, 'NridgHt': 10, 'NoRidge': 10}

features_encoded['Neighborhood_E'] = features_encoded['Neighborhood_E'].map(neigh_map).astype(int)

quality_maps = {'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}, 
                'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}, 
                'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'FireplaceQu': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'GarageQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'GarageCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}}

for col, mapping in quality_maps.items():
    features_encoded[col] = features_encoded[col].map(mapping).astype(int)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [71]:
# Creating new features based on previous observations
# 创建新特征
features_encoded['TotalSF'] = features_encoded['BsmtFinSF1'] + features_encoded['BsmtFinSF2'] + features_encoded['1stFlrSF'] + features_encoded['2ndFlrSF']
features_encoded['TotalBathrooms'] = features_encoded['FullBath'] + 0.5 * features_encoded['HalfBath'] + features_encoded['BsmtFullBath'] + 0.5 * features_encoded['BsmtHalfBath']
features_encoded['TotalPorchSF'] = features_encoded['OpenPorchSF'] + features_encoded['3SsnPorch'] + features_encoded['EnclosedPorch'] + features_encoded['ScreenPorch'] + features_encoded['WoodDeckSF']
features_encoded['YearBlRm'] = features_encoded['YearBuilt'] + features_encoded['YearRemodAdd']

# Merging quality and conditions
# 合并质量和条件相关的特征
features_encoded['TotalExtQual'] = (features_encoded['ExterQual'] + features_encoded['ExterCond'])
features_encoded['TotalBsmQual'] = (features_encoded['BsmtQual'] + features_encoded['BsmtCond'] +
                            features_encoded['BsmtFinType1'] +
                            features_encoded['BsmtFinType2'])
features_encoded['TotalGrgQual'] = (features_encoded['GarageQual'] + features_encoded['GarageCond'])
features_encoded['TotalQual'] = features_encoded['OverallQual'] + features_encoded[
    'TotalExtQual'] + features_encoded['TotalBsmQual'] + features_encoded[
        'TotalGrgQual'] + features_encoded['KitchenQual'] + features_encoded['HeatingQC']

## Creating new features_encoded by using new quality indicators
# 创建质量乘积特征
features_encoded['QualGr'] = features_encoded['TotalQual'] * features_encoded['GrLivArea']
features_encoded['QualBsm'] = features_encoded['TotalBsmQual'] * (features_encoded['BsmtFinSF1'] + features_encoded['BsmtFinSF2'])
features_encoded['QualPorch'] = features_encoded['TotalExtQual'] * features_encoded['TotalPorchSF']
features_encoded['QualExt'] = features_encoded['TotalExtQual'] * features_encoded['MasVnrArea']
features_encoded['QualGrg'] = features_encoded['TotalGrgQual'] * features_encoded['GarageArea']
features_encoded['QlLivArea'] = (features_encoded['GrLivArea'] - features_encoded['LowQualFinSF']) * features_encoded['TotalQual']
features_encoded['QualSFNg'] = features_encoded['QualGr'] * features_encoded['Neighborhood']

# Creating some simple features_encoded
# 创建简单特征
features_encoded['HasPool'] = features_encoded['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['Has2ndFloor'] = features_encoded['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['HasGarage'] = features_encoded['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['HasBsmt'] = features_encoded['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['HasFireplace'] = features_encoded['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['HasPorch'] = features_encoded['TotalPorchSF'].apply(lambda x: 1 if x > 0 else 0)

In [72]:
# Transforming The Skewed Features
# 处理偏态特征
possible_skewed = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'LowQualFinSF', 'MiscVal'
]

# Finding skewness of the numerical features
# 计算数值特征的偏度
skew_features = np.abs(features_encoded[possible_skewed].apply(lambda x: skew(x)).sort_values(ascending=False))

# Filtering skewed features
# 过滤偏度大于0.3的特征
high_skew = skew_features[skew_features > 0.3]

# 获取高偏度特征的索引
skew_index = high_skew.index

# Applying boxcox transformation to fix skewness
# 使用Box-Cox变换来修正偏度
for i in skew_index:
    features_encoded[i] = boxcox1p(features_encoded[i], boxcox_normmax(features_encoded[i] + 1))

'''automatically
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in features_encoded.columns:
    if features_encoded[i].dtype in numeric_dtypes:
        numerics2.append(i)
skew_features = features_encoded[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

for i in skew_index:
    features_encoded[i] = boxcox1p(features_encoded[i], boxcox_normmax(ffeatures_encoded[i] + 1))
'''

"automatically\nnumeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\nnumerics2 = []\nfor i in features.columns:\n    if features[i].dtype in numeric_dtypes:\n        numerics2.append(i)\nskew_features = features[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)\n\nhigh_skew = skew_features[skew_features > 0.5]\nskew_index = high_skew.index\n\nfor i in skew_index:\n    features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))\n"

In [73]:
# 删除无关特征
# TODO: STREET
to_drop = [
    'Utilities',
    'PoolQC',
    'YrSold',
    'MoSold',
    'ExterQual',
    'BsmtQual',
    'GarageQual',
    'KitchenQual',
    'HeatingQC',
]
# 只显示有缺失值的列及其缺失值数量
missing_values = features_encoded.isnull().sum()
missing_values = missing_values[missing_values > 0]
print(missing_values)

str_columns = features_encoded.select_dtypes(include=['object']).columns
print(str_columns)

Series([], dtype: int64)
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'Foundation', 'BsmtExposure', 'Heating', 'CentralAir',
       'Electrical', 'Functional', 'GarageType', 'GarageFinish', 'PavedDrive',
       'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object')


In [74]:
# 对分类变量进行独热编码
#features_encoded = pd.get_dummies(data=features_encoded)

# 分离训练集和测试集
train = features_encoded.iloc[:len(y), :]
test = features_encoded.iloc[len(train):, :]

# 设置模型数据
X = train
y = np.log1p(y)

print(X.shape,test.shape,y.shape)

(1449, 100) (1459, 100) (1449,)


In [75]:
#outliers = [30, 88, 462, 631, 1322]
#X = X.drop(X.index[outliers])
#y = y.drop(y.index[outliers])

# overfit = []
# for i in X.columns:
#     counts = X[i].value_counts()
#     zeros = counts.iloc[0]
#     if zeros / len(X) * 100 > 99.94:
#         overfit.append(i)

#overfit = list(overfit)
#X = X.drop(overfit, axis=1)
#test = test.drop(overfit, axis=1)

In [76]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X=X):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
    return (rmse)

In [77]:
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [78]:
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))                                
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))

In [79]:
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)      

In [80]:
lightgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       )

In [81]:
xgboost = XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

In [82]:
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [83]:
score = cv_rmse(ridge)
score = cv_rmse(lasso)
print("LASSO: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(elasticnet)
print("elastic net: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(svr)
print("SVR: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(lightgbm)
print("lightgbm: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(gbr)
print("gbr: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\base.py", line 1101, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\preprocessing\_data.py", line 1597, in fit
    X = self._validate_data(
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\base.py", line 633, in _validate_data
    out = check_array(X, input_name="X", **check_params)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\sklearn\utils\_array_api.py", line 745, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "c:\Users\huang\Desktop\New folder\.venv\lib\site-packages\pandas\core\generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'RL'
