In [1]:
import warnings
warnings.filterwarnings("ignore", 
                        message="This pattern is interpreted as a regular expression, and has match groups.")
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from category_encoders import BinaryEncoder
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

In [3]:
import pandas as pd

df = pd.read_csv('train.csv', low_memory=False).drop(
    columns=['host_acceptance_rate', 'state', 'country', 'market',
             'is_business_travel_ready', 'experiences_offered', 'country_code'])
df=df.set_index('id')
df.head()

Unnamed: 0_level_0,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22267382,Modern and Cozy Large Studio in Brooklyn,Modern large studio with new amenities and app...,Our place is a little quiet sanctuary in the h...,Modern large studio with new amenities and app...,"BAM, Barclays, Brooklyn City Point, Fort Green...",,"Subway: 2,3,4,5,A,C,B,Q,G",Washer/Dryer Dishwasher Internet Gym Roof Top ...,"Depending on the time of your visit, I'll be h...","- Please be respectful of our neighbors, no lo...",...,10.0,10.0,10.0,10.0,t,flexible,f,f,1,0.59
2473861,Royal Harlem TRIPLEX Home 5 Beds,Harlem is back and so gorgeous! Visit and expl...,Harlem is back and so gorgeous! Visit and expl...,Harlem is back and so gorgeous! Visit and expl...,HARLEM is a piece of real NY history overflowi...,HARLEM RESTAURANTS Red Rooster Harlem -- excel...,PUBLIC TRANSPORTATION: Conveniently near all p...,The WHOLE ENTIRE HOUSE,,"Smoking, pets and unaccounted guests NOT permi...",...,9.0,9.0,9.0,9.0,t,moderate,f,f,3,2.47
25079703,Sunny East Village Studio,"Clean, hip and well designed sun drenched East...",This is a rare East Village studio with it's h...,"Clean, hip and well designed sun drenched East...",East Village is one of the last remaining neig...,,,You'll have access to the entire space - it's ...,"Very responsive via phone call, text or email.",,...,9.0,10.0,10.0,10.0,f,moderate,f,f,1,0.89
9342478,"Beautiful, airy, light-filled room","Private, spacious, comfortable room in 2-bed f...","Big closet, two big windows, tall ceiling and ...","Private, spacious, comfortable room in 2-bed f...",One block from Morgan L stop. Super cool area....,,,,,,...,,,,,f,flexible,f,f,1,
4866426,Private Room in Prime Brooklyn Spot,"Comfy, quiet and big private room in a three b...",This big old apartment that we love and take c...,"Comfy, quiet and big private room in a three b...",I absolutely love this neighborhood - right at...,Just a note about the space: The window in you...,Super convenient to almost all subway lines. A...,Your room has a very comfortable queen sized b...,"We are my husband Joaquin and I, our sweet new...",This house is shoes off. Thank you! No guests ...,...,10.0,10.0,10.0,10.0,f,flexible,f,f,1,3.14


In [5]:
def cleaning(df):
    df_copy = df.copy()
    df_copy['extra_people'] = df_copy['extra_people'].str.replace('$', '').astype(float)
    
    df_copy['TV'] = df['amenities'].str.contains(r'\b(TV|Cable TV)\b', case=False, na=False)
    df_copy['wifi'] = df['amenities'].str.contains(r'\b(internet|wifi)\b', case=False, na=False)
    df_copy['gym'] = df['amenities'].str.contains(r'\b(gym)\b', case=False, na=False)
    df_copy['air'] = df['amenities'].str.contains(r'\b(Air conditioning)\b', case=False, na=False)
    df_copy['Refrigerator'] = df['amenities'].str.contains(r'\b(Refrigerator)\b', case=False, na=False)
    df_copy['Dishwasher'] = df['amenities'].str.contains(r'\b(Dishwasher)\b', case=False, na=False)
    
    #df_copy['reviews'] = df['amenities'].str.contains(r'\b(reviews)\b', case=False, na=False)
    
    df_copy['host_since'] = pd.to_datetime(df['host_since'])
    df_copy['host_since_year'] = df_copy['host_since'].dt.year
    
    df_copy['lux'] = df['description'].str.contains(r'\b(Valet|Sky Lounge|dry cleaning)\b', 
                                                    case=False, na=False)
    #df_copy['King bed'] = df['description'].str.contains(r'\b(King bed)\b', case=False, na=False)
    df_copy['wine storage'] = df['description'].str.contains(r'\b(wine storage)\b', case=False, na=False)
    df_copy['super bowl'] = df['description'].str.contains(r'\b(super bowl)\b', case=False, na=False)
    df_copy['Time Square'] = df['description'].str.contains(r'\b(Time Square)\b', case=False, na=False)
    df_copy['sqft'] = df['description'].str.contains(r'\b(sqft)\b', case=False, na=False)
    df_copy['private elevator'] = df['description'].str.contains(r'\b(private elevator|private keyed elevator)\b', 
                                                                 case=False, na=False)
    df_copy['Front desk'] = df['description'].str.contains(r'\b(Front desk)\b', 
                                                                         case=False, na=False)
    
    #df_copy['state-of-the-art'] = df['description'].str.contains(r'\b(state-of-the-art)\b', case=False, na=False)
    
    df_copy['shared'] = df['description'].str.contains(r'\b(shared|sharing)\b', 
                                                       case=False, na=False)

    return df_copy

In [1948]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33538 entries, 0 to 33537
Data columns (total 58 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                33538 non-null  int64  
 1   name                              33526 non-null  object 
 2   summary                           32266 non-null  object 
 3   space                             23038 non-null  object 
 4   description                       33230 non-null  object 
 5   neighborhood_overview             19948 non-null  object 
 6   notes                             13445 non-null  object 
 7   transit                           20796 non-null  object 
 8   access                            19304 non-null  object 
 9   interaction                       18658 non-null  object 
 10  house_rules                       19983 non-null  object 
 11  host_id                           33538 non-null  int64  
 12  host

In [6]:
# clean the dataframe
df_cleaned = cleaning(df)
X = df_cleaned
y = df_cleaned['price']

# Define columns to be one-hot encoded and numerical columns
categorical_cols = ['neighbourhood_group_cleansed', 'city', 'room_type', 
                    'review_scores_location', 'host_neighbourhood',
                    'review_scores_cleanliness','review_scores_rating', 'review_scores_checkin',
                    'instant_bookable', 'property_type']
numerical_cols = ['bedrooms', 'bathrooms', 'beds', 'number_of_reviews', 'Refrigerator', 
                  'Time Square', 'extra_people', 'lux', 'wine storage', 
                  'super bowl', 'sqft', 'private elevator', 'Front desk',
                  'host_listings_count', 'maximum_nights', 'wifi', 'gym', 'air', 'Dishwasher',
                  'guests_included', 'accommodates', 'reviews_per_month', 'shared', 'TV']

# Define preprocessing steps for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', max_categories=70))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ]
)

# Define the pipeline with preprocessing and model training steps
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('lgb', lgb.LGBMRegressor(n_estimators=200,
                                                     min_child_samples=16)) #7558.500639230459
                           ])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline (preprocessing and model) on the training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline on the testing data
mse = mean_squared_error(pipeline.predict(X_test), y_test)
print('Mean Squared Error:', mse)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003822 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1249
[LightGBM] [Info] Number of data points in the train set: 26830, number of used features: 191
[LightGBM] [Info] Start training from score 145.177190
Mean Squared Error: 7558.500639230459


In [3346]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, Binarizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from category_encoders import BinaryEncoder
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from sklearn.preprocessing import KBinsDiscretizer
from catboost import CatBoostRegressor

# Define columns to be one-hot encoded and numerical columns
categorical_cols = ['neighbourhood_group_cleansed', 'city', 'room_type', 
                    'review_scores_location', 'host_neighbourhood',
                    'review_scores_cleanliness','review_scores_rating', 'review_scores_checkin',
                    'instant_bookable', 'property_type']
numerical_cols = ['bedrooms', 'bathrooms', 'beds', 'number_of_reviews', 'Refrigerator', 
                  'Time Square', 'extra_people', 'lux', 'wine storage', 
                  'super bowl', 'sqft', 'private elevator', 'Front desk', 
                  'host_listings_count', 'maximum_nights', 'wifi', 'gym', 'air', 'Dishwasher',
                  'guests_included', 'accommodates', 'reviews_per_month', 'shared', 'TV']

# Define preprocessing steps for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ]
)

base_estimators = [
    ('xgb', XGBRegressor(n_estimators=140, max_depth=5, gamma=0.5, random_state=42)),
    ('lgb', lgb.LGBMRegressor(n_estimators=180, min_child_samples=16)),
    ('cat', CatBoostRegressor(iterations=1000, random_seed=42, verbose=0, depth=10)),
    ('gb', GradientBoostingRegressor(n_estimators=300, max_depth=16, random_state=42,
                                     criterion='squared_error', max_features='sqrt')) 
]

stacked_regressor = StackingRegressor(
    estimators=base_estimators
)

# Define the pipeline with preprocessing and model training steps
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('stacked_regressor', stacked_regressor)
                           ]) #7455.149868475568

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline (preprocessing and model) on the training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline on the testing data
mse = mean_squared_error(pipeline.predict(X_test), y_test)
print('Mean Squared Error:', mse)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1313
[LightGBM] [Info] Number of data points in the train set: 26830, number of used features: 223
[LightGBM] [Info] Start training from score 145.177190
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1269
[LightGBM] [Info] Number of data points in the train set: 21464, number of used features: 211
[LightGBM] [Info] Start training from score 145.516772
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

In [3348]:
# Evaluate the pipeline on all the data
pipeline.fit(X, y)
mse = mean_squared_error(pipeline.predict(X), y, squared=False)
print('Mean Squared Error:', mse) #52.570513026079624

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1371
[LightGBM] [Info] Number of data points in the train set: 33538, number of used features: 243
[LightGBM] [Info] Start training from score 145.177291
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1306
[LightGBM] [Info] Number of data points in the train set: 26830, number of used features: 222
[LightGBM] [Info] Start training from score 144.872009
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003757 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

In [3349]:
# Evaluate the pipeline on the training data
mse = mean_squared_error(pipeline.predict(X_test), y_test, squared=False)
print('Mean Squared Error:', mse) #86.43423233229402

Mean Squared Error: 57.86591306218374


In [3350]:
# Evaluate the pipeline on the training data
mse = mean_squared_error(pipeline.predict(X_train), y_train, squared=False)
print('Mean Squared Error:', mse) #52.570513026079624

Mean Squared Error: 57.03897374357588


In [2974]:
display(pipeline.predict(X_test))

array([128.38243265, 159.94697283, 200.59677705, ...,  93.01628181,
        69.74474314, 169.81378171])

In [2975]:
y_test

id
4907206     225.0
1559912      79.0
10658564    100.0
5373674      90.0
13006794     74.0
            ...  
21769592     88.0
10381948     80.0
8798499      62.0
15097126     65.0
29120288    140.0
Name: price, Length: 6708, dtype: float64

In [492]:
import pandas as pd

test_df = pd.read_csv('test.csv', low_memory=False)

In [388]:
test_df

Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,...,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,19307997,Super Lux 2BR in Downtown Manhattan,Prepare to be WOWED! This spectacularly bright...,"Top of the line Wolf and Sub-Zero appliances, ...",Prepare to be WOWED! This spectacularly bright...,none,,,,,...,,,,f,f,flexible,f,f,1,
1,20176193,Vintage Eclectic Brownstone Pad in Brooklyn,"Ideal for romantic, creative types, this is an...","Not your typical New York abode, my apartment ...","Ideal for romantic, creative types, this is an...",none,Bed Stuy is a diverse historic neighborhood wi...,This is an actual unique living experience whe...,Close to buses and subways there is also free ...,"Entrance hallway, living room, bedroom, kitche...",...,10.0,10.0,10.0,f,f,flexible,f,f,1,1.48
2,19485371,Spacious Harlem Hideaway,"Postive Vibes . This is our Harlem tree house,...",The private room is very spacious and cozy. Th...,"Postive Vibes . This is our Harlem tree house,...",none,You are in a Cultural Haven full of restaurant...,We also keep cucumber water in the fridge feel...,"Train, uber or a taxi. (Extremely taxi accessi...","Private Room, Kitchen And Bathroom",...,10.0,10.0,10.0,t,f,flexible,f,f,1,0.37
3,13079990,Spacius private room in Brooklyn,"Newly renovated apartment, its a 3 bedroom apa...","3 bedroom apartment, 1 full bathroom, living r...","Newly renovated apartment, its a 3 bedroom apa...",none,,,"There is the Mta 3 train Sutter stop, also the...",,...,9.0,8.0,9.0,f,f,flexible,f,f,1,0.23
4,22339757,*Dg) Delightful Private Room 20 min to Manhattan,Hi my home is only 2 blocks from the subway st...,,Hi my home is only 2 blocks from the subway st...,none,,,,,...,8.0,8.0,8.0,t,f,strict_14_with_grace_period,f,f,9,1.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17332,22325617,Charming room in Brooklyn,Charming newly renovated 2 bed apartment in Be...,,Charming newly renovated 2 bed apartment in Be...,none,,,"Though street parking is available, space is 2...",,...,,,,t,f,flexible,f,f,1,
17333,8372650,Luxurious 1BR in Herald Square,"-Apartment includes: Hardwood Floors, High Cei...",- ONE proper bed that can sleep 2. - ONE plush...,"-Apartment includes: Hardwood Floors, High Cei...",none,Smack in the middle of Manhattan. Near the Emp...,,,,...,,,,f,f,strict_14_with_grace_period,f,f,1,
17334,3812554,"Master Bedrm, Steam Shr/Jacuzzi, FH","Elegant room w/AC, King Bed, Jacuzzi and Steam...",This stately red brick federal on a quiet stre...,"Elegant room w/AC, King Bed, Jacuzzi and Steam...",none,It's easy to enjoy all that NYC has to offer f...,Guests are expected to respect others' needs f...,A three block walk to the Forest Hills/71st Rd...,Guests have access to the common areas of the ...,...,10.0,10.0,10.0,f,f,moderate,f,f,3,1.32
17335,18891508,Private rooms starting at $67 a night per person.,Family friendly neighborhood. Caribbean settin...,You have a choice of 1 of 4 bedrooms. Room#4 c...,Family friendly neighborhood. Caribbean settin...,none,The neighborhood has easy access to Manhattan....,Our place can also be available for $249 a nig...,Buses are 1 block away. A 10 minute ride in th...,"Coffee, tea, cocoa and various juices are prov...",...,10.0,9.0,10.0,f,f,moderate,f,f,4,0.29


In [3351]:
test_df_cleaned = cleaning(test_df)

In [3352]:
test_y = pipeline.predict(test_df_cleaned)
test_y

array([379.71413873, 150.56926441,  53.87383155, ...,  81.91204623,
        56.91122883, 120.88416828])

In [3353]:
output_df = pd.DataFrame()
output_df['Id'] = test_df['id']
output_df['Predicted'] = test_y
output_df.to_csv('linear_regression_model.csv', index = False)

In [3354]:
pd.read_csv('linear_regression_model.csv')

Unnamed: 0,Id,Predicted
0,19307997,379.714139
1,20176193,150.569264
2,19485371,53.873832
3,13079990,70.250235
4,22339757,51.699138
...,...,...
17332,22325617,77.556646
17333,8372650,352.085431
17334,3812554,81.912046
17335,18891508,56.911229
