In [92]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cluster import KMeans
import seaborn as sns
# Show every column when a wide DataFrame is displayed. The fully qualified key is
# used on purpose: the bare pattern 'max_columns' also matches
# 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError.
pd.set_option('display.max_columns', None)

In [93]:
# Read the calendar export of each city.
boston_calendar, seattle_calendar = (
    pd.read_csv("boston_airbnb_data/calendar.csv"),
    pd.read_csv("seattle_airbnb_data/calendar.csv"),
)
# Read the listings export of each city.
boston_listings, seattle_listings = (
    pd.read_csv("boston_airbnb_data/listings.csv"),
    pd.read_csv("seattle_airbnb_data/listings.csv"),
)

In [94]:
# Stack the Boston and Seattle calendars into a single frame with fresh row labels.
df_calendar = pd.concat([boston_calendar, seattle_calendar], axis=0, ignore_index=True)
# The price column is not needed to compute occupancy.
df_calendar = df_calendar.drop('price', axis=1)
# One-hot encode 'available' into 'available_f' (false) and 'available_t' (true).
df_calendar = pd.concat(
    [
        df_calendar.drop('available', axis=1),
        pd.get_dummies(df_calendar['available'], prefix='available', prefix_sep='_'),
    ],
    axis=1,
)
# Count unavailable / available days per listing. Only the two indicator columns are
# aggregated: a blanket .sum() would also try to sum every other column left in the
# calendar (presumably the date strings -- TODO confirm), which is slow on recent
# pandas and leaks a meaningless column into the result.
availability_counts = df_calendar.groupby("listing_id")[["available_f", "available_t"]].sum()
# Occupancy rate in percent: days marked unavailable (treated as occupied) divided by
# the total number of days recorded for the listing. The result is a one-column frame
# indexed by listing_id.
df_occupation = (
    availability_counts["available_f"] * 100
    / (availability_counts["available_f"] + availability_counts["available_t"])
).to_frame("occupation_percentage")
# Drop the columns that exist only in the Boston export so both frames share a schema.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are already
# gone and a plain drop would raise KeyError.
boston_listings = boston_listings.drop(
    ['access', 'interaction', 'house_rules'], axis=1, errors='ignore'
)
# Stack the listings of both cities; ignore_index gives unique row labels, matching
# how the calendars are combined.
df_listings = pd.concat([boston_listings, seattle_listings], axis=0, ignore_index=True)

In [95]:
# Build one consolidated frame: all listing attributes plus the occupancy rate that was
# computed from the calendar. The inner join keeps only listings found on both sides.
df = df_listings.merge(df_occupation, left_on="id", right_on="listing_id", how="inner")

In [96]:
# Split the consolidated frame by city, matching the exact 'city' labels.
df_boston = df.loc[df["city"].eq("Boston")]
df_seattle = df.loc[df["city"].eq("Seattle")]

In [63]:
# Columns kept for the amenities exploration further down.
columns_of_interest = [
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'amenities',
    'city',
]

# Narrow df to those columns. 'occupation_percentage' is not among them, so the
# per-city frames have to be split off from df before this cell runs.
df = df[columns_of_interest]


In [97]:
# Columns used for the Boston model: the occupancy target plus the listing features.
columns_of_interest = [
    'occupation_percentage',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
]

# Keep only those columns and discard every listing with a missing value in any of them.
df_boston = df_boston[columns_of_interest].dropna()

In [100]:
# Display the Boston modelling frame.
# NOTE: the saved output was captured at execution count 100, i.e. after the
# dummy-encoding cell (count 99) had run, so it already shows the encoded columns.
df_boston

Unnamed: 0,occupation_percentage,accommodates,bathrooms,bedrooms,beds,property_type_Bed & Breakfast,property_type_Boat,property_type_Condominium,property_type_Dorm,property_type_Entire Floor,property_type_Guesthouse,property_type_House,property_type_Loft,property_type_Other,property_type_Townhouse,property_type_Villa,room_type_Private room,room_type_Shared room,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
0,100.000000,4,1.5,2.0,3.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,1.643836,2,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,12.602740,2,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,73.150685,4,1.0,1.0,2.0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
4,8.493151,2,1.5,1.0,2.0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3533,100.000000,4,1.0,1.0,2.0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
3534,98.904110,2,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3535,100.000000,2,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3559,0.000000,7,1.0,1.0,5.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [99]:
# Names of the object-dtype (categorical) columns still present in the frame.
cat_vars = df_boston.select_dtypes(include=['object']).columns
# Replace each of them with its dummy columns in a single call. Dummies are named
# '<column>_<level>' and appended after the remaining columns; the first level of
# every variable is dropped.
df_boston = pd.get_dummies(df_boston, columns=list(cat_vars), prefix_sep='_', drop_first=True)

In [101]:
# Inspect the encoded frame: the target and numeric features followed by the dummy columns.
df_boston

Unnamed: 0,occupation_percentage,accommodates,bathrooms,bedrooms,beds,property_type_Bed & Breakfast,property_type_Boat,property_type_Condominium,property_type_Dorm,property_type_Entire Floor,property_type_Guesthouse,property_type_House,property_type_Loft,property_type_Other,property_type_Townhouse,property_type_Villa,room_type_Private room,room_type_Shared room,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
0,100.000000,4,1.5,2.0,3.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,1.643836,2,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,12.602740,2,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,73.150685,4,1.0,1.0,2.0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
4,8.493151,2,1.5,1.0,2.0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3533,100.000000,4,1.0,1.0,2.0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
3534,98.904110,2,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3535,100.000000,2,1.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3559,0.000000,7,1.0,1.0,5.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [103]:
# Split the encoded Boston frame into the response vector y and the feature matrix X.
y = df_boston['occupation_percentage']
X = df_boston.drop('occupation_percentage', axis=1)
# Hold out 30% of the listings for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)
# Ordinary least squares. The former `normalize=True` argument was removed in
# scikit-learn 1.2 (passing it raises TypeError). For plain least squares the rescaling
# should not change the fitted predictions beyond numerical precision, so it is dropped
# rather than replaced by a scaler.
lm_model = LinearRegression()
# Fit on the training split only.
lm_model.fit(X_train, y_train)
# Predict the response for both the test and the training split.
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)
# R-squared on each split; comparing the two shows how well the fit generalises.
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

In [107]:
# Coefficients of the fitted linear model, one per feature column.
# NOTE(review): these are raw regression slopes for features on different scales, so
# their magnitudes are not directly comparable as "importance" -- confirm intent.
importance = lm_model.coef_

In [108]:
# Show the coefficient array; entries are unlabeled, so read them against X's column order.
importance

array([-2.71215540e+00, -1.41673408e+00,  2.64741040e+00, -4.52274447e-01,
       -2.88698481e+01, -3.04911609e+01,  6.11174990e+00,  2.84261972e+01,
        5.19651083e+00,  3.19744231e-14, -1.48369323e+01,  8.77815997e+00,
       -2.71432255e+01,  1.07757334e+01,  2.43064268e+01, -2.47937662e-01,
       -4.30788667e+00,  2.00668023e+01,  2.23694528e+01,  2.16765901e+01,
        2.10903436e+01])

In [104]:
# R-squared on the 30% hold-out split. The saved value (~0.01) means the model explains
# almost none of the variance in occupancy for unseen listings.
test_score

0.011276223789644657

In [105]:
# R-squared on the training split. The saved value (~0.05) is low as well, so the weak
# test score reflects a poor fit overall rather than overfitting alone.
train_score

0.04611726121849047

In [24]:
# Take the raw amenities string of the row labelled 0 as a sample to prototype the cleaning.
my_string = df.at[0, "amenities"]

In [25]:
# Show the raw sample: a brace-wrapped, comma-separated list with quoted multi-word names.
my_string

'{TV,"Wireless Internet",Kitchen,"Free Parking on Premises","Pets live on this property",Dog(s),Heating,"Family/Kid Friendly",Washer,Dryer,"Smoke Detector","Fire Extinguisher",Essentials,Shampoo,"Laptop Friendly Workspace"}'

In [26]:
# Wrapper characters to remove from the sample: both quote styles and the braces.
replacements = ['"', "'", "{", "}"]
# Delete all of them in a single pass with a translation table.
my_string = my_string.translate(str.maketrans("", "", "".join(replacements)))

In [29]:
# Show the sample after the quotes and braces have been stripped.
my_string

'TV,Wireless Internet,Kitchen,Free Parking on Premises,Pets live on this property,Dog(s),Heating,Family/Kid Friendly,Washer,Dryer,Smoke Detector,Fire Extinguisher,Essentials,Shampoo,Laptop Friendly Workspace'

In [28]:
# Split the cleaned sample on commas to obtain the individual amenity names.
my_string.split(",")

['TV',
 'Wireless Internet',
 'Kitchen',
 'Free Parking on Premises',
 'Pets live on this property',
 'Dog(s)',
 'Heating',
 'Family/Kid Friendly',
 'Washer',
 'Dryer',
 'Smoke Detector',
 'Fire Extinguisher',
 'Essentials',
 'Shampoo',
 'Laptop Friendly Workspace']

In [65]:
# Wrapper characters found in the raw amenities strings: both quote styles and the braces.
replacements = ['"', "'", "{", "}"]

# Strip the wrapper characters for the whole column at once instead of iterating over
# rows and writing back cell by cell. regex=False makes the replacement literal, so
# '{' and '}' are not interpreted as pattern syntax.
cleaned_amenities = df['amenities']
for char in replacements:
    cleaned_amenities = cleaned_amenities.str.replace(char, "", regex=False)

# Rebind df to a frame carrying the cleaned column. Re-running the cell is harmless:
# already-clean strings pass through unchanged.
df = df.assign(amenities=cleaned_amenities)

# Flatten every listing's comma-separated amenities into one list, in row order.
# Rows with a missing amenities value are skipped instead of raising TypeError; an
# empty string still contributes a single '' entry, as str.split does.
all_amenities = cleaned_amenities.dropna().str.split(",").explode().tolist()

In [70]:
# Deduplicate the flattened list into the set of distinct amenity labels.
a = set(all_amenities)

In [71]:
# Inspect the distinct labels. The saved output also contains an empty string and two
# 'translation missing: ...' placeholders, which are not real amenity names.
a

{'',
 '24-Hour Check-in',
 'Air Conditioning',
 'Breakfast',
 'Buzzer/Wireless Intercom',
 'Cable TV',
 'Carbon Monoxide Detector',
 'Cat(s)',
 'Dog(s)',
 'Doorman',
 'Dryer',
 'Elevator in Building',
 'Essentials',
 'Family/Kid Friendly',
 'Fire Extinguisher',
 'First Aid Kit',
 'Free Parking on Premises',
 'Free Parking on Street',
 'Gym',
 'Hair Dryer',
 'Hangers',
 'Heating',
 'Hot Tub',
 'Indoor Fireplace',
 'Internet',
 'Iron',
 'Kitchen',
 'Laptop Friendly Workspace',
 'Lock on Bedroom Door',
 'Other pet(s)',
 'Paid Parking Off Premises',
 'Pets Allowed',
 'Pets live on this property',
 'Pool',
 'Safety Card',
 'Shampoo',
 'Smoke Detector',
 'Smoking Allowed',
 'Suitable for Events',
 'TV',
 'Washer',
 'Washer / Dryer',
 'Wheelchair Accessible',
 'Wireless Internet',
 'translation missing: en.hosting_amenity_49',
 'translation missing: en.hosting_amenity_50'}

In [73]:
# Check the column dtypes of the narrowed frame; 'amenities' is still one object column
# holding the cleaned comma-separated string.
df.dtypes

property_type     object
room_type         object
accommodates       int64
bathrooms        float64
bedrooms         float64
beds             float64
bed_type          object
amenities         object
city              object
dtype: object