In [None]:
import pandas as pd
import numpy as np

In [None]:
# Show every float with exactly two decimal places in pandas output.
pd.set_option('display.float_format', '{:.2f}'.format)

# Calendar Data

In [None]:
# Quick look at the last rows of the raw calendar file; the loaded frame is
# only displayed here, not kept in a variable.
pd.read_csv('/content/calendar.csv.gz').tail()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
2282936,8964989,2026-03-10,f,$99.00,,2.0,1125.0
2282937,8964989,2026-03-11,f,$99.00,,2.0,1125.0
2282938,8964989,2026-03-12,f,$99.00,,3.0,1125.0
2282939,8964989,2026-03-13,f,$99.00,,3.0,1125.0
2282940,8964989,2026-03-14,f,$99.00,,3.0,1125.0


In [None]:
import os, gzip, pandas as pd, io, csv

# A. Confirm the archive is present and report its on-disk size
path = "/content/calendar.csv.gz"
archive_found = os.path.exists(path)
print("Exists:", archive_found)
archive_bytes = os.path.getsize(path)
print("Size (bytes):", archive_bytes)

# B. Show the first five decompressed lines verbatim (repr exposes the
#    trailing newline), each clipped to 200 characters
with gzip.open(path, "rt", encoding="utf-8", errors="replace") as handle:
    for line_no, raw_line in enumerate(handle):
        if line_no == 5:
            break
        print(repr(raw_line[:200]))


Exists: True
Size (bytes): 5678319
'listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights\n'
'3686,2025-03-13,t,$67.00,,,\n'
'3686,2025-03-14,t,$67.00,,31,365\n'
'3686,2025-03-15,t,$67.00,,31,365\n'
'3686,2025-03-16,t,$67.00,,31,365\n'


## Must use the archived 13 March 2025 data. The more recent (June) snapshot is empty/truncated.

In [None]:
# Load the full calendar file into the working DataFrame used by the
# cleaning steps below.
calendar = pd.read_csv('/content/calendar.csv.gz')

In [None]:
# Inspect column dtypes and memory footprint of the freshly loaded frame.
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282941 entries, 0 to 2282940
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   listing_id      int64  
 1   date            object 
 2   available       object 
 3   price           object 
 4   adjusted_price  float64
 5   minimum_nights  float64
 6   maximum_nights  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 121.9+ MB


# Converting PRICE to FLOAT Datatype

In [None]:
# Strip the dollar sign and any commas from PRICE so the column can be cast to
# float. The pattern is a raw string: the previous '[\$,]' literal contained
# the invalid escape sequence "\$", which makes Python emit a warning. Inside
# a character class "$" needs no escaping, so r'[$,]' matches the same text.
calendar['price'] = calendar['price'].replace(r'[$,]', '', regex=True).astype(float)

  calendar.price = calendar.price.replace('[\$,]','',regex=True).astype(float)


In [None]:
# Check that PRICE now displays as a plain number.
calendar.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,3686,2025-03-13,t,67.0,,,
1,3686,2025-03-14,t,67.0,,31.0,365.0
2,3686,2025-03-15,t,67.0,,31.0,365.0
3,3686,2025-03-16,t,67.0,,31.0,365.0
4,3686,2025-03-17,t,67.0,,31.0,365.0


# Can drop adjusted_price because it contains only null values

In [None]:
# Count the rows that actually carry an adjusted price.
calendar['adjusted_price'].notna().sum()

np.int64(0)

In [None]:
# Remove the adjusted_price column from the working frame.
calendar = calendar.drop(columns='adjusted_price')

# Converting Minimum Nights and Maximum Nights to INT datatype (from FLOAT)

### NOTE: This can't be done yet because both features contain NaN values, which cannot be stored in an integer (int64) column; the NaNs are imputed first.

In [None]:
# Frequency of each MIN NIGHTS value, most common first.
calendar['minimum_nights'].value_counts().sort_values(ascending=False)

Unnamed: 0_level_0,count
minimum_nights,Unnamed: 1_level_1
31.00,602122
2.00,571021
1.00,480803
3.00,318271
4.00,77676
...,...
127.00,5
99.00,4
305.00,4
11.00,2


In [None]:
# Set NaN values in the MIN NIGHTS feature to 1.
# The assignment goes through a single .loc call on the DataFrame itself:
# the previous chained form (calendar.minimum_nights.loc[...] = 1) triggers
# pandas' chained-assignment warning and will no longer update `calendar`
# once Copy-on-Write is the default.
calendar.loc[calendar['minimum_nights'].isna(), 'minimum_nights'] = 1

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  calendar.minimum_nights.loc[lambda x: x.isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calendar.mi

In [None]:
# Frequency of each MAX NIGHTS value, most common first
# (NaN in this column is imputed with 365).
calendar['maximum_nights'].value_counts().sort_values(ascending=False)

Unnamed: 0_level_0,count
maximum_nights,Unnamed: 1_level_1
1125.00,1139668
365.00,536971
30.00,76424
90.00,57851
28.00,52785
...,...
81.00,9
75.00,9
77.00,9
78.00,9


In [None]:
# Set NaN values in the MAX NIGHTS feature to 365.
# A single .loc call on the DataFrame replaces the previous chained form
# (calendar.maximum_nights.loc[...] = 365), which triggers pandas'
# chained-assignment warning and will not update `calendar` under
# Copy-on-Write.
calendar.loc[calendar['maximum_nights'].isna(), 'maximum_nights'] = 365

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calendar.maximum_nights.loc[lambda x: x.isna()] = 365


In [None]:
# Recode the MAX NIGHTS value 1125 (treated as an outlier here) to 365;
# every other value is left unchanged.
calendar['maximum_nights'] = calendar['maximum_nights'].replace({1125.0: 365})

Unnamed: 0,maximum_nights
0,365.00
1,365.00
2,365.00
3,365.00
4,365.00
...,...
2282936,365.00
2282937,365.00
2282938,365.00
2282939,365.00


# Converting Minimum Nights and Maximum Nights from Float64 datatype to int64

In [None]:
# Check dtypes before casting the nights columns to integers.
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282941 entries, 0 to 2282940
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   listing_id      int64  
 1   date            object 
 2   available       object 
 3   price           float64
 4   minimum_nights  float64
 5   maximum_nights  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 104.5+ MB


In [None]:
# Cast both nights columns from float64 to int64 in one pass.
calendar = calendar.astype({'minimum_nights': 'int64', 'maximum_nights': 'int64'})

# Converting LISTING_ID to OBJECT datatype, DATE to true DATETIME, and AVAILABLE to a BINARY TRUE/FALSE (bool) variable

In [None]:
# Confirm AVAILABLE has no missing values before converting it to bool.
calendar['available'].isnull().sum()

np.int64(0)

In [None]:
# listing_id -> object, date -> datetime64, available -> bool
# (True where the raw flag is 't', False for anything else).
calendar = calendar.assign(
    listing_id=calendar['listing_id'].astype('object'),
    date=pd.to_datetime(calendar['date']),
    available=calendar['available'].eq('t'),
)

In [None]:
# Verify the final dtypes of the cleaned frame.
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282941 entries, 0 to 2282940
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   listing_id      object        
 1   date            datetime64[ns]
 2   available       bool          
 3   price           float64       
 4   minimum_nights  int64         
 5   maximum_nights  int64         
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 89.3+ MB


# Exporting CLEANED Calendar Data

In [None]:
# Persist the cleaned frame as CSV in the working directory (note the file
# name has no .csv extension). index=False keeps the row index out of the file.
calendar.to_csv('cleaned_calendar', index=False)

In [None]:
# Preview the first rows of the cleaned frame.
calendar.head()

Unnamed: 0,listing_id,date,available,price,minimum_nights,maximum_nights
0,3686,2025-03-13,True,67.0,1,365
1,3686,2025-03-14,True,67.0,31,365
2,3686,2025-03-15,True,67.0,31,365
3,3686,2025-03-16,True,67.0,31,365
4,3686,2025-03-17,True,67.0,31,365


# Exploring the CLEANED data

In [None]:
# Distinct prices with their row counts, cheapest first (no price less than $1).
calendar['price'].value_counts().reset_index().sort_values('price')

Unnamed: 0,price,count
461,1.00,365
203,10.00,2190
372,12.00,730
492,20.00,365
444,22.00,365
...,...,...
459,5000.00,365
196,7000.00,2555
407,11894.00,365
61,17380.00,9130


In [None]:
# No records missing PRICE (expected result: 0).
calendar['price'].isnull().sum()

np.int64(0)

In [None]:
# Roughly balanced AVAILABLE and NOT AVAILABLE listings == GOOD
calendar['available'].value_counts()

Unnamed: 0_level_0,count
available,Unnamed: 1_level_1
True,1250599
False,1032342


# THINGS TO CONSIDER:

1) Should we remove outlier price listing instances? e.g., those listed for $1, etc. **YES**

# Data Cleaning Continued (9/22)

# Dropping Instances in which the listed PRICE feature is an outlier

In [None]:
# Reload the cleaned calendar export for the second cleaning pass.
# NOTE(review): dtypes are re-inferred from the CSV text, so `date` presumably
# comes back as object rather than datetime — confirm if dates are needed.
data = pd.read_csv('/content/cleaned_calendar')

In [None]:
# Row/column count of the reloaded data.
data.shape

(2282941, 6)

# Reordering the columns so that PRICE is the last column

In [None]:
# Desired column layout: identifiers and stay limits first, PRICE last.
column_order = [
    'listing_id',
    'date',
    'available',
    'minimum_nights',
    'maximum_nights',
    'price',
]
data = data[column_order]

# Making a column of Z Scores for PRICE and appending at end of dataframe. Also, making a boolean column that states whether the listed price is an outlier, depending on whether its PRICE lies outside of +/- 3 Standard Deviations from the Mean Price

In [None]:
# Standardise PRICE against its own mean and standard deviation, then flag
# rows whose |z-score| exceeds 3 with outlier = 1 (0 otherwise).
price_mean, price_std = data['price'].mean(), data['price'].std()
data = data.assign(price_ZScore=(data['price'] - price_mean) / price_std)
data = data.assign(outlier=np.where(data['price_ZScore'].abs() > 3, 1, 0))

In [None]:
data.shape # dimensions before dropping outliers (includes the two derived columns)

(2282941, 8)

In [None]:
# Index labels of the rows flagged as having an OUTLIER PRICE LISTING.
drop_rows = data.index[data['outlier'].eq(1)]

In [None]:
# Remove the flagged outlier rows by index label.
data = data.drop(index=drop_rows)

In [None]:
# The z-score and outlier flag were only needed for filtering; discard them.
data = data.drop(columns=['price_ZScore', 'outlier'])

In [None]:
data.shape # dimensions of the CALENDAR dataset after dropping outliers and the derived columns

(2255558, 6)

In [None]:
# Save the outlier-filtered frame as CSV in the working directory (file name
# has no .csv extension); index=False keeps the row index out of the file.
data.to_csv('cleaned_calendar_v2', index=False)