In [46]:
import pandas as pd

In [47]:
# Forward slashes are accepted by Windows as well as macOS/Linux, whereas the
# previous backslash-separated path only resolved on Windows.
data = pd.read_csv('../0.data/raw/train.csv')
data.shape

(74111, 29)

1. Drop Columns
2. Type Conversion
3. Imputation
4. Standardization
5. Binning
6. Dummy Creation

### 1. Drop Columns

In [48]:
# Columns excluded from this simple preprocessing pass.
# (No line-continuation backslashes are needed inside the brackets.)
columnsToBeDropped = [
    'id', 'description', 'first_review', 'host_response_rate',
    'host_since', 'last_review', 'latitude', 'longitude',
    'name', 'neighbourhood', 'thumbnail_url', 'zipcode',
]
data.drop(columns=columnsToBeDropped, inplace=True)
data.shape

(74111, 17)

### 2. Type Conversion

Before


In [49]:
# Column dtypes as they stand after the drop, before any type conversion.
data.dtypes

log_price                 float64
property_type              object
room_type                  object
amenities                  object
accommodates                int64
bathrooms                 float64
bed_type                   object
cancellation_policy        object
cleaning_fee                 bool
city                       object
host_has_profile_pic       object
host_identity_verified     object
instant_bookable           object
number_of_reviews           int64
review_scores_rating      float64
bedrooms                  float64
beds                      float64
dtype: object

### Attributes

* Categorical
    1. property_type = 35 Types
    2. room_type = 3 Types
    3. bed_type = 5
    4. cancellation_policy = 5
    5. cleaning_fee = 2
    6. city = 6
    ...


* Numerical
    1. Accommodates
    2. Bathrooms
    ...
    
    
* More Preprocessing Required
    1. Amenities - Textual Data (Needs to be categorised)

In [50]:
# Columns treated as categorical features.
categoryColumns = [
    "property_type", "room_type", "bed_type", "cancellation_policy",
    "cleaning_fee", "city", "host_has_profile_pic",
    "host_identity_verified", "instant_bookable",
]

# Columns treated as numerical features (all cast to float).
numericalColumns = [
    'accommodates', 'bathrooms', 'number_of_reviews',
    'review_scores_rating', 'bedrooms', 'beds',
]

# DataFrame.astype converts each selected column on its own, so every
# categorical column keeps its own set of categories.
data[categoryColumns] = data[categoryColumns].astype('category')
data[numericalColumns] = data[numericalColumns].astype('float')
data.dtypes

log_price                  float64
property_type             category
room_type                 category
amenities                   object
accommodates               float64
bathrooms                  float64
bed_type                  category
cancellation_policy       category
cleaning_fee              category
city                      category
host_has_profile_pic      category
host_identity_verified    category
instant_bookable          category
number_of_reviews          float64
review_scores_rating       float64
bedrooms                   float64
beds                       float64
dtype: object

Amenities is left as it is

### 3. Imputation

In [51]:
# Number of missing values per column before imputation.
data.isna().sum()

log_price                     0
property_type                 0
room_type                     0
amenities                     0
accommodates                  0
bathrooms                   200
bed_type                      0
cancellation_policy           0
cleaning_fee                  0
city                          0
host_has_profile_pic        188
host_identity_verified      188
instant_bookable              0
number_of_reviews             0
review_scores_rating      16722
bedrooms                     91
beds                        131
dtype: int64

#### Imputing Bedrooms, Bathrooms, and Beds

In [52]:
def impute_bedrooms_bathrooms_beds(data):
    '''
    Fill missing bathrooms / bedrooms / beds values from one another, in place.

    LOGIC - Number of bedrooms = bathrooms and vice versa.
    Rules, applied in this order:
      1. bathrooms missing, bedrooms known -> bathrooms = bedrooms
      2. bedrooms missing, bathrooms known -> bedrooms = bathrooms
      3. beds missing, bedrooms known (after rule 2) -> beds = bedrooms * 2

    Rows where both bathrooms and bedrooms are missing are left untouched,
    and so are their missing beds values.

    The work is done with positional boolean masks instead of a Python-level
    ``iterrows`` loop, so it is fast on large frames and does not depend on
    the index labels being unique.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns 'bathrooms', 'bedrooms' and 'beds'.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after imputation. A one-line summary of how
        many values were filled per column is printed as a side effect.
    '''
    bathrooms_missing = data['bathrooms'].isna().to_numpy()
    bedrooms_missing = data['bedrooms'].isna().to_numpy()

    # The two masks are mutually exclusive: a row can only borrow from the
    # column that is actually present.
    fill_bathrooms = bathrooms_missing & ~bedrooms_missing
    fill_bedrooms = bedrooms_missing & ~bathrooms_missing

    # .to_numpy() on the right-hand side skips label alignment, which would
    # fail on a non-unique index.
    if fill_bathrooms.any():
        data.loc[fill_bathrooms, 'bathrooms'] = data.loc[fill_bathrooms, 'bedrooms'].to_numpy()
    if fill_bedrooms.any():
        data.loc[fill_bedrooms, 'bedrooms'] = data.loc[fill_bedrooms, 'bathrooms'].to_numpy()

    # Computed after the bedrooms fill so freshly imputed bedrooms count too.
    fill_beds = (data['beds'].isna() & data['bedrooms'].notna()).to_numpy()
    if fill_beds.any():
        data.loc[fill_beds, 'beds'] = data.loc[fill_beds, 'bedrooms'].to_numpy() * 2

    bathroomC = int(fill_bathrooms.sum())
    bedroomC = int(fill_bedrooms.sum())
    bedsC = int(fill_beds.sum())

    print('{} counts of Bathroom, {} counts of Bedrooms, {} of Beds attr. are imputed' .format(bathroomC, bedroomC, bedsC))
    return data

In [53]:
def impute_hostHasPic_identityVerification(data):
    '''
    Fill host_has_profile_pic and host_identity_verified from each other, in place.

    Logic
    1. host_has_profile_pic missing, host_identity_verified known:
       copy the verification value into the profile-pic column.
    2. host_identity_verified missing, host_has_profile_pic known:
       copy the profile-pic value into the verification column.

    Rows where both columns are missing are left untouched.

    The fill uses positional boolean masks rather than a row-by-row
    ``iterrows`` loop. If the target column is categorical, the copied values
    must already be among its categories (pandas raises otherwise).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns 'host_has_profile_pic' and
        'host_identity_verified'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after imputation.
    '''
    pic_missing = data['host_has_profile_pic'].isna().to_numpy()
    verified_missing = data['host_identity_verified'].isna().to_numpy()

    # Mutually exclusive masks: each row borrows only from the column that
    # is actually present.
    fill_pic = pic_missing & ~verified_missing
    fill_verified = verified_missing & ~pic_missing

    # Guarded so that an empty mask never touches (or re-casts) the column.
    if fill_pic.any():
        data.loc[fill_pic, 'host_has_profile_pic'] = \
            data.loc[fill_pic, 'host_identity_verified'].to_numpy()
    if fill_verified.any():
        data.loc[fill_verified, 'host_identity_verified'] = \
            data.loc[fill_verified, 'host_has_profile_pic'].to_numpy()

    return data

In [54]:
def impute_review_scores_rating(data):
    '''
    Set review_scores_rating to 0 for listings that have no reviews, in place.

    Logic - If No. of reviews = 0 then, Review Scores Rating = 0.
    Only ratings that are currently missing are filled; a missing rating on a
    listing with one or more reviews is left as NaN.

    The fill is a single boolean-mask assignment instead of a row-by-row
    ``iterrows`` loop.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns 'number_of_reviews' and
        'review_scores_rating'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after imputation. The number of filled
        ratings is printed as a side effect.
    '''
    unrated_without_reviews = (
        (data['number_of_reviews'] == 0) & data['review_scores_rating'].isna()
    ).to_numpy()

    # Guarded so that an empty mask never touches the column.
    if unrated_without_reviews.any():
        data.loc[unrated_without_reviews, 'review_scores_rating'] = 0

    count = int(unrated_without_reviews.sum())
    print('{} counts of Review scores rating had 0 number of reviews. Hence, rating for them is also 0.' .format(count))
    return data

Impute

In [55]:
# Apply the three imputers one after another; each takes the frame and
# returns it, so the result of one feeds the next.
imputation_steps = (
    impute_bedrooms_bathrooms_beds,
    impute_hostHasPic_identityVerification,
    impute_review_scores_rating,
)
for impute_step in imputation_steps:
    data = impute_step(data)

# Missing values that remain after imputation.
print(data.isnull().sum())

195 counts of Bathroom, 86 counts of Bedrooms, 128 of Beds attr. are imputed
15819 counts of Review scores rating had 0 number of reviews. Hence, rating for them is also 0.
log_price                   0
property_type               0
room_type                   0
amenities                   0
accommodates                0
bathrooms                   5
bed_type                    0
cancellation_policy         0
cleaning_fee                0
city                        0
host_has_profile_pic      188
host_identity_verified    188
instant_bookable            0
number_of_reviews           0
review_scores_rating      903
bedrooms                    5
beds                        3
dtype: int64


In [63]:
# For the remaining NaNs, the rows are simply dropped
# (dropna defaults to row-wise removal of any row containing a NaN).
data = data.dropna()
print(data.isnull().sum())
print(data.shape)

log_price                 0
property_type             0
room_type                 0
amenities                 0
accommodates              0
bathrooms                 0
bed_type                  0
cancellation_policy       0
cleaning_fee              0
city                      0
host_has_profile_pic      0
host_identity_verified    0
instant_bookable          0
number_of_reviews         0
review_scores_rating      0
bedrooms                  0
beds                      0
dtype: int64
(73026, 17)


Save this processed data

In [66]:
# Forward slashes are accepted by Windows as well as macOS/Linux; with the
# previous backslash-separated string, POSIX systems would treat the whole
# text as a single file name in the current directory.
data.to_csv('../0.data/processed/train_EDA_simple.csv', index=False, header=True)