## Import Libraries


In [1]:
import pandas as pd
import os
import json
import numpy as np

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# All data paths used further down in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Loading

Load all JSON files into one DataFrame (for further processing)

In [None]:
# Build two dataframes from the raw JSON exports:
#   df          - one row per Airbnb listing (listing-level info only)
#   df_reviews  - one row per review, with a few listing fields attached

# Accumulates the listing records of every JSON file
data = []

# Directories that contain the JSON files
paths = ['/content/drive/MyDrive/NLP/Data3/', '/content/drive/MyDrive/NLP/Data2/']

for path in paths:
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the dataframes (and of the CSVs written from them) reproducible
    for filename in sorted(os.listdir(path)):
        # Skip everything that is not a JSON file
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        # assumes each file holds a JSON array of listing records - TODO confirm
        data.extend(json_data)

# Fail early with a clear message instead of an obscure KeyError on 'location'
if not data:
    raise ValueError(f"No listing records found in {paths}")

# Listing-level dataframe
df = pd.DataFrame(data)

# Pull latitude / longitude out of the nested 'location' dict;
# rows whose 'location' is not a dict get NaN
df['lat'] = df['location'].apply(lambda loc: loc.get('lat') if isinstance(loc, dict) else np.nan)
df['lng'] = df['location'].apply(lambda loc: loc.get('lng') if isinstance(loc, dict) else np.nan)

# Keep only the relevant listing columns. The nested 'reviews' column is
# deliberately left out: reviews are handled in df_reviews, df only carries
# the additional info about each Airbnb.
df = df[['url', 'name', 'stars', 'numberOfGuests', 'address',
         'pricing', 'primaryHost', 'isHostedBySuperhost', 'lat', 'lng']]

# Review-level dataframe: one row per entry of each listing's 'reviews' list.
# Review fields get the 'review_' prefix; the listed listing fields are
# repeated on every review row (errors="ignore" fills missing ones with NaN).
df_reviews = pd.json_normalize(
    data,
    'reviews',
    ['url', 'name', 'stars', 'numberOfGuests', "isHostedBySuperhost"],
    record_prefix='review_',
    errors="ignore",
)

# Drop irrelevant review columns
df_reviews = df_reviews.drop(columns=[
    "review_collectionTag",
    "review_response",
    "review_localizedReview",
    "review_author.hasProfilePic",
    "review_author.pictureUrl",
    'review_recipient.hasProfilePic',
    "review_recipient.pictureUrl",
    'review_author.firstName',
    'review_author.smartName',
    'review_author.thumbnailUrl',
    'review_recipient.firstName',
    'review_recipient.id',
    'review_recipient.smartName',
    'review_recipient.thumbnailUrl',
    'review_localizedReview.comments',
    'review_localizedReview.needsTranslation',
    "review_localizedReview.response",
])

## Save to Google Drive

In [None]:
# Persist both uncleaned dataframes as CSV files on Google Drive (row index omitted)
for frame, csv_target in (
    (df_reviews, '/content/drive/MyDrive/NLP/Handin/Data/df_reviews_uncleaned.csv'),
    (df, '/content/drive/MyDrive/NLP/Handin/Data/df_uncleaned.csv'),
):
    frame.to_csv(csv_target, index=False)

## Overview of the Combined Dataset

In [3]:
# Reload the uncleaned dataframes from the CSV files on Google Drive.
# These are the same paths the save cell writes to, so the notebook can be
# resumed from here without re-parsing the raw JSON files.
# NOTE(review): nested values (e.g. the 'pricing' column) presumably come back
# as plain strings after the CSV round trip - confirm before parsing them.
df = pd.read_csv('/content/drive/MyDrive/NLP/Handin/Data/df_uncleaned.csv')
df_reviews = pd.read_csv('/content/drive/MyDrive/NLP/Handin/Data/df_reviews_uncleaned.csv')

In [4]:
# Number of rows: how many listings and how many reviews exist before cleaning
for label, frame in (
    ("Number of uncleaned Airbnbs: ", df),
    ("Number of uncleaned Reviews: ", df_reviews),
):
    print(label, len(frame))

Number of uncleaned Airbnbs:  16106
Number of uncleaned Reviews:  1077494


In [None]:
# Column names of the listing dataframe, followed by those of the review dataframe
for frame in (df, df_reviews):
    print(frame.columns)

Index(['url', 'name', 'stars', 'numberOfGuests', 'address', 'pricing',
       'primaryHost', 'isHostedBySuperhost', 'lat', 'lng'],
      dtype='object')
Index(['review_comments', 'review_createdAt', 'review_id', 'review_rating',
       'review_localizedDate', 'review_author.id', 'review_language',
       'review_localizedReview.disclaimer', 'url', 'name', 'stars',
       'numberOfGuests', 'isHostedBySuperhost'],
      dtype='object')


In [None]:
# Show the first 5 rows of the dataframe storing Airbnb info
print(df.head(n=5))

                                     url  \
0  https://www.airbnb.com/rooms/41747384   
1  https://www.airbnb.com/rooms/10395778   
2  https://www.airbnb.com/rooms/27311647   
3  https://www.airbnb.com/rooms/17842903   
4  https://www.airbnb.com/rooms/37596659   

                                          name  stars  numberOfGuests  \
0              Big and cozy Nørrebro apartment    NaN               4   
1                   Charming room in Ølstykke.    4.8               2   
2                              Home sweet home    NaN               5   
3             Albertslund´s Hyggeligste hjørne    NaN               3   
4  Modern Holiday Home in Skibby with Barbecue    2.5               4   

                address                                            pricing  \
0   Copenhagen, Denmark  {'rate': {'amount': 10, 'amountFormatted': '$1...   
1     Ølstykke, Denmark  {'rate': {'amount': 14, 'amountFormatted': '$1...   
2     Taastrup, Denmark  {'rate': {'amount': 29, 'amountFormat

In [None]:
# Show the first 5 rows of the dataframe storing review info
print(df_reviews.head(n=5))

                                     review_comments      review_createdAt  \
0                     Thank you for an amazing place  2023-05-25T12:41:39Z   
1  A beautiful presented property in quite and re...  2023-05-19T12:34:22Z   
2  I spent three months at Lise's accomodation. T...  2022-11-16T15:49:20Z   
3  In short: most of our stay (1 night) was fine....  2022-08-15T13:02:44Z   
4  We appreciate the privacy that we had. The pla...  2022-08-03T12:40:07Z   

            review_id  review_rating review_localizedDate review_author.id  \
0  899105257798730969              5             May 2023        515251200   
1  894752940183370410              5             May 2023        121978028   
2  761492335450220583              4        November 2022        458247565   
3  694004339972529250              3          August 2022        442755170   
4  685295648106250494              4          August 2022        106123106   

  review_language review_localizedReview.disclaimer  \
0      