In [None]:
import requests
import os
import shutil
import pandas as pd

In [None]:
# Remote locations of the two zipped source datasets
SOURCE_OFFERING_DATA_WEB_PATH = "https://www.cs.cmu.edu/~jiweil/offering.txt.zip"
SOURCE_REVIEW_DATA_WEB_PATH = "https://www.cs.cmu.edu/~jiweil/review.txt.zip"

# Local destinations for the downloaded archives
OFFERING_DATA_FILEPATH = "./../Input_Data/offering.zip"
REVIEW_DATA_FILEPATH = "./../Input_Data/review.zip"

# Make sure the folder that will hold the archives exists before anything is written to it
os.makedirs(os.path.dirname(OFFERING_DATA_FILEPATH), exist_ok=True)

In [None]:
def download_file(url, destination, label, timeout=60):
    """Stream the file at ``url`` to ``destination`` on disk.

    Parameters
    ----------
    url : str
        Web address of the file to fetch.
    destination : str
        Local path the response body is written to (opened in binary mode).
    label : str
        Human-readable dataset name used in the status messages.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Notes
    -----
    Nothing is written unless the server answers with status 200, so an
    HTTP error page can never be saved in place of the archive. A failed
    download is reported with a printed message rather than an exception.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download {label} data from source. Status code: {response.status_code}")
            return

        with open(destination, "wb") as file:
            # Write in chunks so a large archive is never held in memory all at once
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

    print(f"{label} data downloaded successfully!")


# Fetch Offering dataset from web
download_file(SOURCE_OFFERING_DATA_WEB_PATH, OFFERING_DATA_FILEPATH, "Offering")

# Fetch Review dataset from web
download_file(SOURCE_REVIEW_DATA_WEB_PATH, REVIEW_DATA_FILEPATH, "Review")

## Reading in the review and offerings data using Pandas

In [None]:
# Load the downloaded review archive; the file holds one JSON record per line
review_data_pd = pd.read_json(
    path_or_buf=REVIEW_DATA_FILEPATH,
    lines=True,
)
# review_data_pd

In [None]:
# Load the downloaded offerings archive; the file holds one JSON record per line
offerings_data_pd = pd.read_json(
    path_or_buf=OFFERING_DATA_FILEPATH,
    lines=True,
)
# offerings_data_pd

## Extract the individual values from columns that have dictionaries

In [None]:
# Pull the 'ratings' column out into its own single-column frame, then expand
# the dictionary stored in each row into one column per key
reviews_ratings = review_data_pd['ratings'].to_frame()
reviews_ratings_normalized = pd.json_normalize(reviews_ratings['ratings'])

# reviews_ratings_normalized

In [None]:
# Pull the 'author' column out of review_data_pd into its own single-column frame
reviews_author = review_data_pd['author'].to_frame()

# Expand each author dictionary into one column per key, and rename the
# author's 'id' column to 'author_id' so that it can be told apart from other ids
reviews_author_normalized = (
    pd.json_normalize(reviews_author['author'])
    .rename(columns = {'id': 'author_id'})
)
# reviews_author_normalized

## Remake the original dataframes with split out dictionaries

In [None]:
# Attach the expanded ratings and author columns to the original review data,
# side by side (column-wise concat aligns the frames on their row index)
# NOTE(review): if the ratings and author dictionaries share a key name, the
# result will contain duplicate column labels - verify against the data
review_frames = [review_data_pd, reviews_ratings_normalized, reviews_author_normalized]
review_data_updated = pd.concat(review_frames, axis = 1)

In [None]:
# The dictionary columns "author" and "ratings" are now redundant because their
# contents have been split out into separate columns, so drop both of them
review_data_updated = review_data_updated.drop(columns = ['author', 'ratings'])

# review_data_updated

In [None]:
# Pull the 'address' column out of offerings_data_pd into its own single-column
# frame, then expand the dictionary stored in each row into one column per key
offerings_address = offerings_data_pd['address'].to_frame()
offerings_address_normalized = pd.json_normalize(offerings_address['address'])

# offerings_address_normalized

In [None]:
# Attach the expanded address columns to the original offerings data, side by side
offering_frames = [offerings_data_pd, offerings_address_normalized]
offerings_data_updated = pd.concat(offering_frames, axis = 1)

In [None]:
# The 'address' dictionary column is redundant now that its contents live in
# their own columns, so drop it
offerings_data_updated = offerings_data_updated.drop(columns = 'address')

In [None]:
# Rename the offerings' 'id' column to 'offering_id', the key the two datasets
# are merged on
offering_id_rename = {'id': 'offering_id'}
offerings_data_updated = offerings_data_updated.rename(columns = offering_id_rename)

# offerings_data_updated

## Merging the offerings and reviews datasets together for final dataset creation

In [None]:
# Combine each review with its offering; only rows whose 'offering_id' appears
# in both frames are kept (inner join)
merged_offerings_reviews = review_data_updated.merge(
    offerings_data_updated,
    how = 'inner',
    on = 'offering_id',
)

# merged_offerings_reviews

In [None]:
# Write the complete merged dataframe as a gzipped CSV file
PROCESSED_DATA_FILEPATH = "./../processed_data/original_data_merged.csv.gz"

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing
os.makedirs(os.path.dirname(PROCESSED_DATA_FILEPATH), exist_ok=True)

merged_offerings_reviews.to_csv(PROCESSED_DATA_FILEPATH, index=False, compression="gzip")

## Final output: original_data_merged.csv.gz

### Next step: Filter review text language (performed in language_filtering.ipynb)