## Compose data

In [1]:
import ast
import os

import numpy as np
import pandas as pd

In [2]:
# Dataset name prefixes; each one selects a processed review workbook to load.
datasets = [
    "yelpchi",
    "yelpnyc",
]

def read_ds(ds_name):
    """Load one processed review workbook and tag its rows with the dataset name.

    Reads ``../datasets/processed/<ds_name>_reviews_calculated.xlsx``, stores
    ``ds_name`` in a ``dataset`` column, and returns the frame with its last
    column moved to the front (all other columns keep their relative order).
    """
    workbook_path = "../datasets/processed/{0}_reviews_calculated.xlsx".format(ds_name)
    reviews = pd.read_excel(workbook_path)
    reviews["dataset"] = ds_name

    # Rotate the column order by one position so the last column leads.
    column_order = list(reviews.columns)
    return reviews[column_order[-1:] + column_order[:-1]]

# Load each configured dataset, then stack the frames under a fresh 0..n-1 index.
per_dataset_frames = list(map(read_ds, datasets))
df = pd.concat(per_dataset_frames, ignore_index=True)

df.head()

Unnamed: 0,dataset,user,product,rating,label,date,content_text,content_polarity,content_subjectivity,content_words,content_sentences,content_emotion
0,yelpchi,yelpchi_user_00000,yelpchi_product_000,5,organic,2011-06-08,Let me begin by saying that there are two kind...,0.025,0.57,"['Let', 'me', 'begin', 'by', 'saying', 'that',...","[""Let me begin by saying that there are two ki...",neutral
1,yelpchi,yelpchi_user_00001,yelpchi_product_000,3,organic,2011-08-30,The only place inside the Loop that you can st...,-0.047,0.625,"['The', 'only', 'place', 'inside', 'the', 'Loo...",['The only place inside the Loop that you can ...,neutral
2,yelpchi,yelpchi_user_00002,yelpchi_product_000,5,organic,2009-06-26,I have walked by the Tokyo Hotel countless tim...,0.172,0.615,"['I', 'have', 'walked', 'by', 'the', 'Tokyo', ...",['I have walked by the Tokyo Hotel countless t...,neutral
3,yelpchi,yelpchi_user_00003,yelpchi_product_000,1,organic,2010-09-16,"If you are considering staying here, watch thi...",0.25,0.333,"['If', 'you', 'are', 'considering', 'staying',...","['If you are considering staying here, watch t...",neutral
4,yelpchi,yelpchi_user_00004,yelpchi_product_000,3,organic,2010-02-05,"This place is disgusting, absolutely horrible,...",-0.435,0.505,"['This', 'place', 'is', 'disgusting', 'absolut...","['This place is disgusting, absolutely horribl...",disgust


In [8]:
# Summarise the combined frame: column names, non-null counts and dtypes.
# NOTE(review): the saved output of this cell already lists 11 columns without
# `content_sentences`, although that column is only dropped by the following cell.
# The output appears to come from an out-of-order run - re-run top to bottom to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426335 entries, 0 to 426334
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   dataset               426335 non-null  object 
 1   user                  426335 non-null  object 
 2   product               426335 non-null  object 
 3   rating                426335 non-null  int64  
 4   label                 426335 non-null  object 
 5   date                  426335 non-null  object 
 6   content_text          426335 non-null  object 
 7   content_polarity      426335 non-null  float64
 8   content_subjectivity  426335 non-null  float64
 9   content_words         426335 non-null  object 
 10  content_emotion       426335 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 35.8+ MB


In [9]:
def _parse_word_list(raw):
    """Turn the stored text form of a token list back into a real ``list``.

    ``content_words`` comes back from the workbook as the printed form of a
    Python list (e.g. ``"['Let', 'me', 'begin']"``). Parsing it as a literal
    handles tokens that are printed with double quotes (such as ``"n't"``),
    escaped characters and the empty list, which splitting on ``", "`` and
    stripping ``'`` does not.

    Parameters
    ----------
    raw : str or list
        Cell value of ``content_words``; a ``list`` is returned unchanged so
        the cell can be re-run on an already converted frame.

    Returns
    -------
    list
        The tokens of the review.
    """
    if isinstance(raw, list):
        # Already converted by an earlier run of this cell.
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # Not a valid list literal: keep the previous best-effort split instead of failing.
    return [token.strip("'") for token in raw.lstrip("[").rstrip("]").split(", ")]


# errors="ignore" keeps the cell re-runnable once the column has been removed.
df = df.drop(columns=["content_sentences"], errors="ignore")
df["content_words"] = df["content_words"].apply(_parse_word_list)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426335 entries, 0 to 426334
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   dataset               426335 non-null  object 
 1   user                  426335 non-null  object 
 2   product               426335 non-null  object 
 3   rating                426335 non-null  int64  
 4   label                 426335 non-null  object 
 5   date                  426335 non-null  object 
 6   content_text          426335 non-null  object 
 7   content_polarity      426335 non-null  float64
 8   content_subjectivity  426335 non-null  float64
 9   content_words         426335 non-null  object 
 10  content_emotion       426335 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 35.8+ MB


In [11]:
# Write the whole frame as one JSON array holding one object per row.
df.to_json(
    path_or_buf="../datasets/processed/yelp_mongo_dumping.json",
    orient="records",
)

## Dump to MongoDB

In [12]:
data_path = os.path.abspath("../datasets/processed/yelp_mongo_dumping.json")

!docker stop mongo 
!docker volume remove mongodbdata && docker volume create mongodbdata
!docker run \
    -v mongodbdata:/data/db \
    -v {data_path}:/data/yelp_mongo_dumping.json \
    -p 27017:27017 \
    --rm -it -d --name mongo mongo

mongo
mongodbdata
mongodbdata
557c59e5e101cef9f75d9da0ecf2b8f7b42574bb2aaa76898a096c7678af8202


In [13]:
!docker exec -it mongo mongoimport \
    --uri "mongodb://localhost:27017/?retryWrites=true&w=majority" \
    --collection="yelp_reviews" \
    --db cmp1044 \
    --collection yelp_reviews \
    --file /data/yelp_mongo_dumping.json \
    --jsonArray

2024-01-11T05:37:52.078+0000	connected to: mongodb://localhost:27017/?retryWrites=true&w=majority
2024-01-11T05:37:55.078+0000	[........................] cmp1044.yelp_reviews	29.4MB/731MB (4.0%)
2024-01-11T05:37:58.079+0000	[##......................] cmp1044.yelp_reviews	64.5MB/731MB (8.8%)
2024-01-11T05:38:01.080+0000	[###.....................] cmp1044.yelp_reviews	97.6MB/731MB (13.4%)
2024-01-11T05:38:04.079+0000	[####....................] cmp1044.yelp_reviews	131MB/731MB (18.0%)
2024-01-11T05:38:07.079+0000	[#####...................] cmp1044.yelp_reviews	166MB/731MB (22.7%)
2024-01-11T05:38:10.080+0000	[######..................] cmp1044.yelp_reviews	200MB/731MB (27.3%)
2024-01-11T05:38:13.079+0000	[#######.................] cmp1044.yelp_reviews	235MB/731MB (32.1%)
2024-01-11T05:38:16.086+0000	[########................] cmp1044.yelp_reviews	267MB/731MB (36.5%)
2024-01-11T05:38:19.079+0000	[#########...............] cmp1044.yelp_reviews	298MB/731MB (40.8%)
2024-01-11T05:38:22.079+0000

In [14]:
import urllib.parse
from pymongo import MongoClient

# Connect to the MongoDB server on localhost:27017.
client = MongoClient("mongodb://localhost:27017/test")

# Read every document of cmp1044.yelp_reviews back into a frame to inspect the import.
collection = client["cmp1044"]["yelp_reviews"]
cursor = collection.find({})
df_dumped = pd.DataFrame(list(cursor))
df_dumped.head()

Unnamed: 0,_id,dataset,user,product,rating,label,date,content_text,content_polarity,content_subjectivity,content_words,content_emotion
0,659f7eb07722ec561c229b73,yelpchi,yelpchi_user_00001,yelpchi_product_000,3,organic,2011-08-30,The only place inside the Loop that you can st...,-0.047,0.625,"[The, only, place, inside, the, Loop, that, yo...",neutral
1,659f7eb07722ec561c229b74,yelpchi,yelpchi_user_00004,yelpchi_product_000,3,organic,2010-02-05,"This place is disgusting, absolutely horrible,...",-0.435,0.505,"[This, place, is, disgusting, absolutely, horr...",disgust
2,659f7eb07722ec561c229b75,yelpchi,yelpchi_user_00005,yelpchi_product_000,1,organic,2010-08-09,Disgusting!!! There is literally duct tape hol...,-0.265,0.557,"[Disgusting, There, is, literally, duct, tape,...",disgust
3,659f7eb07722ec561c229b76,yelpchi,yelpchi_user_00000,yelpchi_product_000,5,organic,2011-06-08,Let me begin by saying that there are two kind...,0.025,0.57,"[Let, me, begin, by, saying, that, there, are,...",neutral
4,659f7eb07722ec561c229b77,yelpchi,yelpchi_user_00007,yelpchi_product_001,2,organic,2012-07-09,"Good location, really run down. I am surprised...",0.211,0.497,"[Good, location, really, run, down, I, am, sur...",surprise
