In [30]:
import requests
import pandas as pd

### Example URL request

In [2]:
# Example query against the local Solr "tech_products" select endpoint:
# search for "viewsonic" and return at most 20 rows.
url = (
    "http://localhost:8983/solr/tech_products/select"
    "?indent=true&q.op=OR&q=viewsonic&rows=20&useParams="
)

In [3]:
# Send the example query to Solr.
# The timeout keeps this cell from hanging indefinitely when the local server is
# not reachable, and raise_for_status() reports an HTTP error here rather than as
# a JSON-decoding failure in the next cell.
response = requests.get(url, timeout=10)
response.raise_for_status()

In [4]:
# Decode the response body as JSON and print the resulting dict.
response_json = response.json()
print(response_json)

{'responseHeader': {'status': 0, 'QTime': 0, 'params': {'q': 'viewsonic', 'indent': 'true', 'q.op': 'OR', 'rows': '20', 'useParams': ''}}, 'response': {'numFound': 1, 'start': 0, 'numFoundExact': True, 'docs': [{'id': 'VA902B', 'name': 'ViewSonic VA902B - flat panel display - TFT - 19"', 'manu': 'ViewSonic Corp.', 'manu_id_s': 'viewsonic', 'cat': ['electronics and stuff2'], 'features': ['19" TFT active matrix LCD, 8ms response time, 1280 x 1024 native resolution'], 'weight': 190.4, 'price': 279.95, 'price_c': '279.95,USD', 'popularity': 6, 'inStock': True, 'store': '45.18814,-93.88541', '_version_': 1795237645975027712, 'manu_exact': 'ViewSonic Corp.', 'price_c____l_ns': 27995, 'name_exact': 'ViewSonic VA902B - flat panel display - TFT - 19"'}]}}


### Read excel sheet

In [31]:
# Open the workbook once; the same ExcelFile handle is passed to every
# pd.read_excel call in the cells below.
info = pd.ExcelFile("Info.xlsx")
# Names of the sheets in the workbook. Later cells read index 0 as posts and
# index 1 as comments, then alternate by index parity.
sheet_names = info.sheet_names

## Process data

### Process the first two sheets

In [32]:
# The first sheet is loaded as the posts frame and the second as the comments frame.
post_dataframe = pd.read_excel(info, sheet_names[0])
comment_dataframe = pd.read_excel(info, sheet_names[1])

# The book name is the part of the sheet name before the first "(".
# rstrip() removes any trailing whitespace and, unlike indexing with [-1], does
# not raise IndexError when the sheet name starts with "(".
bk_name_0 = sheet_names[0].split('(')[0].rstrip()
bk_name_1 = sheet_names[1].split('(')[0].rstrip()

# Assigning the plain string lets pandas broadcast it to every row, so no
# same-length Series (and no index alignment) is required.
post_dataframe["book"] = bk_name_0
comment_dataframe["book"] = bk_name_1

### Process the remaining sheets and add them to the first two sheets

In [33]:
# Sheet-name abbreviations that are expanded to the full book title.
BOOK_ALIASES = {"GoT": "Game Of Thrones", "LotR": "Lord of the Rings"}

# Start each list with the frame built from the first two sheets, collect the
# remaining sheets, and concatenate once per type at the end. This avoids
# re-copying the accumulated frame on every iteration.
post_frames = [post_dataframe]
comment_frames = [comment_dataframe]

for i in range(2, len(sheet_names)):
    raw_dataframe = pd.read_excel(info, sheet_names[i])

    # Book name = text before the first "(", without trailing whitespace.
    # rstrip() also copes with a sheet name that starts with "(" (empty name),
    # where indexing with [-1] would raise IndexError.
    bk_name = sheet_names[i].split('(')[0].rstrip()
    bk_name = BOOK_ALIASES.get(bk_name, bk_name)

    # A scalar is broadcast to every row of the sheet.
    raw_dataframe["book"] = bk_name

    # Every even-indexed sheet is a POST sheet; odd-indexed sheets are comments.
    if i % 2 == 0:
        post_frames.append(raw_dataframe)
    else:
        comment_frames.append(raw_dataframe)

# ignore_index=True gives the combined frames a fresh 0..N-1 index instead of
# repeating each sheet's own row labels.
post_dataframe = pd.concat(post_frames, ignore_index=True)
comment_dataframe = pd.concat(comment_frames, ignore_index=True)

### Label the dataframes as either POST or COMMENT

In [34]:
# label all posts in the dataframe as such for filtering when querying
title = pd.Series(["POST"] * len(post_dataframe.index))
post_dataframe["TYPE"] = title
comment_title = pd.Series(["COMMENT"] * len(comment_dataframe.index))
comment_dataframe["TYPE"] = comment_title

### Convert dates to Solr DatePointField format

In [35]:
def convert_time(x):
    """Format a timestamp as a Solr DatePointField string.

    Args:
        x: A datetime-like value exposing ``strftime`` (for example a
            ``pandas.Timestamp`` or ``datetime.datetime``), or a missing value
            (``None`` / ``NaN`` / ``NaT``).

    Returns:
        The value formatted as ``YYYY-MM-DDThh:mm:ssZ``, or ``None`` when ``x``
        is missing, so a single blank cell does not abort a whole
        ``Series.apply`` call.

    Note:
        The trailing ``Z`` is appended literally and no timezone conversion is
        performed, so the input is assumed to already be in UTC.
    """
    # Missing values cannot be formatted: NaT.strftime raises and None has no
    # strftime attribute.
    if pd.isna(x):
        return None
    return x.strftime('%Y-%m-%dT%H:%M:%SZ')

In [36]:
# Replace created_utc in both frames with its Solr-formatted string,
# converting each value through convert_time.
for frame in (post_dataframe, comment_dataframe):
    frame['created_utc'] = frame['created_utc'].apply(convert_time)

### Save to csv

In [37]:
# Write each frame to its CSV file, leaving the row index out.
for frame, csv_path in (
    (post_dataframe, "posts.csv"),
    (comment_dataframe, "comments.csv"),
):
    frame.to_csv(csv_path, index=False)



## Add unique ID to comments

In [40]:
# Load the comments back from disk and show the frame as the cell output.
comment_csv = pd.read_csv(filepath_or_buffer="comments.csv")
comment_csv

Unnamed: 0,post_id,comment_num,comment_text,author,created_utc,book,TYPE
0,1b5u291,1,And all the Pride and Prejudice fans with stun...,nydevon,2024-03-03T22:29:50Z,Pride and Prejudice,COMMENT
1,1b5u291,2,I think the 2005 pride and prejudice is just o...,catyfun19,2024-03-04T01:02:36Z,Pride and Prejudice,COMMENT
2,1b5u291,3,I will just say it took me a minute to underst...,aurynorange5,2024-03-04T02:55:02Z,Pride and Prejudice,COMMENT
3,1b5u291,4,"I feel this way often. I think for me, a lot o...",SwadlingSwine,2024-03-04T01:22:00Z,Pride and Prejudice,COMMENT
4,1b5u291,5,So my sis and I love pride and prejudice and o...,AdInside1346,2024-03-04T06:30:07Z,Pride and Prejudice,COMMENT
...,...,...,...,...,...,...,...
11536,4pvpea,4,The books of Taylor Caldwell. In my childhood...,corathus59,2016-06-26T14:48:52Z,Harry Potter,COMMENT
11537,4pvpea,5,Stephen King is probably the most prolific aut...,Isles86,2016-06-26T20:14:27Z,Harry Potter,COMMENT
11538,4pvpea,6,**The Bible** -- over 6 billion copies are in ...,,2016-06-26T02:49:36Z,Harry Potter,COMMENT
11539,4pvpea,7,"Well, I'd like to start out with that I have n...",60_Percent_Dad,2016-06-26T08:03:00Z,Harry Potter,COMMENT


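### Note: `comment_num` is a manually added column in `comments.csv`
In a spreadsheet editor, insert a new column B named `comment_num` immediately after `post_id` (column A).

Set cell B2 to 1.

Set the formula for cell B3 to: =IF(A3<>A2,1,B2+1)

Drag the fill handle down to copy the formula into the remaining cells of column B, so the counter restarts at 1 whenever `post_id` changes.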

In [41]:
# Number the comments within each post when the CSV does not already carry a
# comment_num column, so this cell does not fail with a KeyError on a freshly
# written comments.csv. When a post's rows are contiguous this yields
# 1, 2, 3, ... restarting at each new post_id. The column is placed right
# after post_id.
if 'comment_num' not in comment_csv.columns:
    comment_csv.insert(
        comment_csv.columns.get_loc('post_id') + 1,
        'comment_num',
        comment_csv.groupby('post_id').cumcount() + 1,
    )

# Build the unique document id as "<post_id>_<comment_num>".
comment_csv['id'] = comment_csv['post_id'].astype(str) + "_" + comment_csv['comment_num'].astype(str)

### Bring column 'id' to the front

In [42]:
# Put 'id' first and keep every other column in its current order.
# Selecting by name (rather than rotating the last column to the front) stays
# correct when 'id' is not the last column, e.g. when a comments.csv that
# already contains 'id' is loaded again.
cols = ['id'] + [c for c in comment_csv.columns if c != 'id']
cols

['id',
 'post_id',
 'comment_num',
 'comment_text',
 'author',
 'created_utc',
 'book',
 'TYPE']

In [43]:
# Reorder the frame's columns to match the list built above.
comment_csv = comment_csv.loc[:, cols]

In [44]:
# Overwrite comments.csv with the id-augmented frame, leaving the row index out.
comment_csv.to_csv(path_or_buf="comments.csv", index=False)