In [1]:
import requests
import pandas as pd

### Example URL request

In [2]:
# Solr `select` request against the `tech_products` core: search for
# "viewsonic" with OR as the default operator and at most 20 rows.
url = (
    "http://localhost:8983/solr/tech_products/select"
    "?indent=true&q.op=OR&q=viewsonic&rows=20&useParams="
)

In [3]:
# Send the query to Solr. `requests` has no default timeout, so without one an
# unresponsive server would block this cell (and the kernel) indefinitely.
response = requests.get(url, timeout=10)
# Fail here on HTTP 4xx/5xx rather than later while handling the body.
response.raise_for_status()

In [4]:
# Decode the JSON body of the HTTP response into Python objects and print it
# so the structure of the reply can be inspected.
response_json = response.json()
print(response_json)

{'responseHeader': {'status': 0, 'QTime': 0, 'params': {'q': 'viewsonic', 'indent': 'true', 'q.op': 'OR', 'rows': '20', 'useParams': ''}}, 'response': {'numFound': 1, 'start': 0, 'numFoundExact': True, 'docs': [{'id': 'VA902B', 'name': 'ViewSonic VA902B - flat panel display - TFT - 19"', 'manu': 'ViewSonic Corp.', 'manu_id_s': 'viewsonic', 'cat': ['electronics and stuff2'], 'features': ['19" TFT active matrix LCD, 8ms response time, 1280 x 1024 native resolution'], 'weight': 190.4, 'price': 279.95, 'price_c': '279.95,USD', 'popularity': 6, 'inStock': True, 'store': '45.18814,-93.88541', '_version_': 1795237645975027712, 'manu_exact': 'ViewSonic Corp.', 'price_c____l_ns': 27995, 'name_exact': 'ViewSonic VA902B - flat panel display - TFT - 19"'}]}}


### Read the Excel workbook

In [4]:
# Open the workbook "Info.xlsx" and collect the names of its sheets.
# NOTE(review): `info` is never closed in this cell — confirm whether any
# later cell still needs the open handle before wrapping this in `with`.
info = pd.ExcelFile("Info.xlsx")
sheet_names = info.sheet_names

### Process the workbook sheets (alternating post and comment sheets)

In [30]:
# Notebook-level counter keyed by post_id. AddCommentID and AddCommentNum read
# and increment it on every call, so it must be reset to {} before each pass.
post_id_dictionary = {}

In [43]:


def convert_time(x):
    """Format a timestamp the way Solr date fields expect it.

    Parameters
    ----------
    x : str or datetime-like
        Either a string in ``'%Y-%m-%d %H:%M:%S'`` form or an object that
        already provides ``strftime`` (``pd.Timestamp``, ``datetime``).
        Missing values (``None``, ``NaN``, ``NaT``) are accepted.

    Returns
    -------
    str or None
        ``'YYYY-MM-DDTHH:MM:SSZ'``, or ``None`` when ``x`` is missing.
        The value is formatted as-is; no timezone conversion is performed,
        the trailing ``Z`` is appended literally.

    Raises
    ------
    ValueError
        If ``x`` is a string that does not match the expected format.
    """
    # Empty spreadsheet cells arrive as NaN/NaT; previously they crashed the
    # whole column conversion (float has no strftime, NaT refuses strftime).
    if pd.isna(x):
        return None

    timestamp = x
    if isinstance(x, str):
        timestamp = pd.to_datetime(x, format='%Y-%m-%d %H:%M:%S')
    return timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')

def ProcessSheetName(sheet_name):
    """Derive the book name from an Excel sheet name.

    The book name is the text before the first ``'('`` with trailing
    whitespace removed. The abbreviations ``GoT`` and ``LotR`` are expanded
    to their full titles; every other name is returned unchanged.

    Parameters
    ----------
    sheet_name : str
        Sheet name, e.g. ``"GoT (posts)"``.

    Returns
    -------
    str
        The book name; an empty string if nothing precedes ``'('``.
    """
    full_titles = {
        "GoT": "Game Of Thrones",
        "LotR": "Lord of the Rings",
    }
    # rstrip() rather than inspecting bk_name[-1]: indexing raised IndexError
    # when the prefix was empty and removed only a single trailing space.
    bk_name = sheet_name.split('(')[0].rstrip()
    return full_titles.get(bk_name, bk_name)

def AddIDforComments(df):
    """Return a copy of a comments dataframe with an ``id`` column in front.

    The id is ``"<post_id>_<comment_num>"``.

    Note: the ``comment_num`` column must already exist. The original
    workflow added it by hand in the spreadsheet:
      * set cell B2 to 1
      * set the formula for cell B3 to ``=IF(A3<>A2,1,B2+1)``
      * drag and fill the remaining cells in the column

    Parameters
    ----------
    df : pandas.DataFrame
        Comments dataframe with ``post_id`` and ``comment_num`` columns.

    Returns
    -------
    pandas.DataFrame
        New dataframe with ``id`` as the first column followed by the other
        columns in their original order. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``post_id`` or ``comment_num`` is missing.
    """
    comment_ids = df['post_id'].astype(str) + "_" + df['comment_num'].astype(str)

    # assign() builds a new frame, so the caller's dataframe is left untouched.
    with_id = df.assign(id=comment_ids)

    # Select 'id' by name instead of assuming it is the last column: when the
    # input already had an 'id' column, the overwritten column keeps its old
    # position and moving "the last column" would promote the wrong one.
    ordered_cols = ['id'] + [col for col in with_id.columns if col != 'id']
    return with_id[ordered_cols]

def ProcessData(info_name):
    """Load one Excel workbook into a posts dataframe and a comments dataframe.

    Sheets alternate by position: even-indexed sheets (0, 2, ...) hold posts
    and odd-indexed sheets (1, 3, ...) hold comments. Every row gets:

      * ``book``  - the book name derived from its sheet name
      * ``TYPE``  - ``"POST"`` or ``"COMMENT"``, for filtering when querying
      * ``created_utc`` rewritten into the date format required by Solr

    Parameters
    ----------
    info_name : str
        Path of the Excel workbook.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(post_dataframe, comment_dataframe)``, each with a fresh 0..n-1
        index.

    Raises
    ------
    IndexError
        If the workbook has fewer than two sheets (one post sheet and one
        comment sheet are required).
    KeyError
        If a ``created_utc`` column is missing.
    """
    post_frames = []
    comment_frames = []

    # The context manager closes the workbook handle once all sheets are read.
    with pd.ExcelFile(info_name) as info:
        sheet_names = info.sheet_names
        if len(sheet_names) < 2:
            raise IndexError(
                "workbook needs at least one post sheet and one comment sheet"
            )

        for position, sheet_name in enumerate(sheet_names):
            sheet_frame = pd.read_excel(info, sheet_name)
            # A scalar is broadcast to every row; no helper Series needed.
            sheet_frame["book"] = ProcessSheetName(sheet_name)

            # Every even-indexed sheet is a POST sheet
            if position % 2 == 0:
                post_frames.append(sheet_frame)
            else:
                comment_frames.append(sheet_frame)

    # Concatenate once instead of growing the frames inside the loop, and
    # renumber the rows so the result has no duplicate index labels.
    post_dataframe = pd.concat(post_frames, ignore_index=True)
    comment_dataframe = pd.concat(comment_frames, ignore_index=True)

    # label rows in the dataframe as POST or COMMENT for filtering when querying
    post_dataframe["TYPE"] = "POST"
    comment_dataframe["TYPE"] = "COMMENT"

    # Convert time into pdate format as required by Solr
    post_dataframe["created_utc"] = post_dataframe["created_utc"].apply(convert_time)
    comment_dataframe["created_utc"] = comment_dataframe["created_utc"].apply(convert_time)

    return post_dataframe, comment_dataframe


def AddCommentID(row, counts=None):
    """Return the next ``"<post_id>_<n>"`` id for a comment row.

    ``n`` is how many rows with the same ``post_id`` have been seen so far
    (including this one) according to ``counts``, which is updated in place.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"post_id"`` (e.g. a dataframe row).
    counts : dict, optional
        Running count per post id. Defaults to the notebook-level
        ``post_id_dictionary``.

    Returns
    -------
    str
    """
    if counts is None:
        # Looked up at call time, so rebinding `post_id_dictionary = {}`
        # between passes still resets the numbering.
        counts = post_id_dictionary

    post_id = row["post_id"]
    counts[post_id] = counts.get(post_id, 0) + 1

    # str() so numeric post ids (e.g. an all-digit column parsed as integers)
    # do not raise TypeError on concatenation.
    return str(post_id) + "_" + str(counts[post_id])
        
def AddCommentNum(row, counts=None):
    """Return the 1-based running number of a comment within its post.

    The number is how many rows with the same ``post_id`` have been seen so
    far (including this one) according to ``counts``, which is updated in
    place.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"post_id"`` (e.g. a dataframe row).
    counts : dict, optional
        Running count per post id. Defaults to the notebook-level
        ``post_id_dictionary``.

    Returns
    -------
    int
    """
    if counts is None:
        # Looked up at call time, so rebinding `post_id_dictionary = {}`
        # between passes still resets the numbering.
        counts = post_id_dictionary

    post_id = row["post_id"]
    counts[post_id] = counts.get(post_id, 0) + 1
    return counts[post_id]

In [28]:
data_dir = "./Book Data/"
excel_names = [
    "Action Books.xlsx",
    "Comedy Books.xlsx",
    "Fantasy Books.xlsx",
    "Horror Books.xlsx",
    "Mystery Books.xlsx",
]

# Convert every workbook into "<Genre>_post.csv" and "<Genre>_comment.csv" in
# the working directory, and remember the comment file stems for later cells.
comment_names = []
for excel_name in excel_names:
    # First word of the workbook name, e.g. "Action Books.xlsx" -> "Action".
    genre = excel_name.split()[0]

    post_df, comment_df = ProcessData(data_dir + excel_name)
    post_df.to_csv(genre + "_post.csv", index=False)
    comment_df.to_csv(genre + "_comment.csv", index=False)
    comment_names.append(genre + "_comment")

In [44]:
# For every exported comment CSV: number the comments of each post, build the
# document id "<post_id>_<n>", and write "<name>_process.csv".
for comment_name in comment_names:
    comment_df = pd.read_csv(comment_name + ".csv")

    # 1-based position of each comment among the rows sharing its post_id, in
    # file order. This replaces two row-by-row passes that depended on a
    # global counter being reset by hand before each pass.
    comment_num = comment_df.groupby("post_id").cumcount() + 1
    # astype(str) keeps this working when post ids were parsed as numbers.
    comment_ids = comment_df["post_id"].astype(str) + "_" + comment_num.astype(str)

    # Column layout: id first, then the original columns, comment_num last.
    # 'id' is placed by name so a pre-existing 'id' column cannot cause a
    # different column to be moved to the front.
    comment_df = comment_df.assign(id=comment_ids)
    ordered_cols = ["id"] + [col for col in comment_df.columns if col != "id"]
    comment_df = comment_df[ordered_cols].assign(comment_num=comment_num)

    comment_df.to_csv(comment_name + "_process.csv", index=False)