## Downloading, reading, and exporting the contents to CSV

Download the data from https://cseweb.ucsd.edu/~jmcauley/datasets.html#steam_data,
unzip the files, and move the resulting JSON files into this folder. Do not rename them.

Only the first code cell (which writes `games.csv`) needs to be run in order to use the [final notebook](../notebooks/report/final_notebook.ipynb).

In [None]:
import ast

# Each line of steam_games.json is parsed as a Python literal (presumably a
# dict per game -- the file is not strict JSON, which is why json.loads is not
# used here). ast.literal_eval only accepts literals, so unlike eval() it can
# never execute code embedded in the downloaded file.
with open('steam_games.json', 'r') as f:
    # skip blank lines so a trailing newline does not abort the whole load
    games = [ast.literal_eval(line) for line in f if line.strip()]

# one row per parsed record; the CSV is what the final notebook consumes
df_games = pd.DataFrame(games)
df_games.to_csv('games.csv')

## Cleaning, Subsampling, Aggregating, and Exporting Review Text

    Please note: running the cells below is not required, as subsample_agg_reviews.p is already included in the repo. These cells will also take some time to run.

In [None]:
import ast

# Each line of steam_reviews.json is parsed as a Python literal (presumably a
# dict per review -- the file is not strict JSON, which is why json.loads is
# not used here). ast.literal_eval only accepts literals, so unlike eval() it
# can never execute code embedded in the downloaded file.
with open('steam_reviews.json', 'r') as f:
    # skip blank lines so a trailing newline does not abort the whole load
    reviews = [ast.literal_eval(line) for line in f if line.strip()]

# one row per parsed record; also persisted so the parse need not be repeated
df_reviews = pd.DataFrame(reviews)
df_reviews.to_csv('reviews.csv')

In [None]:
# drop unnecessary columns
# 'Unnamed: 0' only exists when df_reviews has been reloaded from reviews.csv
# (it is the index column written by to_csv). A frame built in memory by the
# loading cell does not have it, so missing labels are ignored rather than
# raising KeyError.
unused_columns = ['found_funny', 'compensation', 'user_id', 'Unnamed: 0', 'products',
                  'page_order', 'date', 'early_access', 'page']
df_reviews.drop(columns=unused_columns, errors='ignore', inplace=True)

# create a frequency column based on product_id, sort by said column
# (computed before any rows are removed, so it counts every review of a game)
df_reviews['freq'] = df_reviews.groupby('product_id')['product_id'].transform('count')
df_reviews.sort_values(by=['freq', 'product_id'], ascending=[False, True], inplace=True)

# remove null values
df_reviews.dropna(inplace=True)

# keep only reviews by users with at least 1 hour played, for games that have
# at least 500 total reviews; .copy() makes the result an independent frame so
# the column assignment below does not trigger a chained-assignment warning
enough_playtime = df_reviews['hours'] >= 1
enough_reviews = df_reviews['freq'] >= 500
df_reviews = df_reviews[enough_playtime & enough_reviews].copy()

# convert product_id to strings because Doc2Vec needs strings as Tags
df_reviews['product_id'] = df_reviews['product_id'].astype(str)

In [None]:
# take subsample of data for text manipulation/modeling purposes
# - capped at the number of available rows so sampling cannot fail when fewer
#   than 250000 reviews survive the cleaning step
# - seeded so that re-running the cell yields the same subsample
sample_size = min(250000, len(df_reviews))
df_sample = df_reviews.sample(n=sample_size, random_state=42)
df_sample.sort_values(by=['freq', 'product_id'], ascending=[False, True], inplace=True)

# make lowercase
df_sample['text'] = df_sample['text'].str.lower()

# remove new line indicators
# (a follow-up replacement of '.\n' used to sit here, but it could never match
# once every '\n' had already been replaced, so it has been removed)
df_sample['text'] = df_sample['text'].str.replace('\n', ' ', regex=False)

# tokenize text
df_sample['tokens'] = df_sample['text'].apply(nltk.word_tokenize)

# join tokens into single string
# NOTE(review): the separator is ', ', so a comma is inserted between every
# pair of tokens -- confirm this is what the downstream model expects
df_sample['clean_text'] = df_sample['tokens'].apply(', '.join)

In [None]:
# create product list of the unique product ids (in order of first appearance)
products = list(df_sample['product_id'].unique())

# build a dictionary where each key is a unique product id and the value is the
# aggregated text of all of that product's reviews.
# A single groupby pass replaces re-filtering the whole frame once per product;
# sort=False keeps the keys in order of first appearance (the same order as
# `products`), and rows within a group keep their frame order.
# Every review is prefixed with one space, so each value starts with ' '.
product_dict = {
    product_id: ' ' + ' '.join(texts)
    for product_id, texts in df_sample.groupby('product_id', sort=False)['clean_text']
}

# export the aggregated text to a pickled file
with open('subsample_agg_reviews.p', 'wb') as fp:
    pickle.dump(product_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)