In [37]:
import os
import pandas as pd
from typing import List
from scraper.Comment import Comment
from scraper.RedditJsonLoader import RedditJsonLoader
from scraper.RedditCommentSerializer import RedditCommentSerializer

In [38]:
# Directory locations are taken from the environment so the notebook can be
# re-pointed without editing code:
#   INPUT_PATH  - directory the JSON files are read from (default: 'output')
#   OUTPUT_PATH - directory the CSV files are written to (default: 'comments')
input_path = os.getenv('INPUT_PATH', 'output')
output_path = os.getenv('OUTPUT_PATH', 'comments')

# exist_ok=True makes this a no-op when the directory is already present and
# avoids the race inherent in a separate "exists?" check followed by a create.
os.makedirs(output_path, exist_ok=True)

In [39]:
# Collect the JSON files to convert from the input directory.
# os.listdir returns every entry (sub-directories, hidden files, ...) in
# arbitrary order, so keep only regular files with a .json extension and sort
# them to make the processing order reproducible between runs.
json_files = sorted(
    entry
    for entry in os.listdir(input_path)
    if entry.lower().endswith('.json')
    and os.path.isfile(os.path.join(input_path, entry))
)
csv_path = f'{output_path}/comments.csv'

# Remove the csv file if it already exists, so that a copy left over from a
# previous run cannot end up mixed with the rows written by this run.
if os.path.exists(csv_path):
    os.remove(csv_path)

In [40]:
# The header flag is True only for the first file handed to the serializer;
# every later file is passed header=False.
first_file = True

for json_file in json_files:
    source_file = f'{input_path}/{json_file}'
    loader = RedditJsonLoader()
    comments: List[Comment] = loader.load_comments(path=source_file)
    RedditCommentSerializer().append_to_csv(
        comments=comments,
        path=csv_path,
        header=first_file,
    )
    first_file = False

In [41]:
# Load the combined CSV into a DataFrame and print a summary of its columns,
# non-null counts and dtypes.
df = pd.read_csv(filepath_or_buffer=csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              4487 non-null   object 
 1   author          4487 non-null   object 
 2   body            4487 non-null   object 
 3   created_utc     4487 non-null   float64
 4   score           4487 non-null   int64  
 5   parent_id       4487 non-null   object 
 6   depth           4487 non-null   int64  
 7   ups             4487 non-null   int64  
 8   downs           4487 non-null   int64  
 9   num_reports     0 non-null      float64
 10  report_reasons  0 non-null      float64
dtypes: float64(3), int64(4), object(4)
memory usage: 385.7+ KB


In [42]:
# Sort by score, highest first
df = df.sort_values(by='score', ascending=False)
# Remove the num_reports and report_reasons columns (both show 0 non-null
# values in the recorded df.info() output).
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['num_reports', 'report_reasons'], errors='ignore')
print(f'Dataframe shape: {df.shape}')

# Remove rows whose body is exactly the placeholder text [deleted] or [removed].
# A single isin() mask replaces two sequential filters; rows with a missing
# body are kept, exactly as with the != comparisons.
df = df[~df['body'].isin(['[deleted]', '[removed]'])]
print(f'Dataframe shape: {df.shape}')

Dataframe shape: (4487, 9)
Dataframe shape: (4401, 9)


In [43]:
# Write the cleaned comments to their own CSV file in the output directory.
# NOTE: this rebinds csv_path, so from here on it refers to the cleaned file
# rather than comments.csv.
csv_path = f'{output_path}/cleaned_comments.csv'
df.to_csv(csv_path, index=False)

# Preview the ten highest-scored comments. This has to be the last expression
# of the cell: a notebook only renders the value of the final expression, so
# placing it first silently discarded the preview.
df.head(10)