In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

**Data research**

You can explore another daset simply change the readable file

**Research for top year subreddit**

**Clean data**

In [3]:
# Read the CSV file and create a DataFrame
df = pd.read_csv('top_all_subreddit.csv')


# Clean 'upvotes' and 'comments'
df['upvotes'] = df['upvotes'].str.replace('k', '000').astype(float)
df['comments'] = pd.to_numeric(df['comments'].str.split(' ').str[0], errors='coerce')
df = df.dropna(subset=['comments'])

**Count of posts for each subreddit**

In [4]:
# Group the data by subreddit and count the number of posts for each subreddit
subreddit_count = df.groupby('subreddit').size().reset_index(name='count')

# Sort the data by count in descending order
subreddit_count = subreddit_count.sort_values(by='count', ascending=False)

# Create an interactive bar plot of the count for each subreddit
fig = px.bar(subreddit_count, x='subreddit', y='count', title='Count of Posts for Each Subreddit',
             color='count', color_continuous_scale='Blues')
fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.8)
fig.update_layout(xaxis_tickangle=-45, xaxis_title='', yaxis_title='Count', 
                  title_font_size=24, title_x=0.5, title_y=0.95, 
                  font=dict(size=14, color='black'))
fig.update_layout(showlegend=False)
#fig.update_xaxes(showticklabels=False)
fig.show()

**Correlation between upvotes and comments**

In [5]:
# Calculate the correlation between upvotes and comments
corr_upvotes_comments = df['upvotes'].corr(df['comments'])

# Create a scatter plot of upvotes vs comments with a trendline
fig = px.scatter(df, x='upvotes', y='comments', trendline='ols', labels={'x': 'Number of Upvotes', 'y': 'Number of Comments'})
fig.show()

print('Correlation between upvotes and comments:', corr_upvotes_comments)

Correlation between upvotes and comments: 0.23755678243751652


**Popularity of subreddit**

We multiply the median of votes and the median of comments by the number of posts in the subreddit (I don't know if this is the right way)

In [6]:
# # calculate the median number of comments and upvotes for each post
popularity = df.groupby('subreddit').agg(upvotes_count=('upvotes', 'count'), 
                                                        comments_count=('comments', 'count'), 
                                                        post_count=('subreddit', 'count')).reset_index()

# Calculate popularity by multiplying post count with median upvotes and comments
popularity['popularity'] = popularity['upvotes_count'] * 0.25 + popularity['comments_count'] * 0.25 + popularity['comments_count'] * 0.25

# Sort the data by popularity in descending ord
popularity = popularity.sort_values(by='popularity', ascending=False)

# create a bar chart using Plotly
fig = px.bar(popularity, x='subreddit', y='popularity', title='Popularity of subreddit',
             color='popularity', color_continuous_scale='Blues')
fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.8)
fig.update_layout(xaxis_tickangle=-45, xaxis_title='', yaxis_title='Popularity', 
                  title_font_size=24, title_x=0.5, title_y=0.95, 
                  font=dict(size=14, color='black'))
fig.update_layout(showlegend=False)

fig.show()

In [7]:
# Групуємо за subreddit і обчислюємо суми коментарів та голосів
grouped_data = df.groupby('subreddit').agg({'comments': 'sum', 'upvotes': 'sum'}).reset_index()

# Сортуємо графік за сумою голосів
fig_upvotes = px.bar(grouped_data.sort_values(by='upvotes', ascending=False), x='subreddit', y='upvotes',
                      title='Total Upvotes by Subreddit', labels={'upvotes': 'Upvotes'}, color='subreddit',
                      height=400)

# Сортуємо графік за сумою коментарів
fig_comments = px.bar(grouped_data.sort_values(by='comments', ascending=False), x='subreddit', y='comments',
                      title='Total Comments by Subreddit', labels={'comments': 'Comments'}, color='subreddit',
                      height=400)

# Показуємо графіки
fig_upvotes.show()
fig_comments.show()