In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

## 2. Data Processing for Figure 1

In [None]:
# Load the processed question data and keep one row per distinct
# (month, Tags-2) combination.
df_new = pd.read_csv("processed_data.csv")
# Chain drop_duplicates on the column selection so a new frame is produced;
# calling it with inplace=True on a slice of df_new triggers pandas'
# SettingWithCopyWarning.
df_tags = df_new[['month', "Tags-2"]].drop_duplicates()
print(df_tags.shape)
df_tags.head(20)

In [None]:
# Per-month row counts: `questions` counts every row of df_new, while
# `tag_counts` counts the de-duplicated month/tag rows of df_tags.
tag_counts = df_tags.groupby('month').size().to_frame()
questions = df_new.groupby('month').size().to_frame()

# Put both counts side by side, aligned on month, and label the columns.
df_month = pd.concat([tag_counts, questions], axis=1)
df_month.columns = ['tags', 'docs']

# String index labels for the month axis.
df_month.index = df_month.index.astype(str)
df_month.tail(20)

In [None]:
# Write the monthly tag/question counts table to disk
counts_csv = 'python_counts.csv'
df_month.to_csv(counts_csv)

In [None]:
# Create plot of counts over time: questions on the left y-axis (green),
# tags on the right y-axis (blue), sharing one x-axis of month labels.
import matplotlib.patches as patches

# Tick positions and the shaded band are derived from the number of rows
# in df_month instead of assuming the table has exactly 120 months.
n_months = len(df_month)
months_per_tick = 12
band_height = 90000  # patch height in data units of the left (questions) axis

fig, ax1 = plt.subplots(figsize=(12, 5), dpi=80)

ax2 = ax1.twinx()
ax1.plot(df_month.index, df_month['docs'], 'g-', marker='.')
ax2.plot(df_month.index, df_month['tags'], 'b-', marker='.')

ax1.set_xlabel('Year')
ax1.set_ylabel('Nos. of Questions', color='g')
ax2.set_ylabel('Nos. of Tags', color='b')

# One tick every 12 positions plus one on the last position.
# NOTE(review): the second positional argument `[]` is read as `minor` by
# older Matplotlib releases and as `labels` (i.e. hide tick labels) by
# newer ones -- confirm which rendering is intended and pass it by keyword.
ax1.set_xticks(
    np.append(np.arange(0, n_months, months_per_tick), n_months - 1), [])

# Grey band over the last 12 x positions.
ax1.add_patch(
    patches.Rectangle(
        xy=(n_months - months_per_tick, 0),  # point of origin.
        width=months_per_tick, height=band_height, linewidth=1,
        facecolor="grey", fill=True, alpha=0.4))
plt.show()

## 3. Data Processing for Figure 4

In [None]:
# Load both inputs and report their dimensions and column names
df = pd.read_csv("clustered_data.csv")  # 2019 Data with clustering results
df_2 = pd.read_csv('stackexchange_data.csv')  # Data obtained from XML file

for heading, frame in (("2019 Data", df), ("\nAll Data", df_2)):
    print(heading)
    print(frame.shape)
    print(frame.columns)

In [None]:
# Drop columns that hold no data at all, then relabel the remaining
# columns with the original header shifted left by one position.
# NOTE(review): the relabelling only fits if exactly one column was
# dropped -- presumably the CSV header is offset by one; confirm.
shifted_header = df_2.columns[1:]
df_2_v2 = df_2.dropna(axis='columns', how='all').copy()
df_2_v2.columns = shifted_header

In [None]:
# Attach the cluster label to each post: inner join on XML_Line, so only
# rows present in both frames are kept.
cluster_labels = df[['XML_Line', 'Cluster']]
df_final = df_2_v2.merge(cluster_labels, on="XML_Line", how='inner')

In [None]:
# Inspect the size and the first rows of the merged frame
print(df_final.shape)
df_final.head(n=5)

In [None]:
# Discard rows that are exact duplicates, then preview the result
df_final = df_final.drop_duplicates()
df_final.head(n=5)

In [None]:
# Mean of each engagement metric per cluster.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple (`gb['a', 'b']`) is deprecated since pandas 1.0 and is
# rejected by pandas 2.x.
engagement_cols = ['ViewCount', 'AnswerCount', 'CommentCount', 'FavoriteCount']
df_metrics = df_final.groupby('Cluster')[engagement_cols].mean()
df_metrics

In [None]:
# Number of rows per cluster, appended to the mean metrics as 'PostCount'
post_counts = df_final.groupby('Cluster').size().rename('PostCount')
df_metrics2 = pd.concat([df_metrics, post_counts], axis=1)
df_metrics2

In [None]:
# Min-max scale every metric column, then rebuild a labelled frame with
# 'Cluster' moved from the index back into a regular column.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(df_metrics2)
scaled = scaler.transform(df_metrics2)

df_scaled = pd.DataFrame(
    scaled,
    columns=df_metrics2.columns,
    index=df_metrics2.index,
)
df_scaled = df_scaled.reset_index()
df_scaled

In [None]:
# Save to csv
#df_scaled.to_csv('cluster_metrics.csv')

In [None]:
# Create radar plot
from math import pi
def make_spider(row, title, color):
    """Draw one filled radar (spider) chart for a single row of ``df_scaled``.

    The chart is added to the current pyplot figure as a polar subplot.
    Every column of the module-level ``df_scaled`` after the first becomes
    one axis of the radar; the ``'Cluster'`` entry is dropped from the
    plotted values.

    Parameters
    ----------
    row : int
        Index label of the ``df_scaled`` row to plot. It also selects the
        subplot slot (``row + 1``).
    title : str
        Title shown above the subplot.
    color : Matplotlib colour
        Used for the outline, the fill and the title text.
    """
    # One radar axis per metric column (everything after the first column).
    categories = list(df_scaled)[1:]
    N = len(categories)

    # Spread the axes evenly around the circle; repeat the first angle so
    # the outline closes on itself.
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    # Two charts per grid row. The layout stays 2x2 for up to four rows of
    # data and gains grid rows beyond that, instead of failing on the fifth
    # subplot slot.
    n_grid_rows = max(2, (len(df_scaled) + 1) // 2)
    ax = plt.subplot(n_grid_rows, 2, row + 1, polar=True)

    # Put the first axis at the top and go clockwise.
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # One labelled spoke per variable.
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, color='grey', size=8)

    # Radial tick labels; the tick values cover the 0..1 range.
    ax.set_rlabel_position(0)
    ax.set_yticks([0, 0.25, 0.5, 0.75, 1])
    ax.set_yticklabels([0, 0.25, 0.5, 0.75, 1], color="grey", size=7)
    ax.set_ylim(-0.1, 1)

    # Values of the selected row, closed back onto the first point.
    values = df_scaled.loc[row].drop('Cluster').values.flatten().tolist()
    values += values[:1]
    ax.plot(angles, values, color=color, linewidth=2, linestyle='solid')
    ax.fill(angles, values, color=color, alpha=0.4)

    # Add a title
    ax.set_title(title, size=14, color=color, y=1.1)
 
    

In [None]:
# Display radar chart: one polar subplot per row of df_scaled
plt.figure(figsize=(12, 6), dpi=100)
plt.subplots_adjust(hspace=0.5)

# One Set2 colour per row. pyplot.get_cmap is used instead of
# matplotlib.cm.get_cmap, which is deprecated and no longer available in
# recent Matplotlib releases.
n_clusters = len(df_scaled.index)
my_palette = plt.get_cmap("Set2", n_clusters)

for row in range(n_clusters):
    # str() works for any cluster label type, whereas .astype(str) exists
    # only on NumPy scalars.
    make_spider(row=row,
                title='Cluster ' + str(df_scaled['Cluster'][row]),
                color=my_palette(row))