In [None]:
import requests
import pandas as pd
import os
import time
from bs4 import BeautifulSoup
import csv
from datetime import datetime

# Install the following packages first if they are not already available
# !pip install jieba
# !pip install wordcloud
# (collections is part of the Python standard library and needs no installation)

import jieba
import numpy as np
import random
import jieba.analyse as anls
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from matplotlib.colors import LinearSegmentedColormap

import networkx as nx
import matplotlib.ticker as ticker

# Crawl Zhihu travel column content

In [None]:
# Zhihu column slugs to crawl; zhuanlan_names holds the display name at the same index.
zhuanlan_url = [
    'c_114424127',
    'ethanlam',
    'c_134036341',
    'zhoumolvxing',
    'xiyougogo',
    'c_1158068494032707584',
    'c_1199019869133549568',
    'c_1213818120634560512',
    'c_190409180',
    'Ge-Song',
    'lanyanjing',
    'c_1333313310423724032',
    'c_152002793',
    'c_1327315626012880896',
    'c_1365632380191969280',
    'lushuvip',
    'c_176462911',
    'beshan',
    'huangjianhua',
    'c_1277310537517756416',
]
zhuanlan_names = [
    '旅行',
    '一群旅行体验师',
    '旅行记',
    '周末旅行',
    '嬉游-旅行其实很简单',
    '研学旅行',
    '旅行',
    '旅行',
    'in旅行',
    '环球旅行杂谈',
    '世界旅行清单',
    '国内旅行',
    '富龙旅行',
    '旅行',
    '自驾旅行',
    '走进旅行定制师',
    '旅行头条',
    '碧山旅行',
    '旅行世界',
    '旅行',
]
# Index of the column the crawl starts from (also stored as the column number of each record)
zhuanlan = 0
# Browser-like User-Agent sent with every request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}

# Append crawled records to a CSV document
def write_file(records, filename='zhihu_data.csv', encoding='gb18030'):
    """Append ``records`` to a CSV file, writing the header row when the file is new.

    Args:
        records: iterable of dicts with the keys "order", "zhuanlan_name", "title",
            "voteup_count", "comment_count", "url", "created", "updated", "author",
            "author_gender", "author_uid" and "ptype".
        filename: path of the CSV file to append to (defaults to the original
            hard-coded 'zhihu_data.csv').
        encoding: text encoding used for the file (defaults to 'gb18030').

    Prints the number of records saved by this call.
    """
    header = ["column_num","column_name", "title","approval_num", "comments_num","link", "create_date","update_date", "author_name","author_sex", "author_id","type"]
    # Record keys, listed in the same order as the header columns above
    fields = ["order", "zhuanlan_name", "title", "voteup_count", "comment_count", "url", "created", "updated", "author", "author_gender", "author_uid", "ptype"]

    # A missing file and an existing-but-empty file both still need the header row
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    # Mode 'a' creates the file when it does not exist, so no separate 'w' branch is needed
    with open(filename, mode='a', newline='', encoding=encoding) as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerows([item[key] for key in fields] for item in records)
    print("此次共爬取保存{}条数据".format(len(records)))
    
def get_zhihu_json(url, timeout=10):
    """Fetch ``url`` with the module-level ``headers`` and return the decoded JSON body.

    Args:
        url: address to request.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot block the crawl indefinitely.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.HTTPError: when the server answers with an error status.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail at the request rather than later on a missing key in an error payload
    res.raise_for_status()
    return res.json()

# Crawl every column from index `zhuanlan` onwards and append its items to the CSV.
# `zhuanlan` is advanced only after a column has been written, so after an
# interruption re-running this cell resumes with the column that failed.
for group in zhuanlan_url[zhuanlan:]:
    zhuanlan_name = zhuanlan_names[zhuanlan]   # Column name
    records = []  # rows collected for the current column
    limit = 100   # Each request fetches 100 pieces of data
    base_url = 'https://www.zhihu.com/api/v4/columns/' + group + '/items?'

    # The first request is only used to read the total number of entries in the column
    json_data = get_zhihu_json(base_url)
    data_totals = json_data["paging"]["totals"]
    print("begin crawl the zhuanlan of "+zhuanlan_name+" : which have " +str(data_totals)+" articles.")
    # Ceiling division: no extra empty page when the total is an exact multiple of `limit`
    pages = -(-data_totals // limit)

    for page in range(pages):  # Paginate for crawling
        next_url = base_url + 'limit=' + str(limit) + f'&offset={page * limit}'
        json_data = get_zhihu_json(next_url)
        for item in json_data["data"]:
            ptype = item["type"]
            if ptype == "article":
                title = item["title"]
                item_url = item["url"]
                created = item["created"]      # creation time
                updated = item["updated"]      # last updated time
            elif ptype == "answer":
                title = item["question"]["title"]
                item_url = "/"                 # no link is recorded for answers
                created = item["created_time"]  # creation time
                updated = item["updated_time"]  # last updated time
            else:
                # Any other item type has no known field layout here; skip it
                # instead of saving a row built from the previous item's values.
                continue
            author = item["author"]
            records.append({
                "order": zhuanlan,
                "zhuanlan_name": zhuanlan_name,
                "title": title,
                "voteup_count": item["voteup_count"],    # Number of endorsements
                "comment_count": item["comment_count"],  # Number of comments
                "url": item_url,
                "created": created,
                "updated": updated,
                "author": author["name"],
                "author_gender": author["gender"],
                "author_uid": author["uid"],
                "ptype": ptype,
            })
        time.sleep(2)  # Control the crawling speed to prevent being blocked by the site
    write_file(records)
    zhuanlan += 1
    time.sleep(10)  # Control the crawling speed to prevent being blocked by the site

# Data preprocessing

In [None]:
# Read the file generated by the crawler and convert its raw values in place.
df = pd.read_csv('zhihu_data.csv', encoding='gb18030')

# The crawler writes English headers (create_date, update_date, author_sex), so
# those are the columns to convert. Conversion only applies while the dates are
# still numeric epoch seconds: re-running this cell on an already converted file
# would otherwise fail on the date strings and blank out the mapped gender labels.
if pd.api.types.is_numeric_dtype(df['create_date']):
    # Converts numeric dates to day/month/year strings
    for date_col in ('create_date', 'update_date'):
        df[date_col] = pd.to_datetime(df[date_col], unit='s').dt.strftime('%d/%m/%Y')
    # Gender codes other than 1 and 0 become NaN
    df['author_sex'] = df['author_sex'].map({1: '男', 0: '女'})

    # Write the processed data back to the zhihu_data.csv file
    df.to_csv('zhihu_data.csv', index=False, encoding='gb18030')

# Keyword extraction and visualization

In [None]:
def read_data(filename, encoding='gb18030'):
    """Load the processed CSV and group article titles by column name and by year.

    Args:
        filename: path of the CSV file; it must contain the columns
            column_name, title, create_date and author_name.
        encoding: text encoding of the file; defaults to 'gb18030', the
            encoding the file is written with elsewhere in this notebook.

    Returns:
        A tuple ``(res_col, res_year, columns, lenCol, authors, year)``:
        res_col maps 'cate{i}' to the titles belonging to ``columns[i]``, the last
        entry (label 'all') holding every title; res_year maps each year string to
        the titles created in that year; columns is the array of unique column
        names followed by 'all'; lenCol is ``len(columns)``; authors and year are
        the unique author names and year strings.
    """
    data = pd.read_csv(filename, sep=',', header=0, encoding=encoding)
    # Work on a copy of the needed columns so the assignment below is unambiguous
    data = data[['column_name', 'title', 'create_date', 'author_name']].copy()

    columns = np.append(data['column_name'].unique(), 'all')
    lenCol = len(columns)

    # Dates are stored as dd/mm/YYYY; keep only the year part
    data['create_date'] = data['create_date'].astype(str).str.split('/').str[-1]
    year = data['create_date'].unique()

    authors = data['author_name'].unique()

    # Group by column *name* so that res_col['cate{i}'] really belongs to columns[i];
    # several crawled columns share one display name, so column_num cannot be
    # used to index the unique-name array.
    res_col = {}
    for i, name in enumerate(columns[:-1]):
        res_col[f'cate{i}'] = data.loc[data['column_name'] == name, 'title'].values
    # The trailing 'all' label covers every title
    res_col[f'cate{lenCol - 1}'] = data['title'].values

    res_year = {y: data.loc[data['create_date'] == y, 'title'].values for y in year}

    return res_col,res_year,columns,lenCol,authors,year

def seg_text(data,stopwords):
    """Segment ``data`` with ``jieba.cut`` and return the tokens not found in ``stopwords``."""
    kept = []
    for token in jieba.cut(data):
        if token in stopwords:
            continue  # filtered stopword
        kept.append(token)
    return kept

def extract_keywords(data):
    """Join the words in ``data`` with spaces and return jieba's top-5 (keyword, weight) tags."""
    joined = ' '.join(data)
    return jieba.analyse.extract_tags(joined, topK=5, withWeight=True)

def reduplication(lists):
    """Merge several (keyword, weight) lists into one normalized weight table.

    Args:
        lists: iterable of keyword lists, each an iterable of (keyword, weight) pairs.

    Returns:
        dict mapping each keyword to its summed weight divided by the largest
        summed weight. An empty dict is returned when there are no keywords at
        all, and the unnormalized sums are returned when the largest sum is 0.
    """
    all_keywords = {}
    for keywords in lists:
        for keyword, weight in keywords:
            all_keywords[keyword] = all_keywords.get(keyword, 0) + weight

    # Nothing to normalize: max() of an empty sequence would raise ValueError
    if not all_keywords:
        return all_keywords

    # normalize the weights
    max_weight = max(all_keywords.values())
    if max_weight == 0:
        return all_keywords
    return {keyword: weight / max_weight for keyword, weight in all_keywords.items()}

def ranking_keywords(kw_list, top_n=15):
    """Return the ``top_n`` highest-weighted keywords as a dict ordered by descending weight.

    Args:
        kw_list: dict mapping keyword to weight.
        top_n: number of entries to keep (15 by default, the previously fixed value).

    Returns:
        dict with at most ``top_n`` items; keywords with equal weight keep their
        original relative order.
    """
    ranked = sorted(kw_list.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:top_n])

def generate_wordcloud(name, lists):
    """Render a word cloud for ``lists`` and save it as '{name}_wordcloud.png'.

    Args:
        name: label used for the plot title and the output file name.
        lists: dict mapping keyword to weight.

    The 14 highest-weighted keywords are drawn in '#265073'; every other word
    gets a random colour from a small palette. Nothing is drawn or saved when
    ``lists`` is empty.
    """
    # NOTE(review): WordCloud is expected to reject an empty frequency table, so skip early
    if not lists:
        return

    color_list = ['#61A3BA','#D2DE32','#A2C579']
    sorted_lists = sorted(lists.items(), key=lambda x: x[1], reverse=True)
    # Built once here instead of on every colour callback
    highlighted = {word for word, _ in sorted_lists[:14]}

    def pick_color(*args, **kwargs):
        """Colour callback for recolor(); reads the word from the 'word' keyword argument."""
        return '#265073' if kwargs['word'] in highlighted else random.choice(color_list)

    wordcloud = WordCloud(font_path='simhei.ttf', width=800, height=400, background_color='white')
    wordcloud.generate_from_frequencies(dict(sorted_lists))
    wordcloud.recolor(color_func=pick_color)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(name, fontproperties='SimSun',size=20)

    plt.savefig(f'{name}_wordcloud.png', bbox_inches='tight')
    plt.close()
    
def add_stop_words(stopwords,wd_list):
    """Return the stop words extended with the jieba tokens of every entry in ``wd_list``.

    Args:
        stopwords: iterable of existing stop words; it is not modified.
        wd_list: iterable of entries to segment with ``jieba.lcut``. Entries that
            are None or NaN are skipped; other non-string entries are converted
            with ``str`` first.

    Returns:
        A de-duplicated list of stop words (order is not defined).
    """
    merged = set(stopwords)
    for wd in wd_list:
        # `wd != wd` is only true for NaN (e.g. a missing author name)
        if wd is None or wd != wd:
            continue
        merged.update(jieba.lcut(str(wd)))
    return list(merged)

def generate_ranking_histogram(name,keyword):
    """Plot the top-ranked keywords of ``keyword`` as a bar chart saved to '{name}_ranking.png'."""
    ranked = ranking_keywords(keyword)
    labels = ranked.keys()
    heights = ranked.values()

    plt.figure(figsize=(8, 6))
    plt.bar(labels, heights)
    plt.title(name, fontproperties='SimSun', size=20)
    plt.xlabel("Keyword")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45, fontproperties='SimSun', size=15)
    plt.tight_layout()

    output_path = f'{name}_ranking.png'
    plt.savefig(output_path, bbox_inches='tight')
    plt.close()

def generate_word_frequency(name,seg):
    """Count the words across all token lists in ``seg``, print the 15 most
    common and save them as a bar chart in '{name}_frequency.png'."""
    tally = Counter()
    for sublist in seg:
        tally.update(sublist)
    word_counts = tally.most_common(15)
    print(name,':',word_counts)

    words = [pair[0] for pair in word_counts]
    counts = [pair[1] for pair in word_counts]

    plt.figure(figsize=(8, 6))
    plt.bar(words, counts)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title(name, fontproperties='SimSun', size=20)
    plt.xticks(rotation=45, fontproperties='SimSun', size=15)

    output_path = f'{name}_frequency.png'
    plt.savefig(output_path, bbox_inches='tight')
    plt.close()

def _analyse_titles(label, titles, stopwords):
    """Segment ``titles``, save the frequency / word cloud / ranking charts for one
    group and return its merged keyword weights ({} when no keyword was found)."""
    segmented = [seg_text(str(title), stopwords) for title in titles]
    generate_word_frequency(label, segmented)
    keyword_lists = [extract_keywords(words) for words in segmented]
    # Skip the keyword charts for a group without any keyword
    if not any(keyword_lists):
        return {}
    keyword = reduplication(keyword_lists)
    generate_wordcloud(label, keyword)
    generate_ranking_histogram(label, keyword)
    return keyword


def main():
    """Run the keyword analysis per column and per year.

    Reads 'zhihu_data.csv' and 'stop_words.txt' from the working directory and
    saves the charts produced by the generate_* helpers.

    Returns:
        A tuple ``(all_kwds_col, all_kwds_year)`` with the keyword weights
        accumulated over all columns and over all years respectively.
    """
    # Plain relative path: the previous '.\\'-prefixed form only resolves on Windows
    res_col,res_year,columns,lenCol,authors,year = read_data('zhihu_data.csv')

    # initialize stopwords
    # source from https://github.com/elephantnose/characters
    # The Chinese stop word list, the HIT stop word list, the Baidu stop word list
    # and the Sichuan University Machine Intelligence Laboratory stop word database
    # were merged and de-duplicated
    with open('stop_words.txt', encoding='UTF-8') as stop_file:
        stopwords = [line.strip() for line in stop_file]
    stopwords = add_stop_words(stopwords,authors)
    stopwords = add_stop_words(stopwords,columns)

    # These accumulators were previously used without ever being defined
    all_kwds_col = {}
    all_kwds_year = {}

    for j in range(lenCol):
        all_kwds_col.update(_analyse_titles(columns[j], res_col[f'cate{j}'], stopwords))

    stopwords = add_stop_words(stopwords,year)
    for k in year:
        all_kwds_year.update(_analyse_titles(k, res_year[k], stopwords))

    return all_kwds_col, all_kwds_year

all_kwds_col, all_kwds_year = main()

# Data visualization

In [None]:
import warnings

# Use a font with Chinese glyphs and keep the minus sign renderable with it
plt.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
# Silence every warning for the remaining cells
warnings.filterwarnings("ignore")

In [None]:
# Load the processed crawl results. A plain relative file name is used so the
# cell works on every platform (the previous '.\\' prefix only resolves on Windows).
data = pd.read_csv('zhihu_data.csv', encoding='gb18030')
data

In [None]:
# Boolean mask of the missing values in every column
data.isnull()

In [None]:
# Show the rows that remain once every row containing a NaN is removed.
# NOTE(review): the result is only displayed, not assigned - `data` itself keeps its NaN rows.
data.dropna(axis=0, how='any')

In [None]:
# Rows whose approval_num is negative
data.loc[data["approval_num"].lt(0)]

In [None]:
# Rows whose comments_num is negative
data.loc[data["comments_num"].lt(0)]

In [None]:
# Flag every row that repeats an earlier row (the first occurrence is not flagged)
data.duplicated(keep='first')

### 1. Average approval_num and comments_num for different columns —— Bar Chart

In [None]:
# Group by column and average the approval_num and comments_num.
# The columns are selected with a list: selecting several columns from a GroupBy
# with a bare tuple of names is rejected by recent pandas versions.
grouped = data.groupby('column_name')[['approval_num', 'comments_num']].mean()


def _mean_bar_chart(metric, color):
    """Draw one labelled bar chart of the per-column mean of ``metric``; return (fig, ax, bars)."""
    fig, ax = plt.subplots(figsize=(15, 8))
    bars = ax.bar(grouped.index, grouped[metric], color=color)
    ax.set_xlabel('column_name',fontsize=16)
    ax.set_ylabel(metric,fontsize=16)
    ax.set_title(f'Average {metric} by column',fontsize=16)

    # Add the value above each bar
    for rect in bars:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2, height, f'{height:.2f}', ha='center', va='bottom')

    # Rotate the horizontal axis labels so the column names do not overlap
    ax.tick_params(axis='x', labelrotation=45)
    # Applied per figure so that both charts are laid out
    fig.tight_layout()
    return fig, ax, bars


# Bar chart of the average approval_num
fig, ax1, bar1 = _mean_bar_chart('approval_num', 'skyblue')

# Bar chart of the average comments_num
fig, ax2, bar2 = _mean_bar_chart('comments_num', 'lightgreen')

plt.show()

### 2. Total approval_num and comments_num for different columns —— Double vertical axis bar chart

In [None]:
# Calculate the sum of approval_num and comments_num based on the column_name.
# The columns are selected with a list: selecting several columns from a GroupBy
# with a bare tuple of names is rejected by recent pandas versions.
column_data = data.groupby('column_name')[['approval_num', 'comments_num']].sum()
positions = range(len(column_data))

# Create a chart with two vertical axes
fig, ax1 = plt.subplots(figsize=(15, 8))
ax2 = ax1.twinx()

# Plot a bar chart of the total approval_num (first vertical axis)
bar1 = ax1.bar(positions, column_data['approval_num'], color='skyblue')
ax1.set_xlabel('column_name',fontsize=16)
ax1.set_ylabel('sum of total approval_num',fontsize=16)

# Plot a line chart of the total comments_num (second vertical axis)
line1, = ax2.plot(positions, column_data['comments_num'], color='lightgreen', marker='o')
ax2.set_ylabel('sum of total comments_num',fontsize=16)

# Set the x-axis tick positions and labels
ax1.set_xticks(positions)
ax1.set_xticklabels(column_data.index, rotation=45)

# Add labels to the bar chart
for rect in bar1:
    height = rect.get_height()
    ax1.text(rect.get_x() + rect.get_width() / 2, height, f'{height}', ha='center', va='bottom')

# Add labels to the line chart
for x, y in zip(positions, column_data['comments_num']):
    ax2.text(x, y, f'{y}', ha='center', va='bottom')

# Set chart title
plt.title('Total approval_num and comments_num in different column_names',fontsize=16)

plt.tight_layout()
plt.show()

### 3. The ten authors with the most approval_num —— horizontal bar chart

In [None]:
# Total approval_num per author. This table was previously used without being
# defined anywhere in the notebook, so the cell failed on a fresh kernel.
author_approval = data.groupby('author_name', as_index=False)['approval_num'].sum()

# Sort in descending order by total approval_num
top_10_authors = author_approval.sort_values('approval_num', ascending=False).head(10)

# Create a horizontal bar chart
fig, ax = plt.subplots(figsize=(15, 8))

# Draw the bars in reversed order so the author with the most approvals ends up at the top
bars = ax.barh(top_10_authors['author_name'][::-1], top_10_authors['approval_num'][::-1], color='orange')

# Add text labels to each bar
for bar in bars:
    width = bar.get_width()
    ax.text(width, bar.get_y() + bar.get_height() / 2, f'{int(width)}', ha='left', va='center')

# Set horizontal and vertical axis labels
ax.set_xlabel('total approval_num', fontsize=16)
ax.set_ylabel('author_name', fontsize=16)

# Set chart title
plt.title('Top ten authors with the most approval_num', fontsize=16)

plt.tight_layout()
plt.show()

### 4. The ten authors with the most comments_num —— horizontal bar chart

In [None]:
# Total comments_num per author. This table was previously used without being
# defined anywhere in the notebook, so the cell failed on a fresh kernel.
author_comments = data.groupby('author_name', as_index=False)['comments_num'].sum()

# Sort in descending order by total comments_num
top_10_authors = author_comments.sort_values('comments_num', ascending=False).head(10)

# Create a horizontal bar chart
fig, ax = plt.subplots(figsize=(15, 8))

# Draw the bars in reversed order so the author with the most comments ends up at the top
bars = ax.barh(top_10_authors['author_name'][::-1], top_10_authors['comments_num'][::-1], color='pink')

# Add text labels to each bar
for bar in bars:
    width = bar.get_width()
    ax.text(width, bar.get_y() + bar.get_height() / 2, f'{int(width)}', ha='left', va='center')

# Set horizontal and vertical axis labels
ax.set_xlabel('total comments_num', fontsize=16)
ax.set_ylabel('author_name', fontsize=16)

# Set chart title
plt.title('Top ten authors with the most comments_num', fontsize=16)

plt.tight_layout()
plt.show()

### 5. Gender ratio of top ten authors with most approval_num —— Pie Chart

In [None]:
# For every column_name keep the ten rows with the largest approval_num.
# NOTE(review): the selection is per row, so one author can be counted more than once.
rows_by_column = data.groupby('column_name')
top_10_authors = rows_by_column.apply(lambda frame: frame.nlargest(10, 'approval_num'))

# Count how often each author_sex value occurs among the selected rows
gender_counts = top_10_authors['author_sex'].value_counts()

# Pie chart of the gender shares
fig, ax = plt.subplots()
ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Gender ratio of top ten authors with most approval_num')

plt.show()

### 6. Gender ratio of top ten authors with most comments_num —— Pie Chart

In [None]:
# For every column_name keep the ten rows with the largest comments_num.
# NOTE(review): the selection is per row, so one author can be counted more than once.
rows_by_column = data.groupby('column_name')
top_10_authors = rows_by_column.apply(lambda frame: frame.nlargest(10, 'comments_num'))

# Count how often each author_sex value occurs among the selected rows
gender_counts = top_10_authors['author_sex'].value_counts()

# Custom color list for the pie wedges
colors = ['lightgreen', 'lightcoral']

# Pie chart of the gender shares
fig, ax = plt.subplots()
ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
)
ax.set_title('Gender ratio of top ten authors with most comments_num')
plt.show()

### 7. Analyze changes in the approval_num and comments_num over time —— Area Chart

In [None]:
%matplotlib notebook
# Convert create_date and update_date to date type
data['create_date'] = pd.to_datetime(data['create_date'])
data['update_date'] = pd.to_datetime(data['update_date'])

# Accurate date to month
data['create_month'] = data['create_date'].dt.to_period('M').astype(str)
data['update_month'] = data['update_date'].dt.to_period('M').astype(str)

# Analyze the approval_num and comments_num based on create_date
approval_by_create_month = data.groupby('create_month')['approval_num'].sum()
comments_by_create_month = data.groupby('create_month')['comments_num'].sum()

# Analyze the approval_num and comments_num based on update_date
approval_by_update_date = data.groupby('update_month')['approval_num'].sum()
comments_by_update_date = data.groupby('update_month')['comments_num'].sum()

# Create line and area charts
fig, ax = plt.subplots(2, 1, figsize=(10, 8))

# Plot line and area charts of approval_num as a function of create_date
ax[0].plot(approval_by_create_month.index, approval_by_create_month, color='skyblue', linewidth=2, label='approval_num')
ax[0].fill_between(approval_by_create_month.index, approval_by_create_month, color='skyblue', alpha=0.2)
ax[0].set_xlabel('create_month')
ax[0].set_ylabel('approval_num')
ax[0].set_title('approval_num changes according to create_month')
ax[0].legend()

# Plot line and area charts of comments_num as a function of create_date
ax[1].plot(comments_by_create_month.index, comments_by_create_month, color='lightgreen', linewidth=2, label='comments_num')
ax[1].fill_between(comments_by_create_month.index, comments_by_create_month, color='lightgreen', alpha=0.2)
ax[1].set_xlabel('create_month')
ax[1].set_ylabel('comments_num')
ax[1].set_title('comments_num changes according to create_month')
ax[1].legend()

# Set horizontal axis label spacing
for axes in ax:
    axes.set_xticks(axes.get_xticks()[::6])  # Display every 6 labels

# Adjust the spacing between subimages
plt.tight_layout()

# Show chart
plt.show()

### 8. Author activity analysis —— Lollipop Chart

In [None]:
# Count the number of publications by each author
author_counts = data['author_name'].value_counts()

# Sort authors by number of publications
sorted_authors = author_counts.sort_values(ascending=False)

# Extract the top 10 most active authors and the corresponding number of publications
top_authors = sorted_authors.head(10)
top_author_names = top_authors.index
top_author_counts = top_authors.values

# Create a Lollipop chart
fig, ax = plt.subplots(figsize=(10, 6))

# Draw vertical lines
ax.vlines(x=top_author_names, ymin=0, ymax=top_author_counts, color='firebrick', alpha=0.7, linewidth=2)

# Draw dots
ax.scatter(x=top_author_names, y=top_author_counts, color='firebrick', s=75, alpha=0.7)

# Set horizontal axis labels and titles
ax.set_xlabel('author_name')
ax.set_ylabel('the number of publications')
ax.set_title('Top 10 Authors by the number of publications')

# Rotate horizontal axis labels
plt.xticks(rotation=45)

# Vertical axis from 0 to slightly above the largest count. The earlier
# invert_yaxis() call was dropped: these explicit ascending limits reset the
# axis orientation immediately afterwards, so it had no visible effect.
ax.set_ylim([0, top_author_counts.max() * 1.1])

# Hide right and top borders
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# Show data labels
for author, count in zip(top_author_names, top_author_counts):
    ax.text(author, count, str(count), ha='center', va='bottom')

# Show chart
plt.show()