# Web-Scripe Project for Chinese Joke Blog

## Part 1 Load requests and BeautifulSoup Package

In [42]:
import requests                
from bs4 import BeautifulSoup  
import time                    
import random                  
import pandas as pd
import numpy as np

## Part 2 Define get_urls function which will acquire the each detail page link and get_info which will acquire target information from each detail page

In [43]:
def get_urls(url):                   # Acquire urls of all detail page for each navigation page
    urls = []
    # Send Internet Request
    response = requests.get(url=url)
    # Create BeautifulSoup Object and parse the HTML
    soup = BeautifulSoup(response.text, features="lxml")
    h2_all=soup.find_all(name='h2')      # Acquire h2 tage for each page 
    for h in h2_all:                     # For loop all h2 tag
        # Acquire url of each h2 tage and append it into the list
        urls.append(h.find('a')['href'])
    return urls                           # Return urls of all detail page for each navigation page

In [44]:
def get_info(url):              # Acquire required information of each detail page
    response = requests.get(url=url)
    soup = BeautifulSoup(response.text, features="lxml")
    title = soup.find(class_='article-title').find('a').get_text() # Acquire title
    spans = soup.find(class_='article-meta').find_all(name='span')
    date = spans[0].get_text()       # Acquire Date
    category = spans[1].get_text()       # Acquire Category
    read = spans[2].get_text()       # Acquire Read
    comment = spans[3].get_text()    # Acquire Comment
    content = soup.find(class_='article-content').get_text() # Acquire Content
    every_row=pd.DataFrame({"Date":[date],"Category":[category],"Reading":[read],"Comments":[comment],"Content":[content]})
    return every_row

## Part 3 Extract Information and Generate Dataframe

In [45]:
navigative_urls=['https://duanzixing.com/page/{}/'.format(i) for i in range(1,10)]

In [46]:
individual_columns=[]
for i in range(len(navigative_urls)):
    detail_urls=get_urls(navigative_urls[i])
    for url in detail_urls:
        individual_columns.append(get_info(url))
        time.sleep(random.randint(1,3))

In [47]:
results=pd.concat(individual_columns).reset_index(drop=True)

In [48]:
print(results.head())

         Date Category  Reading Comments  \
0  2022-06-08    分类：段子   阅读(57)    评论(0)   
1  2022-06-01    分类：段子  阅读(242)    评论(0)   
2  2022-05-26    分类：段子  阅读(249)    评论(0)   
3  2022-05-26    分类：段子  阅读(278)    评论(0)   
4  2022-05-25    分类：段子  阅读(202)    评论(0)   

                                             Content  
0  \n小明考上了飞行员，但最近挺不开心的。虽然刚刚进入飞行训练阶段，但每次试飞教练都不让他参加...  
1     \n我：表哥，如果你女神突然像你表白，你会有什么反应？表哥：孩子生下来吧，算我的！我：……   
2  \n生活的现状是：想过八戒般的生活，但只有沙僧的本事，却承受着悟空般的压力，还时不时听到唐僧...  
3  \n我有一个女性朋友，长得蛮漂亮，身材也好，还有点小闷骚。。。。前段时间有两个男生同时追她，...  
4  \n在公共卫生间上完厕所，照了照镜子，忍不住赞叹自己：tmd，我长得真帅......然后听见...  


## Part 4 Export Dataframe to CSV

In [49]:
import os
print(os.path.abspath(",")) 

C:\Users\Apple\Desktop\,


In [50]:
results.to_csv("Jokes.csv",encoding="utf_8_sig")