In [1]:
# функция, которая собирает суп страницы
from selenium import webdriver
from bs4 import BeautifulSoup

def get_soup(url):
    """Render ``url`` in a fresh Chrome session and return the page as a soup.

    Selenium is used instead of a plain HTTP request because the site builds
    its markup dynamically.

    Args:
        url: Address of the page to load.

    Returns:
        A ``BeautifulSoup`` object parsed with ``html.parser`` from the
        browser's ``page_source``.

    Raises:
        Whatever Selenium raises while starting the browser or loading the
        page; the browser is still closed in that case.
    """
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.get(url)
        page_source = driver.page_source
    finally:
        # Always shut the browser down: this function is called once per page
        # in long loops, so a failed navigation must not leave Chrome running.
        driver.quit()
    return BeautifulSoup(page_source, "html.parser")
    
# soup = get_soup("https://www.crowdfunder.co.uk/search/projects?page=2&completed=pending&isLive=true&map=off")


Мы используем селениум для сбора ссылок, т.к. на сайте динамическая вёрстка.

In [2]:
# функция, которая собирает ссылки на проекты, представленные на странице
def get_hrefs(soup, limit=12):
    """Extract link, title and description of the projects on a listing page.

    Args:
        soup: Parsed listing page; every project is an
            ``<article class="cf-pod">`` element.
        limit: Maximum number of projects to read from the page. Defaults to
            12, the number the original code always read. Pass ``None`` to
            read every project found.

    Returns:
        A list of dicts with the keys ``'href'``, ``'title'`` and
        ``'description'``, in page order. Pages with fewer than ``limit``
        projects (for example the last page) yield a shorter list instead of
        raising ``IndexError``.

    Raises:
        AttributeError: If a project card lacks the link, title or
            description element.
    """
    title_attrs = {'class': 'cf-text cf-text--header cf-text--spaceQtr cf-text--break-word'}
    desc_attrs = {'class': 'cf-text cf-text--fixed14 cf-text--light cf-text--thin cf-text--break-word'}

    main_data = []
    # Slicing tolerates short pages; indexing 0..11 directly did not.
    for item in soup.find_all('article', {'class': 'cf-pod'})[:limit]:
        main_data.append({
            'href': item.a.get('href'),
            'title': item.find('h3', title_attrs).text,
            'description': item.find('p', desc_attrs).text,
        })
    return main_data


# get_hrefs(soup)

Собираем все текущие проекты с сайта.

In [92]:

from tqdm.notebook import tqdm
import time

MAIN_URL = 'https://www.crowdfunder.co.uk/search/projects?'

# Walk listing pages 1..83 of the currently running ("pending") projects.
data = []   # project dicts returned by get_hrefs, across all pages
fails = []  # URLs of listing pages that could not be scraped
for p in tqdm(range(1, 84)):
    url = MAIN_URL + f'page={p}&completed=pending&isLive=true&map=off'
    try:
        tree = get_soup(url)
        projects = get_hrefs(tree)
        data.extend(projects)
        time.sleep(0.1)
    except Exception:
        # Best effort: remember the page and continue. `Exception` (not a bare
        # `except`) keeps Ctrl+C / kernel interrupt able to stop the loop.
        fails.append(url)

  0%|          | 0/83 [00:00<?, ?it/s]

Вручную собираем страницу, которая не спарсилась изначально.

In [6]:
import time
# Re-scrape listing page 3 by hand; it did not parse in the main loop.
tree_3 = get_soup('https://www.crowdfunder.co.uk/search/projects?page=3&completed=pending&isLive=true&map=off')
projects_3 = get_hrefs(tree_3)
new_3 = [*projects_3]  # independent copy of the projects found on the page
time.sleep(0.1)

In [140]:
len(new_3)

12

In [4]:
import pandas as pd
# Load the DataFrame with all currently running projects.
# NOTE(review): presumably pickled from the listing scrape above; the cell that
# wrote 'df_live_store.pkl' is not present in this notebook - confirm.
df_live_rest = pd.read_pickle('df_live_store.pkl')

In [141]:
df_3 = pd.DataFrame(new_3)
df_3.head() 

Unnamed: 0,href,title,description
0,https://www.crowdfunder.co.uk/beccsbigbuild,Beccs Big Build,The Beccs Big Build project aims to build new ...
1,https://www.crowdfunder.co.uk/bacup-cc-accessi...,Bacup CC Accessible Access & Outdoor Space,To improve the accessibility to the club and t...
2,https://www.crowdfunder.co.uk/accessible-ev-ch...,Accessible EV Chargepoints for Vulnerable Drivers,To develop an EV charging unit that delivers a...
3,https://www.crowdfunder.co.uk/lcc-fix-our-nets,Liverpool Cricket Club - FIX OUR NETS!,We want to fix the outdoor nets at Liverpool C...
4,https://www.crowdfunder.co.uk/himmah-ramadan-p...,Himmah Ramadan Poverty Appeal,The Prophet Muhammad (peace be upon him) said:...


In [142]:
concatenated_3 = pd.concat([df_live_rest, df_3])
df_bigger = concatenated_3.reset_index(drop = True)
df_bigger.head()

Unnamed: 0,href,title,description
0,https://www.crowdfunder.co.uk/free-assange,Help campaign to Free Julian Assange,Help campaign to Free Julian Assange and Stop ...
1,https://www.crowdfunder.co.uk/blackout2023,Black Out 2023 | Cannes Lions Festival,Taking Black talent connected to the creative ...
2,https://www.crowdfunder.co.uk/50-days-to-make-...,50 Days to Make a Difference,Our scientists curate a portfolio of effective...
3,https://www.crowdfunder.co.uk/saveside,#SAVESIDE,"As of 9th April 2023, Side Gallery will close ..."
4,https://www.crowdfunder.co.uk/lets-smash-the-p...,Let's smash the political silence on Brexit,"Everyone knows Brexit isn't working, but polit..."


Есть пара проектов, запущенных 10-12 раз, это выбросы.

In [10]:
df_bigger.nunique()

href           993
title          981
description    966
dtype: int64

Дропнем повторяющиеся проекты.

In [11]:
df_bigger.drop_duplicates(subset=['title'], keep='last', inplace = True) 

Одинаковое описание у разных частей одного большого проекта, оставим.

In [12]:
df_bigger[df_bigger['description'].duplicated()]

Unnamed: 0,href,title,description
295,https://www.crowdfunder.co.uk/world-record-bow...,WORLD RECORD BOWLS MATCH MARATHON - 62 HOURS,To engage more people by improving accessibili...
594,https://www.crowdfunder.co.uk/katie-1,Katie's Gatwick to Paris Cycle Fundraiser,"On the 8th of June 2023, I will cycle 300km fr..."
615,https://www.crowdfunder.co.uk/smallwood,The Smallwood Primary School Fundraiser,To support the mental and physical health of t...
639,https://www.crowdfunder.co.uk/the-who-tour-tic...,The Who UK Tour Tickets,We are running our prize draws to raise funds ...
692,https://www.crowdfunder.co.uk/london-to-paris-4,London Gatwick to Paris Charity Bike Ride,"On the 8th of June, a team of 60 riders will s..."
695,https://www.crowdfunder.co.uk/webheath,The Webheath Academy Primary School Fundraiser,To support the mental and physical health of t...
740,https://www.crowdfunder.co.uk/denby,The Denby CofE First School Fundraiser,To support the mental and physical health of t...
747,https://www.crowdfunder.co.uk/oughtrington-1,Oughtrington Community Primary School Fundraiser,To support the mental and physical health of t...
761,https://www.crowdfunder.co.uk/lawford,The Lawford CofE Primary School Fundraiser,To support the mental and physical health of t...
785,https://www.crowdfunder.co.uk/glenfrome,The Glenfrome Primary School Fundraiser,To support the mental and physical health of t...


In [13]:
df_bigger.nunique()

href           981
title          981
description    960
dtype: int64

In [143]:
df_live = df_bigger.reset_index(drop = True)
df_live['status'] = 'funding'
df_live.head() # финальный датафрейм с текущими проектами

Unnamed: 0,href,title,description,status
0,https://www.crowdfunder.co.uk/free-assange,Help campaign to Free Julian Assange,Help campaign to Free Julian Assange and Stop ...,funding
1,https://www.crowdfunder.co.uk/blackout2023,Black Out 2023 | Cannes Lions Festival,Taking Black talent connected to the creative ...,funding
2,https://www.crowdfunder.co.uk/50-days-to-make-...,50 Days to Make a Difference,Our scientists curate a portfolio of effective...,funding
3,https://www.crowdfunder.co.uk/saveside,#SAVESIDE,"As of 9th April 2023, Side Gallery will close ...",funding
4,https://www.crowdfunder.co.uk/lets-smash-the-p...,Let's smash the political silence on Brexit,"Everyone knows Brexit isn't working, but polit...",funding


Парсинг страниц текущих проектов завершён.

Парсим завершённые проекты.

In [141]:

from tqdm.notebook import tqdm
import time

MAIN_URL = 'https://www.crowdfunder.co.uk/search/projects?'

# Walk listing pages 1..83 of the successfully finished projects.
data_succ = []   # project dicts returned by get_hrefs, across all pages
fails_succ = []  # URLs of listing pages that could not be scraped
for p in tqdm(range(1, 84)):
    url = MAIN_URL + f'page={p}&completed=successful&isLive=false&map=off'
    try:
        tree = get_soup(url)
        projects = get_hrefs(tree)
        data_succ.extend(projects)
        time.sleep(0.1)
    except Exception:
        # Best effort: remember the page and continue. `Exception` (not a bare
        # `except`) keeps Ctrl+C / kernel interrupt able to stop the loop.
        fails_succ.append(url)

  0%|          | 0/83 [00:00<?, ?it/s]

Подгружаем датафрейм с завершёнными проектами, данные собраны выше.

In [15]:
df_succ_rest = pd.read_pickle('df_succ_store.pkl')

Вручную добавим страницу, которая не спарсилась.

In [16]:

# Re-scrape listing page 18 of the successful projects by hand.
tree = get_soup('https://www.crowdfunder.co.uk/search/projects?page=18&completed=successful&isLive=false&map=off')
projects = get_hrefs(tree)
new_18 = [*projects]  # independent copy of the projects found on the page
time.sleep(0.1)

In [145]:
import pandas as pd

df_18 = pd.DataFrame(new_18)
df_18.head()

Unnamed: 0,href,title,description
0,https://www.crowdfunder.co.uk/level-up-proport...,Proportional Representation - It’s Now or Never,"Despite Boris Johnson’s best efforts, we’ve ne..."
1,https://www.crowdfunder.co.uk/hookpod-technolo...,Hookpod; technology to save seabirds and turtles,We work to make longline fishing safe for mari...
2,https://www.crowdfunder.co.uk/bringbackbrighto...,#SaveOurTheatres - Bring Back Brighton Dome,We’re part of the #SaveOurTheatres initiative ...
3,https://www.crowdfunder.co.uk/backthebike,University of Surrey - Back the Bike!,Let's pledge to bring a Pool Bike Scheme to th...
4,https://www.crowdfunder.co.uk/support-dornoch-...,Dornoch Castle needs you #Covid19 support fund,Covid19 lockdown has wrecked this industry and...


In [18]:
concatenated_18 = pd.concat([df_succ_rest, df_18])
df_bigger2 = concatenated_18.reset_index(drop = True)
df_fin = df_bigger2

In [19]:
df_fin.nunique()

href           996
title          994
description    982
dtype: int64

In [20]:
df_fin.drop_duplicates(subset=['title'], keep='last', inplace = True) # дропнем повторы

In [21]:
df_fin.nunique()

href           994
title          994
description    980
dtype: int64

 Одинаковое описание у разных частей одного большого проекта, оставим.

In [22]:
df_fin[df_fin['description'].duplicated()]

Unnamed: 0,href,title,description
273,https://www.crowdfunder.co.uk/food4heroes-midl...,Food4Heroes Midlands - Feed The NHS,"Help feed our NHS frontline staff, your donati..."
343,https://www.crowdfunder.co.uk/food4heroes-york...,Food4Heroes Yorkshire - Help Us Feed NHS Staff,"Help feed our NHS frontline staff, your donati..."
462,https://www.crowdfunder.co.uk/save-leatherhead...,Save Leatherhead Theatre,Please join me and make a difference? Every do...
569,https://www.crowdfunder.co.uk/cheltenhamtownvm...,Cheltenham Town v Manchester City - FA Cup,Crowdfunder is waiving its platform fees for c...
623,https://www.crowdfunder.co.uk/save-the-finborough,#SaveOurTheatres - Finborough Theatre,We’re part of a national initiative launched b...
665,https://www.crowdfunder.co.uk/thepostbar-saveo...,#SaveOurVenues - The Post Bar Tottenham,We're one of the music venues identified by MV...
674,https://www.crowdfunder.co.uk/saveboom,BOOM LEEDS needs your support right now! #Save...,We're one of the music venues identified by MV...
717,https://www.crowdfunder.co.uk/venue38,#SaveOurVenues - Venue38,We're one of the music venues identified by MV...
757,https://www.crowdfunder.co.uk/voluntary-action...,Voluntary Action Angus Give Local - Give Angus,Please join me and make a difference? Every do...
799,https://www.crowdfunder.co.uk/food4heroes-sout...,Food4Heroes South West - Help Us Feed NHS Staff,"Help feed our NHS frontline staff, your donati..."


In [146]:
df_fin = df_fin.reset_index(drop = True)
df_fin['status'] = 'success'
df_fin.head() # финальный датафрейм с завершёнными проектами

Unnamed: 0,href,title,description,status
0,https://www.crowdfunder.co.uk/unicef,Help UNICEF deliver vital COVID-19 vaccines,"When humanity pulls together, nothing can stop...",success
1,https://www.crowdfunder.co.uk/rbs-fundraiser,NatWest Group,RBS partnering with National Emergencies Trust...,success
2,https://www.crowdfunder.co.uk/unicef-aus,Help UNICEF Australia deliver 2 billion vaccines,"With vaccines rolling out across Australia, we...",success
3,https://www.crowdfunder.co.uk/glenwyvis-distil...,GlenWyvis Distillery,The Worlds first Community owned Scotch Malt W...,success
4,https://www.crowdfunder.co.uk/own-our-venues,Own Our Venues,An exciting opportunity for the live music com...,success


Парсинг страниц успешных проектов завершён.

In [24]:
concatenated = pd.concat([df_live, df_fin])
df1 = concatenated.reset_index(drop = True)
df1 # совмещаем датафреймы

Unnamed: 0,href,title,description,status
0,https://www.crowdfunder.co.uk/free-assange,Help campaign to Free Julian Assange,Help campaign to Free Julian Assange and Stop ...,funding
1,https://www.crowdfunder.co.uk/blackout2023,Black Out 2023 | Cannes Lions Festival,Taking Black talent connected to the creative ...,funding
2,https://www.crowdfunder.co.uk/50-days-to-make-...,50 Days to Make a Difference,Our scientists curate a portfolio of effective...,funding
3,https://www.crowdfunder.co.uk/saveside,#SAVESIDE,"As of 9th April 2023, Side Gallery will close ...",funding
4,https://www.crowdfunder.co.uk/lets-smash-the-p...,Let's smash the political silence on Brexit,"Everyone knows Brexit isn't working, but polit...",funding
...,...,...,...,...
1970,https://www.crowdfunder.co.uk/cosmickitchen,Cosmic Kitchen - Plymouth Climate Challenge,"Back Cosmic Kitchen founders, Gabriela and Luc...",success
1971,https://www.crowdfunder.co.uk/win-mark-knopfle...,Win Mark Knopfler's Signed Stratocaster Guitar,"In aid of Stagehand's #ILoveLive campaign, Mar...",success
1972,https://www.crowdfunder.co.uk/trusselltrust,Trussell Trust food banks need your support,"With your help, we can support food banks duri...",success
1973,https://www.crowdfunder.co.uk/sustaining-creat...,Sustaining Creativity Fund,Thanks to Spotify and your donations we can su...,success


Напишем несколько функций, которые собирают инфу с индивидуальной страницы проекта.

In [38]:
df1.to_pickle('df1_main.pkl') # сохраняем датафрейм в пикл

In [149]:
links = df1['href'].values
links = links.tolist() # все собранные ссылки
links[0:5] 

['https://www.crowdfunder.co.uk/free-assange',
 'https://www.crowdfunder.co.uk/blackout2023',
 'https://www.crowdfunder.co.uk/50-days-to-make-a-difference',
 'https://www.crowdfunder.co.uk/saveside',
 'https://www.crowdfunder.co.uk/lets-smash-the-political-silence-on-brexit']

In [30]:
mysoup = get_soup('https://www.crowdfunder.co.uk/p/re-open-p-franco')

In [20]:
# функция собирает контент проекта
def get_content(soup):
    """Return the ``<section class="cf-site__content">`` element of a project page.

    The result is whatever ``soup.find`` returns, so it is ``None`` when the
    page has no such section.
    """
    section_attrs = {'class': 'cf-site__content'}
    return soup.find('section', section_attrs)

# content = get_content(mysoup)
# content

In [85]:
content_fin = get_content(get_soup('https://www.crowdfunder.co.uk/p/stockton-swim-scheme-cost-of-living-help'))
# content_fin

In [27]:
# собираем локацию
def get_loc(content):
    """Return the project location from the page heading.

    The heading text is split on ``'\\xa0in\\xa0'`` (the word "in" between
    non-breaking spaces) and the part after the first separator is returned.

    Args:
        content: Project page content as returned by ``get_content``.

    Returns:
        The location string, or ``'null'`` when the heading is missing or
        does not contain the separator.
    """
    try:
        loc = content.find('h2', {'class': 'cf-text cf-text--body cf-text--dark cf-text--l2c'}).text
        return loc.split('\xa0in\xa0')[1]
    except (AttributeError, IndexError):
        # AttributeError: heading (or content) is missing.
        # IndexError: heading exists but has no "in <place>" part. Previously
        # this escaped and made the caller discard the whole project.
        return 'null'

In [59]:
print(get_loc(content))

Bridport, Dorset, United Kingdom


Функции для текущих проектов.

In [32]:
# собираем денежную цель, если это текущий проект
goal = content.find('span', {'class': 'cf-text--dark cf-text--thick'}).text
goal_int = int(goal[1:].replace(",",""))
goal_int

30000

In [61]:
# собираем уровень достижения цели, если это текущий проект
rate = content.find('span', {'class': 'cf-text cf-text--fixed14 cf-text--dark cf-text--thick'}).text
mult = int(rate[:-1]) / 100
mult

1.32

In [33]:
# в случае текущего проекта собираем цель и уровень готовности, в случае завершённого проекта собираем собранные деньги
#и длительность проекта
def get_money_rate(content):
    """Read the funding figures from a project page.

    Two page layouts are tried in order:

    1. Running project: the goal (``'£30,000'``) and the completion rate
       (``'132%'``) are read from two ``<span>`` elements.
    2. Finished project: a single ``<span class="cf-text cf-text--thin">``
       holds a sentence from which the raised amount (after ``'£'``) and the
       number after ``'in '`` are extracted.

    Args:
        content: Project page content as returned by ``get_content``.

    Returns:
        A tuple ``(money, mult, days)``:

        * running project  -> ``(goal as int, rate / 100, 'null')``
        * finished project -> ``(raised as int, 'null', days as int)``
        * neither parses   -> ``('null', 'null', 'null')``
    """
    try:
        goal = content.find('span', {'class': 'cf-text--dark cf-text--thick'}).text
        rate = content.find('span', {'class': 'cf-text cf-text--fixed14 cf-text--dark cf-text--thick'}).text
        money = int(goal[1:].replace(",", ""))  # drop the currency sign and thousands separators
        mult = int(rate[:-1]) / 100             # drop the trailing '%'
        days = 'null'
    except (AttributeError, ValueError):
        # AttributeError: a span is missing. ValueError: a span exists but its
        # text is not a number. Either way, try the finished-project layout
        # instead of letting the error discard the whole project.
        try:
            report = content.find('span', {'class': 'cf-text cf-text--thin'}).text
            report1 = report.split('£')[1].split(' with')
            money = int(report1[0].replace(",", ""))
            days = int(((report1[1].split('in '))[1].split())[0])
            mult = 'null'
        except Exception:
            # Nothing usable on the page; `Exception` rather than a bare
            # `except` so that a kernel interrupt is not swallowed.
            money = mult = days = 'null'
    return money, mult, days

In [66]:
get_money_rate(content_fin)

(20093, 'null', 42)

In [101]:
# собираем теги
def get_tags(content):
    """Return the category tags of a project.

    Args:
        content: Project page content as returned by ``get_content``.

    Returns:
        A list with the text of every tag link inside the
        ``<div data-tip-box="bookmark">`` element, or the string ``'null'``
        when the tags cannot be read (some pages have no tags).
    """
    try:
        tags = content.find('div', {'data-tip-box': 'bookmark'})
        mytags = tags.find_all('a', {'class': 'cf-text cf-text--fixed14 cf-text--light'})
        return [item.text for item in mytags]
    except Exception:
        # Pages without tags are expected. `Exception` rather than a bare
        # `except` so that a kernel interrupt is not swallowed.
        return 'null'

In [86]:
get_tags(content_fin)

['Community', 'Sports']

In [96]:
# def get_fb_shares(content):
#     return content.find('div', {'data-layout': 'center-left'}) - почему-то не собирает шэры на фейсбуке

In [35]:
# собираем показатели силы сообщества
def get_community(content):
    """Collect the community counters shown in the project's navigation bar.

    Returns a dict with the keys ``'upd'`` (updates), ``'comm'`` (comments)
    and ``'supp'`` (investors if that counter exists, otherwise supporters).
    All values are the counters' text, not numbers. ``AttributeError`` is
    raised when the navigation bar or a required counter is missing.
    """
    nav = content.find('ul', {'class': 'cf-nav__list cf-horizontal-nav__elem'})

    def counter(label):
        # Look up the <span> whose aria-label names the counter.
        return nav.find('span', {'aria-label': label})

    updates = counter('Number of updates').text
    comments = counter('Number of comments').text
    backers = counter('Number of investors')
    if backers is None:
        # The page has no investor counter; it shows supporters instead.
        backers = counter('Number of supporters')
    return {'upd': updates, 'comm': comments, 'supp': backers.text}

In [158]:
get_community(content)

{'upd': '5', 'comm': '425', 'supp': '916'}

In [36]:
# србираем кол-во вознаграждений
def get_rewards_num(content):
    """Count the reward cards (``<article data-well="reward">``) on a project page.

    Returns the count as an int, or ``'null'`` when ``content`` cannot be
    searched (``find_all`` raises ``AttributeError``).
    """
    try:
        return len(content.find_all('article', {'data-well': 'reward'}))
    except AttributeError:
        return 'null'

In [103]:
get_rewards_num(content)

1

In [178]:
# сэмпл 3 рандомных ссылок из базы
from random import sample
babylinks = sample(links, 3)
babylinks

['https://www.crowdfunder.co.uk/safe-passage-legal',
 'https://www.crowdfunder.co.uk/greenandstone',
 'https://www.crowdfunder.co.uk/bromsgroverugby']

Проходимся по ссылкам из сэмпла и собираем инфу.

In [179]:

from tqdm.notebook import tqdm

# Trial run of the detail scraper on the small random sample.
hrefs_data = []     # one feature dict per successfully scraped project
failed_links = []   # links whose page could not be scraped
for link in tqdm(babylinks):
    try:
        soup = get_soup(link)
        content = get_content(soup)
        money, mult, days = get_money_rate(content)
        features = {
            'href': link,
            'location': get_loc(content),
            'money': money,
            'rate': mult,
            'days': days,
            'tags': get_tags(content),
            'rewards_num': get_rewards_num(content),
            **get_community(content),  # adds 'upd', 'comm', 'supp'
        }
        hrefs_data.append(features)
    except Exception:
        # Best effort: keep the link for a later retry. `Exception` (not a
        # bare `except`) keeps Ctrl+C / kernel interrupt able to stop the loop.
        failed_links.append(link)

  0%|          | 0/3 [00:00<?, ?it/s]

Проходимся по ссылкам из всего датафрейма и собираем инфу.

In [185]:

from tqdm.notebook import tqdm

# Full run of the detail scraper over every collected project link.
hrefs_details = []  # one feature dict per successfully scraped project
failed_links = []   # links whose page could not be scraped
for link in tqdm(links):
    try:
        soup = get_soup(link)
        content = get_content(soup)
        money, mult, days = get_money_rate(content)
        features = {
            'href': link,
            'location': get_loc(content),
            'money': money,
            'rate': mult,
            'days': days,
            'tags': get_tags(content),
            'rewards_num': get_rewards_num(content),
            **get_community(content),  # adds 'upd', 'comm', 'supp'
        }
        hrefs_details.append(features)
    except Exception:
        # Best effort: keep the link for a later retry. `Exception` (not a
        # bare `except`) keeps Ctrl+C / kernel interrupt able to stop the loop.
        failed_links.append(link)

  0%|          | 0/1976 [00:00<?, ?it/s]

In [187]:
len(hrefs_details) # успешные 1899 ссылок из 1975, отлично

1899

In [None]:
df_details = pd.DataFrame(hrefs_details)

In [192]:
df_details.to_pickle('df_details_store.pkl') # сохраняем датафрейм в пикл

In [195]:
# Save the DataFrame as a CSV table without a trailing line break.
# NOTE(review): absolute, machine-specific path; consider a configurable directory.
file_out = r'C:/Users/anfey/Desktop/df_details.csv'
df_details.to_csv(file_out)
with open(file_out, 'rb') as csv_in:
    file_data = csv_in.read()
with open(file_out, 'wb') as csv_out:
    # Strip only the final line terminator. The original cut the last two
    # bytes blindly, which is right for '\r\n' but removes a data byte when
    # the terminator is a single '\n'.
    csv_out.write(file_data.rstrip(b'\r\n'))

273016

In [154]:
df2 = pd.read_pickle('df_details_store.pkl')
df2.head()

Unnamed: 0,href,location,money,rate,days,tags,rewards_num,upd,comm,supp
0,https://www.crowdfunder.co.uk/free-assange,"London, Greater London, United Kingdom",300000,0.61,,"[Community, Personal Causes]",0,22,1242,3576
1,https://www.crowdfunder.co.uk/blackout2023,"London, Greater London, United Kingdom",100000,0.84,,"[Business, Music]",0,0,9,24
2,https://www.crowdfunder.co.uk/50-days-to-make-...,"London, Greater London, United Kingdom",150000,0.55,,[Environment],3,0,11,68
3,https://www.crowdfunder.co.uk/saveside,"Newcastle upon Tyne, Tyne and Wear, United Kin...",75000,1.04,,[Creative & Arts],0,5,694,1806
4,https://www.crowdfunder.co.uk/lets-smash-the-p...,"London, Greater London, United Kingdom",100000,0.6,,[Politics],6,1,407,2391


In [197]:
# Save the links that failed to scrape, one per line.
with open(r'C:/Users/anfey/Desktop/fails.csv', 'w') as fp:
    fp.writelines("%s\n" % item for item in failed_links)
    print('Done')

Done


In [45]:
with open(r'C:/Users/anfey/Desktop/fails.csv') as file:
    hrefs = file.read()

In [151]:
fails = hrefs.split('\n')[:-1]
fails[0:5]

['https://www.crowdfunder.co.uk/prevent-holiday-hunger-for-kids-this-winter',
 'https://www.crowdfunder.co.uk/the-dodgy-free-peace-sweet-charity-give-away',
 'https://www.crowdfunder.co.uk/rebeccas-mind-half-marathon',
 'https://www.crowdfunder.co.uk/this-community-heart-needs-to-keep-beating',
 'https://www.crowdfunder.co.uk/international-punjabi-conference']

Далее пройдёмся по ссылкам, которые не удалось спарсить, и напишем отдельные функции, чтобы заполнить пропуски в данных.

Собираем финансирование в абсолютном выражении для проектов благотворительности.

In [1]:

def get_charity(content):
    """Return the text of the ``<span class="cf-text cf-text--fixed14 cf-text--thin">`` element.

    The notebook uses it to read the funding figure of charity-type projects.
    ``AttributeError`` is raised when the span is missing.
    """
    return content.find('span', {'class': 'cf-text cf-text--fixed14 cf-text--thin'}).text

In [77]:
from tqdm.notebook import tqdm 
# существует тип проектов charity
# Try the charity layout on every link that failed in the main run.
charity_all = []  # {'href', 'char'} dicts for pages where the charity span was found
no_charity = []   # links where it was not found (or the page failed to load)
for link in tqdm(fails):
    features = {'href': link}
    try:
        soup = get_soup(link)
        content = get_content(soup)
        char_num = get_charity(content)
        features.update({'char': char_num})
        charity_all.append(features)
    except Exception:
        # Best effort: `Exception` (not a bare `except`) keeps Ctrl+C /
        # kernel interrupt able to stop the loop.
        no_charity.append(link)

  0%|          | 0/77 [00:00<?, ?it/s]

In [152]:
# Links of the projects recognised as charity pages.
char_del = [entry['href'] for entry in charity_all]
len(char_del)

14

In [100]:
# Failed links that are not charity pages. Membership is tested against a set
# so every lookup is constant-time instead of a scan of the char_del list.
charity_links = set(char_del)
fails_upd = [item for item in fails if item not in charity_links]
len(fails_upd) # charity projects are less than 1% of the whole sample

63

In [108]:
from tqdm.notebook import tqdm

# Retry the remaining failed links now that get_tags tolerates missing tags.
hrefs_notags = []  # one feature dict per successfully scraped project
fails_notags = []  # links that still fail
# Many links return a 404, so this only aims to recover the projects without tags.
# NOTE(review): the first four links are skipped by the [4:] slice; the reason
# is not stated in the notebook - confirm.
for link in tqdm(fails_upd[4:]):
    try:
        soup = get_soup(link)
        content = get_content(soup)
        money, mult, days = get_money_rate(content)
        features = {
            'href': link,
            'location': get_loc(content),
            'money': money,
            'rate': mult,
            'days': days,
            'tags': get_tags(content),
            'rewards_num': get_rewards_num(content),
            **get_community(content),  # adds 'upd', 'comm', 'supp'
        }
        hrefs_notags.append(features)
    except Exception:
        # Best effort: `Exception` (not a bare `except`) keeps Ctrl+C /
        # kernel interrupt able to stop the loop.
        fails_notags.append(link)

  0%|          | 0/59 [00:00<?, ?it/s]

In [110]:
len(hrefs_notags) # вау, оказалось, больше половины страниц сначала не спарсились из-за отсутствия тэга

53

In [120]:
df2.shape

(1899, 10)

In [121]:
df_notags = pd.DataFrame(hrefs_notags)
df_notags.head()

Unnamed: 0,href,location,money,rate,days,tags,rewards_num,upd,comm,supp
0,https://www.crowdfunder.co.uk/notmyking,"London, Greater London, United Kingdom",60000,0.6,,[Politics],0,2,539,1162
1,https://www.crowdfunder.co.uk/crowdfund-dartmo...,Plymouth,339930,,35.0,,42,6,186,740
2,https://www.crowdfunder.co.uk/thewavebristol,Bristol,219473,,28.0,,23,2,198,937
3,https://www.crowdfunder.co.uk/sole-of-discretion,Plymouth,142055,,35.0,,17,7,16,70
4,https://www.crowdfunder.co.uk/funding-the-firs...,"Aberdeen, Scotland, United Kingdom",121093,,8.0,,8,3,371,1015


In [None]:
concatenated_notags = pd.concat([df2, df_notags])
df2_conc = concatenated_notags.reset_index(drop = True)
df2 = df2_conc # добавляем оставшиеся ссылки в генеральный датафрейм

In [125]:
df2.shape

(1952, 10)

In [126]:
df2.to_pickle('df2_main.pkl')

In [155]:
df2 = pd.read_pickle('df2_main.pkl')
df2.head()

Unnamed: 0,href,location,money,rate,days,tags,rewards_num,upd,comm,supp
0,https://www.crowdfunder.co.uk/free-assange,"London, Greater London, United Kingdom",300000,0.61,,"[Community, Personal Causes]",0,22,1242,3576
1,https://www.crowdfunder.co.uk/blackout2023,"London, Greater London, United Kingdom",100000,0.84,,"[Business, Music]",0,0,9,24
2,https://www.crowdfunder.co.uk/50-days-to-make-...,"London, Greater London, United Kingdom",150000,0.55,,[Environment],3,0,11,68
3,https://www.crowdfunder.co.uk/saveside,"Newcastle upon Tyne, Tyne and Wear, United Kin...",75000,1.04,,[Creative & Arts],0,5,694,1806
4,https://www.crowdfunder.co.uk/lets-smash-the-p...,"London, Greater London, United Kingdom",100000,0.6,,[Politics],6,1,407,2391


In [132]:
# Left join on 'href': keep every row of df2 (the scraped details) and attach
# the matching listing columns from df1.
df = df2.merge(df1, on='href', how='left')

In [134]:
columns_titles = ["href","title", 'description', 'location', 'tags',
                  'upd', 'comm', 'supp', 'rewards_num', 'money', 'rate', 'days', 'status']
df=df.reindex(columns=columns_titles)

In [136]:
df.to_pickle('df_main.pkl')

In [139]:
# Save the DataFrame as a CSV table without a trailing line break.
# NOTE(review): absolute, machine-specific path; consider a configurable directory.
file_out = r'C:/Users/anfey/Desktop/df_main.csv'
df.to_csv(file_out)
with open(file_out, 'rb') as csv_in:
    file_data = csv_in.read()
with open(file_out, 'wb') as csv_out:
    # Strip only the final line terminator. The original cut the last two
    # bytes blindly, which is right for '\r\n' but removes a data byte when
    # the terminator is a single '\n'.
    csv_out.write(file_data.rstrip(b'\r\n'))

601357

В признаке money очень много нулей, поэтому нужно дописать парсер.

In [4]:
import pandas as pd # читаем таблицу с исправленным статусом
df = pd.read_pickle('df_main (1).pkl')
df

Unnamed: 0,href,title,description,location,tags,upd,comm,supp,rewards_num,money,rate,days,status
0,https://www.crowdfunder.co.uk/free-assange,Help campaign to Free Julian Assange,Help campaign to Free Julian Assange and Stop ...,"London, Greater London, United Kingdom","[Community, Personal Causes]",22,1242,3576,0,300000,0.61,,funding
1,https://www.crowdfunder.co.uk/blackout2023,Black Out 2023 | Cannes Lions Festival,Taking Black talent connected to the creative ...,"London, Greater London, United Kingdom","[Business, Music]",0,9,24,0,100000,0.84,,funding
2,https://www.crowdfunder.co.uk/50-days-to-make-...,50 Days to Make a Difference,Our scientists curate a portfolio of effective...,"London, Greater London, United Kingdom",[Environment],0,11,68,3,150000,0.55,,funding
3,https://www.crowdfunder.co.uk/saveside,#SAVESIDE,"As of 9th April 2023, Side Gallery will close ...","Newcastle upon Tyne, Tyne and Wear, United Kin...",[Creative & Arts],5,694,1806,0,75000,1.04,,funding
4,https://www.crowdfunder.co.uk/lets-smash-the-p...,Let's smash the political silence on Brexit,"Everyone knows Brexit isn't working, but polit...","London, Greater London, United Kingdom",[Politics],1,407,2391,6,100000,0.6,,funding
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1947,https://www.crowdfunder.co.uk/schumacher,Schumacher International Network for Change (S...,We are raising £25K in 25 days to build a virt...,Totnes,,2,27,200,8,25225,,25,success
1948,https://www.crowdfunder.co.uk/ipcuk-scholarships,IPCUK Scholarships,Help build a scholarship fund so that we can s...,London,,5,26,238,27,24255,,28,success
1949,https://www.crowdfunder.co.uk/snaffling-pig,Snaffling Pig,A porky brand with the balls to take on the sn...,,,2,0,92,13,23600,,42,success
1950,https://www.crowdfunder.co.uk/hookpod-technolo...,Hookpod; technology to save seabirds and turtles,We work to make longline fishing safe for mari...,"Dartington, England, United Kingdom",,14,42,487,8,72997,,10,success


In [None]:
prob = df[(df['money'] == 'null')] # ищем нули в признаке деньги
links_mon = list(prob['href'])

Сэмплируем 3 рандомные ссылки из списка.

In [56]:
from random import sample 
babelinks = sample(links_mon, 3)
babelinks

['https://www.crowdfunder.co.uk/hastings-pier-take-a-stake-in-our-future',
 'https://www.crowdfunder.co.uk/costoflivingcrisis-1',
 'https://www.crowdfunder.co.uk/edinburghzoo']

In [59]:
from tqdm.notebook import tqdm # собираем величину денег проекта с обновлённой функцией

# Re-scrape the funding figures for links whose 'money' is still 'null'.
# NOTE(review): get_money_succ is defined in a later cell of this notebook;
# that cell has to be run before this one on a fresh kernel.
hrefs_upd = []  # {'href', 'money', 'rate', 'days'} dicts for pages that loaded
fails_mon = []  # links whose page could not be scraped
for link in tqdm(links_mon):
    features = {'href': link}
    try:
        soup = get_soup(link)
        content = get_content(soup)
        progress = get_money_succ(content)
        features.update({'money': progress[0]})
        features.update({'rate': progress[1]})
        features.update({'days': progress[2]})
        hrefs_upd.append(features)
    except Exception:
        # Best effort: `Exception` (not a bare `except`) keeps Ctrl+C /
        # kernel interrupt able to stop the loop.
        fails_mon.append(link)

  0%|          | 0/337 [00:00<?, ?it/s]

In [15]:
soup = get_soup('https://www.crowdfunder.co.uk/p/keep-the-lights-on')
content = get_content(soup)

In [51]:
def get_money_succ(content):
    """Read the raised amount and duration from a finished project's summary.

    The summary sentence in ``<span class="cf-text cf-text--thin">`` is
    expected to contain two ``'£'`` amounts: the raised sum is the first
    token after the first ``'£'``, and the duration is the first token after
    ``'in '`` that follows the second ``'£'``.

    Args:
        content: Project page content as returned by ``get_content``.

    Returns:
        ``(money, 'null', days)`` with ``money`` and ``days`` as ints, or
        ``('null', 'null', 'null')`` when the sentence is missing or does not
        have the expected shape. The middle element is always ``'null'``.
    """
    try:
        report = content.find('span', {'class': 'cf-text cf-text--thin'}).text
        report1 = report.split('£')[1].split(' with')
        report2 = report.split('£')[2]
        money = int(report1[0].split()[0].replace(",", ""))
        days = int(report2.split('in ')[1].split()[0])
        mult = 'null'
    except Exception:
        # Unparsable page. `Exception` rather than a bare `except` so that a
        # kernel interrupt is not swallowed.
        mult = 'null'
        money = 'null'
        days = 'null'
    return money, mult, days

In [52]:
get_money_succ(content)

(26662, 'null', 35)

In [63]:
# Save the re-scraped funding records (not the failures), one record per line.
# Each record is a dict, so "%s" writes its repr.
with open(r'C:/Users/anfey/Desktop/hrefs_mon.csv', 'w') as fp:
    fp.writelines("%s\n" % item for item in hrefs_upd)
    print('Done')

Done


In [2]:
with open(r'C:/Users/anfey/Desktop/hrefs_mon.csv') as file:
    hrefs_mon = file.read()

In [3]:
hrefs_mon = hrefs_mon.split('\n')[:-1]

In [5]:
import ast
import pandas as pd

# Rebuild the DataFrame of re-scraped funding records. After reading the file
# back, every element of hrefs_mon is the *text* of a dict (it was written
# with "%s"), so it must be parsed; passing the raw strings to pd.DataFrame
# would give a single unnamed column and break df_mon['money'] below.
# ast.literal_eval only evaluates Python literals, so it is safe on file content.
df_mon = pd.DataFrame([
    row if isinstance(row, dict) else ast.literal_eval(row)  # tolerate records that are already dicts
    for row in hrefs_mon
])
df_mon

Unnamed: 0,href,money,rate,days
0,https://www.crowdfunder.co.uk/new-golf-gym---l...,52055,,35
1,https://www.crowdfunder.co.uk/the-destiny-camp...,53323,,42
2,https://www.crowdfunder.co.uk/csaa-food-bank,45258,,42
3,https://www.crowdfunder.co.uk/crowdfunder-cost...,,,
4,https://www.crowdfunder.co.uk/the87press2023,,,
...,...,...,...,...
331,https://www.crowdfunder.co.uk/bringbackbrighto...,71841,,552
332,https://www.crowdfunder.co.uk/win-mark-knopfle...,68733,,20
333,https://www.crowdfunder.co.uk/trusselltrust,68616,,843
334,https://www.crowdfunder.co.uk/sustaining-creat...,68541,,179


In [6]:
prob3 = df_mon[(df_mon['money'] == 'null')] # смотрим, где остались нули
links_mon1 = list(prob3['href'])
prob3.head()

Unnamed: 0,href,money,rate,days
3,https://www.crowdfunder.co.uk/crowdfunder-cost...,,,
4,https://www.crowdfunder.co.uk/the87press2023,,,
5,https://www.crowdfunder.co.uk/brass-blast,,,
6,https://www.crowdfunder.co.uk/stockton-hockey-...,,,
8,https://www.crowdfunder.co.uk/unravellingshort...,,,


In [35]:
def get_money_running(content):
    """Read the raised amount and duration from a project's call-out paragraph.

    The sentence in ``<p class="cf-callout__text">`` is parsed: the raised
    sum is the first token after the first ``'£'``, and the duration is the
    first token after ``'in '`` in the part that follows ``' with'``.

    Args:
        content: Project page content as returned by ``get_content``.

    Returns:
        ``(money, 'null', days)`` with ``money`` and ``days`` as ints, or
        ``('null', 'null', 'null')`` when the paragraph is missing or does
        not have the expected shape. The middle element is always ``'null'``.
    """
    try:
        report = content.find('p', {'class': 'cf-callout__text'}).text
        report1 = report.split('£')[1].split(' with')
        money = int(report1[0].split()[0].replace(",", ""))
        mult = 'null'
        days = int(report1[1].split('in ')[1].split()[0])
    except Exception:
        # Unparsable page. `Exception` rather than a bare `except` so that a
        # kernel interrupt is not swallowed.
        mult = 'null'
        money = 'null'
        days = 'null'
    return money, mult, days

In [21]:
soup = get_soup('https://www.crowdfunder.co.uk/p/support-my-slow-fashion-studio')
content = get_content(soup)

In [36]:
get_money_running(content)

(1001, 'null', 42)

In [37]:
from tqdm.notebook import tqdm # парсинг по фейловым ссылкам

# Second pass over the links that still have money == 'null', using the
# call-out paragraph parser.
hrefs_run = []  # {'href', 'money', 'rate', 'days'} dicts for pages that loaded
fails_run = []  # links whose page could not be scraped
for link in tqdm(links_mon1):
    features = {'href': link}
    try:
        soup = get_soup(link)
        content = get_content(soup)
        progress = get_money_running(content)
        features.update({'money': progress[0]})
        features.update({'rate': progress[1]})
        features.update({'days': progress[2]})
        hrefs_run.append(features)
    except Exception:
        # Best effort: `Exception` (not a bare `except`) keeps Ctrl+C /
        # kernel interrupt able to stop the loop.
        fails_run.append(link)

  0%|          | 0/47 [00:00<?, ?it/s]

In [84]:
df_last_hrefs = pd.DataFrame(hrefs_run) # посмотрим на успешный сбор
df_last_hrefs.head()

Unnamed: 0,href,money,rate,days
0,https://www.crowdfunder.co.uk/crowdfunder-cost...,,,
1,https://www.crowdfunder.co.uk/the87press2023,18460.0,,56.0
2,https://www.crowdfunder.co.uk/brass-blast,17545.0,,32.0
3,https://www.crowdfunder.co.uk/stockton-hockey-...,16553.0,,78.0
4,https://www.crowdfunder.co.uk/unravellingshort...,12381.0,,32.0


In [46]:
concatenated_mon = pd.concat([df_mon, df_last_hrefs]) # совмещаем с последним датафреймом
df_money = concatenated_mon.reset_index(drop = True)
df_money

Unnamed: 0,href,money,rate,days
0,https://www.crowdfunder.co.uk/new-golf-gym---l...,52055,,35
1,https://www.crowdfunder.co.uk/the-destiny-camp...,53323,,42
2,https://www.crowdfunder.co.uk/csaa-food-bank,45258,,42
3,https://www.crowdfunder.co.uk/crowdfunder-cost...,,,
4,https://www.crowdfunder.co.uk/the87press2023,,,
...,...,...,...,...
378,https://www.crowdfunder.co.uk/hastings-pier-ta...,,,
379,https://www.crowdfunder.co.uk/place-a-trade-gi...,,,
380,https://www.crowdfunder.co.uk/sunderland-together,,,
381,https://www.crowdfunder.co.uk/immunisation-coa...,,,


In [47]:
df_money.drop_duplicates(subset=['href'], keep='last', inplace = True) # дропнем повторяющиеся проекты
df_money

Unnamed: 0,href,money,rate,days
0,https://www.crowdfunder.co.uk/new-golf-gym---l...,52055,,35
1,https://www.crowdfunder.co.uk/the-destiny-camp...,53323,,42
2,https://www.crowdfunder.co.uk/csaa-food-bank,45258,,42
7,https://www.crowdfunder.co.uk/wigtonclubhub,16528,,49
13,https://www.crowdfunder.co.uk/tnn-together,6390,,56
...,...,...,...,...
378,https://www.crowdfunder.co.uk/hastings-pier-ta...,,,
379,https://www.crowdfunder.co.uk/place-a-trade-gi...,,,
380,https://www.crowdfunder.co.uk/sunderland-together,,,
381,https://www.crowdfunder.co.uk/immunisation-coa...,,,


In [51]:
prob4 = df_money[(df_money['money'] == 'null')]
prob4.shape

(27, 4)

In [52]:
df_money.to_pickle('df_money.pkl') # сохраняем датафрейм в пикл

В этой части я пробую собрать дату добавления проекта на сайт, но при парсинге оказалось, что она есть только у малого кол-ва проектов, поэтому такой признак мы не стали вводить.

In [11]:
soup = get_soup('https://www.crowdfunder.co.uk/p/funding-reforestion-of-cornwall')

In [12]:
def get_upload(soup):
    """Return the date part of ``uploadDate`` from the page's first JSON-LD block.

    The value is located with a regular expression instead of splitting on
    whitespace, so it works whether or not the JSON has a space after the
    colon, and for values with or without a time part.

    Args:
        soup: Parsed project page.

    Returns:
        The text of the ``uploadDate`` value up to (not including) the ``'T'``
        that starts the time part, e.g. ``'2023-03-30'``.

    Raises:
        AttributeError: If the page has no
            ``<script type="application/ld+json">`` element.
        IndexError: If that element has no ``uploadDate`` field (the same
            exception type the original split-based code raised).
    """
    import re  # local import: the notebook has no shared import cell for it

    head = soup.find('script', {'type': 'application/ld+json'}).text
    found = re.search(r'uploadDate"?\s*:\s*"([^"T]*)', head)
    if found is None:
        raise IndexError('uploadDate not found in the JSON-LD block')
    return found.group(1)

In [13]:
get_upload(soup)

'2023-03-30'

In [14]:
links_date = list(df['href'])
len(links_date)

1952

In [92]:
from random import sample # берём сэмпл из трёх ссылок
minlinks = sample(links_date, 3)
minlinks

['https://www.crowdfunder.co.uk/fashionandtextilesforever',
 'https://www.crowdfunder.co.uk/redcarrugbyclub',
 'https://www.crowdfunder.co.uk/richmondrugby']

In [15]:
from tqdm.notebook import tqdm # пробуем собрать даты для половины проектов

# Try to collect the upload date for the first 800 project links.
hrefs_date = []   # {'href', 'upload_date'} dicts for pages that expose a date
failed_date = []  # links without a date (or whose page failed to load)
for link in tqdm(links_date[0:800]):
    features = {'href': link}
    try:
        soup = get_soup(link)
        features.update({'upload_date': get_upload(soup)})
        hrefs_date.append(features)
    except Exception:
        # Best effort: `Exception` (not a bare `except`) keeps Ctrl+C /
        # kernel interrupt able to stop the loop.
        failed_date.append(link)

  0%|          | 0/800 [00:00<?, ?it/s]

In [19]:
# Save the collected upload dates (not the failures), one record per line.
# Each record is a dict, so "%s" writes its repr.
with open(r'C:/Users/anfey/Desktop/hrefs_date.csv', 'w') as fp:
    fp.writelines("%s\n" % item for item in hrefs_date)
    print('Done')

Done


In [20]:
# Save the links for which no upload date could be collected, one per line.
with open(r'C:/Users/anfey/Desktop/fails_date.csv', 'w') as fp:
    fp.writelines("%s\n" % item for item in failed_date)
    print('Done')

Done


In [7]:
df_money = pd.read_pickle('df_money.pkl')

In [10]:
df_money[df_money['money'] == 'null'].shape # а здесь смотрим, сколько в итоге нулей в money

(27, 4)