In [1]:
import scrapy
import requests
import pandas as pd
import time
import re 
import numpy as np
from scrapy.http import TextResponse
from scrapy.crawler import CrawlerProcess

In [2]:
def urlscraper(url, base_url="https://www.kicktraq.com", timeout=30):
    """Collect the absolute project URLs listed on one Kicktraq page.

    Parameters
    ----------
    url : str
        Address of the Kicktraq listing page to download.
    base_url : str, optional
        Prefix prepended to every extracted ``href`` value.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection raises
        instead of blocking the crawl indefinitely.

    Returns
    -------
    list of str
        ``base_url + href`` for every ``h2 > a`` link found inside a
        ``project-infobox`` div, in the order the selector returns them.
    """
    page = requests.get(url, timeout=timeout)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    hrefs = response.css("div[class = 'project-infobox'] h2>a::attr(href)").extract()
    # Build a new list instead of rebinding the ``url`` parameter.
    return [base_url + href for href in hrefs]

In [373]:
# Walk through the Kicktraq archive pages and gather every project URL
pages = []
for page_number in range(700):
    archive_url = f"https://www.kicktraq.com/archive/?page={page_number}"
    pages += urlscraper(url=archive_url)

In [None]:
# Wrap the collected project URLs in a single-column DataFrame
df = pd.DataFrame(dict(URL=pages))


In [114]:
def kicktraqscraper(url, timeout=30):
    """Scrape the available features from one Kicktraq project page.

    Parameters
    ----------
    url : str
        Address of the Kicktraq project page.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection raises
        instead of hanging the whole scraping loop.

    Returns
    -------
    dict
        Keys ``status``, ``url``, ``creator``, ``category``, ``start date``,
        ``end date``, ``description``, ``backer``, ``pledge_per_backer``,
        ``funded`` and ``goal``. ``url`` is the link found in the project
        image box (not the ``url`` argument). ``pledge_per_backer`` is the
        string ``"None"`` when the page has no "Average Pledge Per Backer"
        line. ``status``, ``url``, ``creator`` and ``category`` are ``None``
        when their selector matches nothing.

    Raises
    ------
    IndexError
        If the page lacks the expected text nodes, a parsable backer count,
        the ``": "`` / ``" of "`` separators, or two date links.
    """
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept to preserve existing behaviour; confirm that this is intended.
    page = requests.get(url, verify=False, timeout=timeout)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    whitespace = re.compile(r'[\n\r\t]')

    # Evaluate each selector once and reuse the result below.
    info = response.css("div#project-info-text::text").extract()
    dates = response.css("div[id = 'project-info-text'] a[class = 'datelink']::text").extract()

    # The backers line is read from index 2 in both page layouts. Commas are
    # stripped so that a token such as "1,234" is still recognised as a number.
    backer_tokens = (token.replace(",", "") for token in info[2].split())
    backer = [int(token) for token in backer_tokens if token.isdigit()][0]

    # Some project pages don't provide the average pledge per backer, which
    # shifts the position of the funding line by one text node.
    if any("Average Pledge Per Backer" in text for text in info):
        pledge_per_backer = whitespace.sub("", info[3]).split(": ")[1]
        funding_index = 5
    else:
        pledge_per_backer = "None"
        funding_index = 4

    # Funding text has the shape "<label>: <funded> of <goal>".
    money = whitespace.sub("", info[funding_index]).split(": ")[1]
    funding_parts = money.split(" of ")
    funded, goal = funding_parts[0], funding_parts[1]

    status = response.css("div[class = 'ribbon'] h3::text").extract_first(default=None)
    # Stored under the "url" key; a separate name avoids shadowing the argument.
    project_link = response.css("div[id = 'project-info-image'] a::attr(href)").extract_first(default=None)
    creator = response.css("div[id = 'project-info-text'] a[target='_blank']::attr(href)").extract_first(default=None)
    category = response.css("div[class='project-cat']>a::text").extract_first(default=None)
    start_date = dates[0]
    end_date = dates[1]
    # The first text node of the info box is the project description.
    description = whitespace.sub("", info[0])

    return {
        "status": status,
        "url": project_link,
        "creator": creator,
        "category": category,
        "start date": start_date,
        "end date": end_date,
        "description": description,
        "backer": backer,
        "pledge_per_backer": pledge_per_backer,
        "funded": funded,
        "goal": goal,
    }

In [None]:
# Visit each project URL collected earlier and scrape its Kicktraq features
traqdata = []
for project_url in df["URL"].tolist():
    traqdata.append(kicktraqscraper(url=project_url))

In [141]:
# Build a DataFrame from the scraped records and write it out as CSV
data_kicktraq = pd.DataFrame(traqdata)
data_kicktraq.to_csv("kicktraq_data.csv")

In [2]:
def _digits_to_ints(texts):
    """Convert scraped strings to ints built from their decimal digits.

    Strings without any decimal digit (for example whitespace-only text
    nodes) are skipped, because ``int("")`` would raise ``ValueError``.
    ``str.isdecimal`` is used so that only characters ``int`` can parse are
    kept (``str.isnumeric`` also accepts characters such as fractions).
    """
    numbers = []
    for text in texts:
        digits = "".join(ch for ch in text if ch.isdecimal())
        if digits:
            numbers.append(int(digits))
    return numbers


def kickstarterscraper(url, timeout=30):
    """Scrape the features of a Kickstarter project page that Kicktraq lacks.

    Parameters
    ----------
    url : str
        Address of the Kickstarter project page (the link provided on the
        Kicktraq page).
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection raises
        instead of hanging the whole scraping loop.

    Returns
    -------
    dict
        Keys ``title``, ``description``, ``len_desc``, ``num_update``,
        ``num_comment``, ``num_faq``, ``location``, ``num_tiers``,
        ``num_pledge_backers``, ``backer_per_tier``, ``featured``,
        ``tier_limits``, ``min_limit``, ``max_limit`` and ``mean_limit``.
        Missing title/description/location become ``""`` (``len_desc`` is
        then 0). The tier and limit statistics are the string ``"None"``
        when no tier or limit value could be parsed. ``featured`` is 1 or 0.
    """
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept to preserve existing behaviour; confirm that this is intended.
    page = requests.get(url, verify=False, timeout=timeout)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")

    # The same feature is not in the same section on every page, so each one
    # is looked up with a primary selector and a fallback selector.
    title = response.css("div[class = 'grid-col-10 grid-col-10-lg grid-col-offset-1-md block-md order2-md type-center'] h2::text").extract_first(default=None)
    title_other = response.css("a.hero__link::text").extract_first(default=None)
    title = title or title_other or ""

    description = response.css("div[class = 'grid-col-10 grid-col-10-lg grid-col-offset-1-md block-md order2-md type-center'] p[class='type-14 type-18-md soft-black project-description mb1']::text").extract_first(default=None)
    description_other = response.css("span.relative>span[class ='content edit-profile-blurb js-edit-profile-blurb']::text").extract_first(default=None)
    descr = description or description_other or ""
    # Always an int: 0 for a missing description, like len_about in
    # creatorbioscraper.
    len_desc = len(descr)

    featured = response.css("span.ml1>span::text").extract_first(default=None)
    featured_other = response.css("div[class='NS_projects__category_location ratio-16-9 flex items-center'] a[class='grey-dark mr3 nowrap type-12 flex items-center']::text").extract()
    # 1 = featured, 0 = not featured
    fea = 1 if (featured == 'Project We Love' or featured_other) else 0

    update = response.css("a[class ='js-analytics-section js-load-project-content js-load-project-updates mx3 project-nav__link--updates tabbed-nav__link type-14'] span[class = 'count']::text").extract_first()
    comment = response.css("a[class ='js-analytics-section js-load-project-comments js-load-project-content mx3 project-nav__link--comments tabbed-nav__link type-14'] span[class = 'count']>data[itemprop='Project[comments_count]']::text").extract_first()
    faq = response.css("a[class ='js-analytics-section js-load-project-content js-load-project-faqs mx3 project-nav__link--faqs tabbed-nav__link type-14'] span[class = 'count']::text").extract_first()

    location = response.css("a[class ='nowrap navy-700 flex items-center medium mr3 type-12 keyboard-focusable'] span[class = 'ml1']::text").extract_first(default=None)
    location_other = response.css("a[class = 'grey-dark mr3 nowrap type-12']::text").extract_first(default=None)
    loc = location or location_other or ""

    # Number of backers per reward tier, keeping only the digits of each entry.
    b_p_t = response.css("div[class = 'text-nowrap type-12 support-500 radius6px px2 py4px bg-support-100']::text").extract()
    backer_per_tier = _digits_to_ints(b_p_t)
    tier = len(backer_per_tier)
    if backer_per_tier:
        num_pledge_backers = sum(backer_per_tier)
        avg_backer_per_tier = num_pledge_backers / tier
    else:
        # Some projects don't have reward tiers.
        avg_backer_per_tier = "None"
        num_pledge_backers = "None"

    # Pledge amounts of the tiers; every non-digit character is dropped.
    pledge_lim = response.css("h2>span.money::text").extract()
    tier_limits = _digits_to_ints(pledge_lim)
    if tier_limits:
        mean_limit = sum(tier_limits) / len(tier_limits)
        min_limit = min(tier_limits)
        max_limit = max(tier_limits)
    else:
        mean_limit = "None"
        min_limit = "None"
        max_limit = "None"

    return {
        "title": title,
        "description": descr,
        "len_desc": len_desc,
        "num_update": update,
        "num_comment": comment,
        "num_faq": faq,
        "location": loc,
        "num_tiers": tier,
        "num_pledge_backers": num_pledge_backers,
        "backer_per_tier": avg_backer_per_tier,
        "featured": fea,
        "tier_limits": tier_limits,
        "min_limit": min_limit,
        "max_limit": max_limit,
        "mean_limit": mean_limit,
    }


In [None]:
# Run kickstarterscraper on the project link of every scraped Kicktraq record
kickdata = []
for kickstarter_url in data_kicktraq["url"].tolist():
    kickdata.append(kickstarterscraper(url=kickstarter_url))

In [44]:
# Convert the scraped Kickstarter records to a DataFrame and save them as CSV
data_kickstarter = pd.DataFrame(kickdata)
data_kickstarter.to_csv("kikstarter.csv")

In [57]:
def creatorbioscraper(url, timeout=30):
    """Scrape a creator's profile page on Kickstarter.

    Parameters
    ----------
    url : str
        Address of the creator's profile page.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection raises
        instead of hanging the whole scraping loop.

    Returns
    -------
    dict
        Keys ``num_backed_by_owner``, ``num_created_by_owner``, ``website``
        (1/0), ``facebook`` (1/0), ``owner``, ``about`` and ``len_about``.
        ``owner`` and ``about`` are ``None`` when their selector matches
        nothing; ``len_about`` is then 0.

    Raises
    ------
    IndexError
        If the created-projects block yields fewer than three tokens.
    ValueError
        If the created/backed tokens cannot be parsed as integers.
    """
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept to preserve existing behaviour; confirm that this is intended.
    page = requests.get(url, verify=False, timeout=timeout)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")

    info = response.css("div[class = 'created-projects py2 f5 mb3'] ::text").extract()
    # Drop the '·' separator after tokenising. Unlike list.remove this does
    # not raise when the separator is absent or padded with whitespace.
    tokens = [token for token in ' '.join(info).split() if token != '·']
    created = tokens[0]
    # Number of projects backed by the creator (thousands separators removed).
    backed = int(tokens[2].replace(",", ""))
    # Number of projects created by the creator; a token containing "First"
    # is counted as one project.
    if "First" in created:
        creat = 1
    else:
        creat = int(created.replace(",", ""))

    web = response.css("div[class = 'pt3 pt7-sm mobile-hide'] ::text").extract()
    website = 1 if web else 0  # 1 = has website(s), 0 = none

    fb_words = ' '.join(response.css("div[class = 'facebook py2 border-bottom f5'] ::text").extract()).split()
    fb = 0 if "Not" in fb_words else 1  # 0 = not connected to Facebook, 1 = connected

    # Creator's name; stays None when the element is missing, because
    # re.sub would raise TypeError on None.
    creator = response.css("span.identity_name::text").extract_first(default=None)
    if creator is not None:
        creator = re.sub(r'[\n\r\t]', "", creator)

    # Some details about the creator.
    about = response.css("div.readability>p::text").extract_first(default=None)
    len_about = len(about) if about else 0

    return {
        "num_backed_by_owner": backed,
        "num_created_by_owner": creat,
        "website": website,
        "facebook": fb,
        "owner": creator,
        "about": about,
        "len_about": len_about,
    }


In [None]:
# Run creatorbioscraper on the creator profile link of every project
creatordata = []
for creator_url in data_kicktraq["creator"].tolist():
    creatordata.append(creatorbioscraper(url=creator_url))

In [74]:
# Convert the creator records to a DataFrame and save them as CSV
data_creator = pd.DataFrame(creatordata)
data_creator.to_csv("creatordata_thirdset_half.csv")