In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Draw every figure on a white background.
plt.style.use({'figure.facecolor': 'white'})

# Both plain and TLS traffic go through the same local proxy.
proxies = dict.fromkeys(('http', 'https'), '127.0.0.1:3213')

# Browser-like User-Agent sent with every request.
headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

# Sample search url; only its scheme and host are reused below.
url = 'https://www.speakev.com/search/583560/?q=fiat&c[rankProfileName]=default&o=relevance'

# The first two fields of the parse result are (scheme, netloc).
scheme, netloc = urlparse(url)[:2]


In [2]:
class Content:
	""" Hold the post information collected from one webpage """

	def __init__(self, url, links, titles, names, views, replies, datetime):
		# address of the page the values below were taken from
		self.url = url
		# the remaining arguments are stored unchanged, one attribute each
		self.links, self.titles = links, titles
		self.names = names
		self.views, self.replies = views, replies
		self.datetime = datetime

In [3]:
class Crawler:
	""" Crawl the post information listed on a forum search-result page.

	The ``find_*`` helpers take a parsed page and return plain lists;
	a page without matching elements yields empty lists.
	"""

	@staticmethod
	def get_page(url, timeout=30):
		""" Download ``url`` and return it as a parsed page.

		:param url: address of the page to fetch
		:param timeout: seconds to wait for the server before giving up
		:return: ``BeautifulSoup`` object, or ``None`` if the request failed
		"""
		try:
			# without a timeout a stalled connection would block the crawl
			r = requests.get(url, proxies=proxies, headers=headers,
				timeout=timeout)
		except RequestException as e:
			print(e)
			return None
		return BeautifulSoup(r.text, 'lxml')

	@staticmethod
	def find_links_titles(page):
		""" Return ``(links, titles)`` of all posts listed on the page.

		Links are made absolute with the module-level ``scheme`` and
		``netloc``. A heading without an anchor is skipped so that both
		lists stay aligned.
		"""
		links, titles = [], []
		# attrs is a dict: attribute name -> required value
		for heading in page.find_all('h3', {'class': 'contentRow-title'}):
			anchor = heading.find('a')
			if anchor is None:
				continue
			links.append(f"{scheme}://{netloc}{anchor['href']}")
			titles.append(anchor.text)
		return links, titles

	@staticmethod
	def find_names(page):
		""" Return the text of every ``a.username`` element on the page """
		return [tag.text for tag in page.find_all('a', {'class': 'username'})]

	@staticmethod
	def find_views_replies(page):
		""" Return ``(views, replies)`` as lists of the raw counter texts.

		The counter text is read from the parent element of each icon.
		"""
		view_icons = page.find_all('i', {'class': 'eye-icon'})
		reply_icons = page.find_all('i', {'class': 'message-icon'})
		views, replies = [], []
		# zip() stops at the shorter list, so a missing icon cannot raise
		for view_icon, reply_icon in zip(view_icons, reply_icons):
			views.append(view_icon.parent.text)
			replies.append(reply_icon.parent.text)
		return views, replies

	@staticmethod
	def find_datetime(page):
		""" Return the ``datetime`` attribute of every ``<time>`` element """
		return [tag.attrs['datetime'] for tag in page.find_all('time')]

	def parse(self, url):
		""" Fetch ``url`` and store what was found in a ``Content`` object.

		:param url: address of a search-result page
		:return: ``Content`` instance, or ``None`` when the page could not
			be downloaded
		"""
		soup = self.get_page(url)
		if soup is None:
			return None
		links, titles = self.find_links_titles(soup)
		names = self.find_names(soup)
		views, replies = self.find_views_replies(soup)
		datetime = self.find_datetime(soup)
		return Content(url, links, titles, names, views, replies, datetime)

## Define a function that returns the last page number of the search results

In [4]:
def get_last_page(url):
	"""Return the number of the last page of a search.

	:param url: address of the first page of the search results
	:return: number read from the last ``li.pageNav-page`` element, or 1
		when the page contains no such element
	:raises requests.HTTPError: if the server answers with an error status
	"""
	r = requests.get(url, proxies=proxies, headers=headers, timeout=30)
	# fail loudly on an error page instead of reporting it as "one page"
	r.raise_for_status()
	soup = BeautifulSoup(r.text, 'lxml')
	# attrs is a dict: attribute name -> required value
	page_items = soup.find_all('li', {'class': 'pageNav-page'})
	if not page_items:
		return 1
	return int(page_items[-1].text)

In [15]:
# car models that are looked up on the forum
cars = ['fiat 500', 'SEAT Mii', 'Mini Cooper',
		'Honda E', 'Renault Zoe', 'Smart EQ']

# one search-result url per car, in the same order as `cars`
cars_webs = [
	'https://www.speakev.com/search/601568/?q=fiat+500&o=relevance',
	'https://www.speakev.com/search/601570/?q=SEAT+Mii&o=relevance',
	'https://www.speakev.com/search/601573/?q=Mini+Cooper&o=relevance',
	'https://www.speakev.com/search/601574/?q=Honda+E&o=relevance',
	'https://www.speakev.com/search/601575/?q=Renault+Zoe&o=relevance',
	'https://www.speakev.com/search/601576/?q=Smart+EQ&o=relevance'
]

# split every url into a (prefix, query) tuple so a page number can be
# inserted between the two parts later
pattern = re.compile("(.*?[0-9]+[/?]+)(.*)")
cars_webs = [pattern.findall(web)[0] for web in cars_webs]

cars_webs

[('https://www.speakev.com/search/601568/?', 'q=fiat+500&o=relevance'),
 ('https://www.speakev.com/search/601570/?', 'q=SEAT+Mii&o=relevance'),
 ('https://www.speakev.com/search/601573/?', 'q=Mini+Cooper&o=relevance'),
 ('https://www.speakev.com/search/601574/?', 'q=Honda+E&o=relevance'),
 ('https://www.speakev.com/search/601575/?', 'q=Renault+Zoe&o=relevance'),
 ('https://www.speakev.com/search/601576/?', 'q=Smart+EQ&o=relevance')]

In [16]:
def get_page_links(url_tupel):
	""" Build the url of every result page of one search.

	``url_tupel`` holds two url parts; ``page=<n>&`` is inserted between
	them for n = 1 .. last page (as reported by ``get_last_page``).
	"""
	prefix, query = url_tupel[0], url_tupel[1]
	last_page = get_last_page(prefix + query)
	return [prefix + f'page={page_no}&' + query
			for page_no in range(1, last_page + 1)]

## Collect the search-result page URLs for each car

In [17]:
# map every car name to the list of its search-result page urls
cars_webs_dict = {
	car: get_page_links(cars_webs[pos]) for pos, car in enumerate(cars)
}

## Build the posts DataFrame for each car and save it as a CSV file

In [19]:
# helpers that crawl the search-result pages of a car and turn them into a DataFrame
def get_post_objects(webpages):
	"""
	Crawl every search-result page and collect the parsed results.

	:param webpages: iterable of search-result page urls
	:return: list with one object returned by ``Crawler.parse`` per page;
		pages for which ``parse`` returned ``None`` are left out
	"""
	craw = Crawler()
	post_objects = []
	for url in webpages:
		# parse exactly once per url: every call downloads the page again
		content = craw.parse(url)
		if content is not None:
			post_objects.append(content)

	return post_objects

def create_posts_df(post_objects, max_replies=1000):
	""" Merge the crawled pages into one cleaned DataFrame of posts.

	:param post_objects: iterable of objects exposing the list attributes
		``links``, ``titles``, ``names``, ``views``, ``replies`` and
		``datetime``; ``None`` entries are ignored
	:param max_replies: rows whose reply count is not below this value
		are treated as abnormal and dropped
	:return: DataFrame with the six columns above, integer ``views`` and
		``replies``, unique ``titles`` and a fresh 0..n-1 index
	"""
	col_names = ['links', 'titles', 'names', 'views', 'replies', 'datetime']

	merged = {name: [] for name in col_names}
	for page in post_objects:
		if page is None:
			continue
		for name in col_names:
			merged[name].extend(getattr(page, name))

	# zip() keeps only complete rows if the merged lists differ in length
	rows = list(zip(*(merged[name] for name in col_names)))
	df = pd.DataFrame(rows, columns=col_names)

	# strip the "post-<digits>" part from the links
	post_suffix = re.compile('post-[0-9]*')
	df['links'] = df['links'].map(lambda link: post_suffix.sub('', link))

	# convert counters such as '495', '1K' or '1.2K' to integers
	for counter in ('replies', 'views'):
		df[counter] = df[counter].map(_parse_count).astype(int)

	# remove duplicate title links
	df = df.drop_duplicates('titles', ignore_index=True)

	# remove abnormal replies
	df = df[df['replies'] < max_replies].reset_index(drop=True)

	return df


def _parse_count(text):
	""" Convert a counter text ('495', '1,234', '2K', '1.2K', '3M') to int """
	cleaned = str(text).strip().replace(',', '').upper()
	multiplier = 1
	if cleaned.endswith('K'):
		multiplier, cleaned = 1000, cleaned[:-1]
	elif cleaned.endswith('M'):
		multiplier, cleaned = 1000000, cleaned[:-1]
	# float() accepts the decimal part that int() would reject
	return int(round(float(cleaned) * multiplier))

In [None]:
# Build one cleaned posts table per car; every line crawls all search-result
# pages of that car. df_fiat is created here as well because later cells
# read it.
df_fiat = create_posts_df(get_post_objects(cars_webs_dict['fiat 500']))
df_seat = create_posts_df(get_post_objects(cars_webs_dict['SEAT Mii']))
df_mini = create_posts_df(get_post_objects(cars_webs_dict['Mini Cooper']))
df_honda = create_posts_df(get_post_objects(cars_webs_dict['Honda E']))
df_renault = create_posts_df(get_post_objects(cars_webs_dict['Renault Zoe']))
df_smart = create_posts_df(get_post_objects(cars_webs_dict['Smart EQ']))

In [21]:
# Display the Fiat 500 posts table.
df_fiat

Unnamed: 0,links,titles,names,views,replies,datetime
0,https://www.speakev.com/threads/fiat-500e-conv...,Fiat 500e Convertible La Prima Test Drive/Review,MarkyM,1000,17,2021-04-24T16:44:37-0400
1,https://www.speakev.com/threads/how-the-other-...,How the other half live (Fiat 500 $83 per mont...,Russ,1000,5,2015-03-18T17:00:11-0400
2,https://www.speakev.com/threads/fiat-500e-2020...,Fiat 500e (2020) - Youtube Reviews,Kronospace,1000,1,2020-11-15T15:08:01-0500
3,https://www.speakev.com/threads/new-fiat-500-e...,New Fiat 500 Electric,cah197,495,2,2020-04-24T05:49:43-0400
4,https://www.speakev.com/threads/fiat-500e-any-...,"Fiat 500e, any info?",Lee Fraser,1000,7,2015-08-06T18:58:20-0400
...,...,...,...,...,...,...
310,https://www.speakev.com/threads/making-some-pr...,Making some progress with the Nottingham council!,EVPotential,2000,39,2015-03-18T15:23:26-0400
311,https://www.speakev.com/threads/selling-a-leaf...,Selling a Leaf 24 to We Buy Any Car,Aleras,3000,82,2021-04-21T04:00:17-0400
312,https://www.speakev.com/threads/red-or-wrap-ca...,Red or wrap car?,Durzel,2000,21,2019-06-17T07:42:19-0400
313,https://www.speakev.com/threads/it-seems-only-...,It Seems Only Yesterday,Hogthelimelight,7000,2,2017-06-24T15:13:54-0400


In [22]:
# save as csv
# df_fiat.to_csv('df_fiat.csv', index=False)
# df_mini.to_csv('df_mini.csv', index=False)
# df_seat.to_csv('df_seat.csv', index=False)
# df_honda.to_csv('df_honda.csv', index=False)
# df_smart.to_csv('df_smart.csv', index=False)
# df_renault.to_csv('df_renault.csv', index=False)

## Get the text of the posts

In [290]:
class TextContent:
	""" Hold the reply text data collected from one thread page """

	def __init__(self, title, comments, names):
		# the three arguments are stored unchanged, one attribute each
		self.title, self.comments, self.names = title, comments, names


class GetText:
	""" Crawl the reply text data behind a thread link """

	@staticmethod
	def get_page(url, timeout=30):
		""" Download ``url`` and return it as a parsed page.

		:param url: address of the thread page
		:param timeout: seconds to wait for the server before giving up
		:return: ``BeautifulSoup`` object, or ``None`` if the request failed
		"""
		try:
			r = requests.get(url, proxies=proxies, headers=headers,
				timeout=timeout)
		except RequestException as e:
			print(e)
			return None
		return BeautifulSoup(r.text, 'lxml')

	@staticmethod
	def get_title(page):
		""" Return the ``<h1>`` text repeated once per ``article.message``.

		An empty string is used when the page has no ``<h1>``.
		"""
		rep_length = len(page.find_all('article', {'class': 'message'}))
		heading = page.find('h1')
		title = heading.text if heading is not None else ''
		return [title] * rep_length

	@staticmethod
	def get_comments(page):
		""" Return the text of every ``div.message-userContent`` element """
		blocks = page.find_all('div', {'class': 'message-userContent'})
		return [block.text for block in blocks]

	@staticmethod
	def get_names(page):
		""" Return the ``data-author`` attribute of every ``article.message``.

		A message without that attribute contributes an empty string so
		the list keeps one entry per message.
		"""
		messages = page.find_all('article', {'class': 'message'})
		return [message.attrs.get('data-author', '') for message in messages]

	def parse(self, url):
		""" Fetch ``url`` and return its replies as a ``TextContent``.

		:return: ``TextContent`` with three lists (title, comments, names);
			all three are empty when the page could not be downloaded
		"""
		soup = self.get_page(url)
		if soup is None:
			# keep the caller's extend() calls working for a failed page
			return TextContent([], [], [])
		title = self.get_title(soup)
		comments = self.get_comments(soup)
		names = self.get_names(soup)
		return TextContent(title, comments, names)

## Get the posts text content from the post links

In [304]:
def comments_pages_links(df, per_page=20):
	"""
	Return, for every row of ``df``, the urls of all its comment pages.

	:param df: DataFrame with a ``links`` column (thread url) and an
		integer ``replies`` column
	:param per_page: number of comments shown on one thread page
	:return: list with one url list per row: the thread url itself,
		followed by ``page-2``, ``page-3``, ... sub-pages
	"""
	url_list = []
	for link, replies in zip(df['links'], df['replies']):
		# sub-pages are appended to the thread url as a path segment
		base = link if link.endswith('/') else link + '/'
		# replies // per_page further pages follow the first one; this
		# assumes the reply counter excludes the opening post, so exactly
		# `per_page` replies already spill onto a second page
		# TODO confirm against the forum's reply counter
		extra_pages = int(replies) // per_page
		urls = [link]
		urls.extend(f'{base}page-{page}' for page in range(2, extra_pages + 2))
		url_list.append(urls)

	return url_list

col_names = ['title', 'comments', 'names']

def get_post_text(url_list):
	""" Crawl all comment pages and return their text as a DataFrame.

	:param url_list: list of url lists, one inner list per thread
	:return: DataFrame with the columns title, comments and names, one
		row per crawled comment
	"""
	# Text that ends with this marker precedes the reply itself and is cut
	# off. The dots are escaped so only a literal "..." matches.
	quote_pattern = re.compile(r'.*?\n\nClick to expand\.\.\.\n\n', re.S)

	# one crawler is enough: it keeps no state between pages
	crawl_text = GetText()
	title, comments, names = [], [], []

	for urls in url_list:
		for link in urls:
			content = crawl_text.parse(link)
			title.extend(content.title)
			comments.extend(content.comments)
			names.extend(content.names)

	# remove the repeated text
	comments = [quote_pattern.sub('', comment) for comment in comments]

	# column names are fixed here so the result does not depend on the
	# current value of a module-level variable
	return pd.DataFrame(list(zip(title, comments, names)),
		columns=['title', 'comments', 'names'])

In [307]:
# Crawl every comment page linked from df_fiat (one request per page).
df_fiat_text = get_post_text(comments_pages_links(df_fiat))

In [312]:
# Show the first three crawled comments.
df_fiat_text.head(3)

Unnamed: 0,title,comments,names
0,Fiat 500e Convertible La Prima Test Drive/Review,\n\nHow to make this review short. Fiat have a Winner on their hands!\nI am in the process of lo...,MarkyM
1,Fiat 500e Convertible La Prima Test Drive/Review,"\n\nNice review, I am having a test drive on Tuesday, can I ask what sort of discounts were offe...",WillWord
2,Fiat 500e Convertible La Prima Test Drive/Review,\n\nI saw Jonny Smith’s early review of this and thought it looked great.\n \n\n,idiotzoo
3,Fiat 500e Convertible La Prima Test Drive/Review,\n\nI work in the NHS and with their finance package -Affinity? Would give a £2k discount Straig...,MarkyM
4,Fiat 500e Convertible La Prima Test Drive/Review,\n\nI noticed this site seems to have some really good prices - £23.5k for the La Prima hatch. N...,tophatron
...,...,...,...
23164,"Cheap 2015 Blizzak LM-500 Winter Tyres (19"")","\n\nSo the second pair of tyres has shown up, after couriers lost them, a cancellation and reboo...",POB_FBR
23165,"Cheap 2015 Blizzak LM-500 Winter Tyres (19"")",Bump\n \n\n,POB_FBR
23166,"Cheap 2015 Blizzak LM-500 Winter Tyres (19"")",\n\nI shouldn't have dithered on this perhaps as they've upped the price to £71.29! Still a pret...,charliestyr
23167,"Cheap 2015 Blizzak LM-500 Winter Tyres (19"")",\n\nMy 3 are still available if anyone wants them.\n \n\n,i3S


In [311]:
# Save the crawled comments to CSV without the row index.
df_fiat_text.to_csv('df_fiat_text.csv', index=False)

In [1]:
# Crawl every comment page linked from df_mini (one request per page).
df_mini_text = get_post_text(comments_pages_links(df_mini))

In [None]:
# Save the crawled comments to CSV without the row index.
df_mini_text.to_csv('df_mini_text.csv', index=False)

In [None]:
# Crawl every comment page linked from df_honda (one request per page).
df_honda_text = get_post_text(comments_pages_links(df_honda))

In [None]:
# Save the crawled comments to CSV without the row index.
df_honda_text.to_csv('df_honda_text.csv', index=False)

In [None]:
# Crawl every comment page linked from df_seat (one request per page).
df_seat_text = get_post_text(comments_pages_links(df_seat))

In [None]:
# Save the crawled comments to CSV without the row index.
df_seat_text.to_csv('df_seat_text.csv', index=False)

In [None]:
# Crawl every comment page linked from df_smart (one request per page).
df_smart_text = get_post_text(comments_pages_links(df_smart))

In [None]:
# Save the crawled comments to CSV without the row index.
df_smart_text.to_csv('df_smart_text.csv', index=False)

In [None]:
# Crawl every comment page linked from df_renault (one request per page).
df_renault_text = get_post_text(comments_pages_links(df_renault))

In [None]:
# Save the crawled comments to CSV without the row index.
df_renault_text.to_csv('df_renault_text.csv', index=False)

In [2]:
# Load the comments back from the CSV file instead of crawling again.
df_fiat_text = pd.read_csv('df_fiat_text.csv')

In [4]:
# Show the full text of the first comment.
df_fiat_text['comments'][0]

'\n\nHow to make this review short. Fiat have a Winner on their hands!\nI am in the process of looking for a replacement EV for my BMW i3 94Ah (33kW) Rex which has just gone 4 years old and now on 57,000 miles. I have been looking at and testing ideally long range EVs the past few days, as there are now a few about (Kona, Tesla M3, ID3 etc) but this car has caught my eye! I love a small car and the 500 shape has been around for a few years now. Invited to drive this and luckily for me the dealership here in South Wales had both coupe and convertible La Prima editions but only the convertible was ready for a test drive, which would not be my first choice,as the coupe has a lovely panoramic but non opening sunroof anyway. But also lucky as it was a stunningly beautiful sunny day today.\nThe design of this car is already a classic based as it is on the tiny original. Modernised and now even more smoothed off as an EV. Proportions almost perfect. What really has surprised me and my wife to

NameError: name 'cadk' is not defined