In [17]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import urllib
import time
import concurrent.futures
import random

In [6]:
def get_json(URL):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    relevant_part = soup.find_all("script")
    json_text = relevant_part[5].text.replace("window.jsonModel = ", "")
    required_json = json.loads(json_text)
    return required_json

In [7]:
def get_region_code(location):
    response = requests.get(
        "https://www.rightmove.co.uk/house-prices/" + location + ".html"
    )
    soup = BeautifulSoup(response.content, "html.parser")
    relevant_part = soup.find_all("script")[3]
    json_text = relevant_part.text.replace("window.__PRELOADED_STATE__ = ", "")
    required_json = json.loads(json_text)
    return required_json["searchLocation"]["locationId"]

In [8]:
def create_url(location, min_price ='', max_price = '', min_bedrooms='', max_bedrooms='', min_bathrooms='', max_bathrooms='', radius='', property_type='', index=''):
    base_url = "https://www.rightmove.co.uk/property-for-sale/find.html?"
    location_code = get_region_code(location)
    params = {'searchType': 'SALE', 'locationIdentifier': 'REGION^' + location_code, 
              "radius": radius, "minPrice":min_price,"maxPrice":max_price, 
              "minBedrooms":min_bedrooms, "maxBedrooms":max_bedrooms, 
              "minBathrooms":min_bathrooms, "maxBathrooms":'', "propertyTypes":property_type,
             "index":index}
    final_url = base_url + urllib.parse.urlencode(params)
    return final_url

In [6]:
get_json(create_url("manchester"))

{'properties': [{'id': 130152713,
   'bedrooms': 4,
   'bathrooms': 1,
   'numberOfImages': 17,
   'numberOfFloorplans': 0,
   'numberOfVirtualTours': 1,
   'summary': 'Normie & Company are pleased to bring to market this four bedroom detached property, which is ideally located close to Heaton Park, off Park Road.',
   'displayAddress': 'Craigwell Road, Prestwich, M25',
   'countryCode': 'GB',
   'location': {'latitude': 53.523592, 'longitude': -2.254963},
   'propertyImages': {'images': [{'srcUrl': 'https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/40k/39452/130152713/39452_1db93034-608d-4238-8b2b-8c36fbd5134d_IMG_00_0000_max_476x317.jpeg',
      'url': '40k/39452/130152713/39452_1db93034-608d-4238-8b2b-8c36fbd5134d_IMG_00_0000.jpeg',
      'caption': None},
     {'srcUrl': 'https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/40k/39452/130152713/39452_1db93034-608d-4238-8b2b-8c36fbd5134d_IMG_01_0000_max_476x317.jpeg',
      'url': '40k/39452/130152713/39452_1db93034-608d-4238-8b2b

In [9]:
def json_to_df(json):
    df = pd.DataFrame(
        columns=[
            "Summary"
        ]
    )
    for i in range(len(json["properties"])):
        df.loc[i] = [
            json["properties"][i]["summary"],
        ]
    return df

In [8]:
data = json_to_df(get_json(create_url("manchester")))

In [9]:
data["Summary"][4]

'Wow.......Dunelm.........It truly is a hidden gem! Occupying a commanding yet concealed position just off the prestigious Chatsworth Road in Worsley. A simply outstanding detached family home which offers an abundance of space set over four floors. On a quiet no-through road from the moment you e...'

In [10]:
def create_url_list(
    location,
    min_price="",
    max_price="",
    min_bedrooms="",
    max_bedrooms="",
    min_bathrooms="",
    max_bathrooms="",
    radius="",
    property_type="",
):
    def get_index(i):
        return create_url(
            location,
            min_price,
            max_price,
            min_bedrooms,
            max_bedrooms,
            min_bathrooms,
            max_bathrooms,
            radius,
            property_type,
            index=24 * (i - 1))
    url_list=[]
    threads = 30
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor = executor.map(get_index, range(43))
        for url in executor:
            url_list.append(url)
    return url_list

In [11]:
def download_jsons(url_list):
    threads = min(30, len(url_list))
    json_list=[]
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor = executor.map(get_json, url_list)
        for json in executor:
            json_list.append(json)
        return json_list  

In [12]:
def create_df(jsons):
    df = pd.DataFrame(
        columns=[
            "Summary"
        ]
    )
    for json in jsons:
        new_df = json_to_df(json)
        df = pd.concat([df, new_df], ignore_index=True, axis=0)
    return df.drop_duplicates()    

In [13]:
def create_table(
    location,
    min_price="",
    max_price="",
    min_bedrooms="",
    max_bedrooms="",
    min_bathrooms="",
    max_bathrooms="",
    radius="",
    property_type="",
):
    return create_df(download_jsons(create_url_list(location,
            min_price,
            max_price,
            min_bedrooms,
            max_bedrooms,
            min_bathrooms,
            max_bathrooms,
            radius,
            property_type)))

In [29]:
data = create_table("manchester")

In [30]:
data

Unnamed: 0,Summary
0,NO VENDOR CHAIN! LOCATE ESTATE AGENTS are deli...
1,"OFF MARKET BUILDINGS AVAILABLE, PRICES FROM £4..."
2,This exquisite 3 bed duplex apartment situated...
3,A beautifully presented block of four apartmen...
4,Wow.......Dunelm.........It truly is a hidden ...
...,...
1070,Local Blackfriars is a gated community compris...
1071,NO ONWARD CHAIN. Looking for a very large thre...
1072,SEARCH...SEE...LOVE... this excellent three be...
1073,Available with no onward chain is this spaciou...


In [48]:
city_list = ["manchester", "leeds"]

In [67]:
def create_full_table(city_list):
    df = pd.DataFrame(
        columns=[
            "Summary"
        ]
    )
    for city in city_list:
        new_df = create_table(city)
        df = pd.concat([df, new_df], ignore_index=True, axis=0)
        time.sleep(random.randint(3, 7))
        print(city + " is loaded")
    return df.drop_duplicates()    

In [49]:
create_full_table(city_list)

Unnamed: 0,Summary
0,Apartments with 11.7% price growth forecast in...
1,"OFF MARKET BUILDINGS AVAILABLE, PRICES FROM £4..."
2,This exquisite 3 bed duplex apartment situated...
3,A beautifully presented block of four apartmen...
4,Wow.......Dunelm.........It truly is a hidden ...
...,...
1940,A SPACIOUS SIX BEDROOMED TERRACED PROPERTY ON...
1941,"Strike are pleased to offer this stunning, ful..."
1942,A deceptively spacious end-terrace property wi...
1943,*** Extended detached family home *** Highly s...


In [15]:
city_list_scotland = ["Aberdeen",
    "Dundee",
    "Dunfermline",
    "Edinburgh",
    "Glasgow",
    "Inverness",
    "Perth",
    "Stirling",
]

In [18]:
scotland = create_full_table(city_list_scotland)

In [20]:
scotland.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2986 entries, 0 to 3002
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Summary  2986 non-null   object
dtypes: object(1)
memory usage: 46.7+ KB


In [24]:
scotland.to_csv(r"C:\Users\Lawrence\Desktop\NLP-Project\Scotland_csv", index=False)

In [25]:
scotland1 = pd.read_csv("scotland_csv")

In [28]:
scotland1["Summary"][3]

'Prime residential development opportunity on the former site of Aberdeen Exhibition and Conference Centre in the popular Bridge of Don area.'

In [30]:
city_list_wales = [
    "Bangor",
    "Cardiff",
    "Newport",
    "St-Asaph",
    "St-Davids",
    "Swansea",
    "Wrexham",
]


In [31]:
wales = create_full_table(city_list_wales)

In [32]:
wales

Unnamed: 0,Summary
0,Spacious and modern six bedroom HMO with curre...
1,This significant site has been owned by the De...
2,*Property Ref - IW0179* ** WATCH OUR VIDEO TOU...
3,A rare opportunity to acquire this beautiful s...
4,"Enjoying a highly sought after location, in th..."
...,...
2707,A mid terraced property with lawned garden to ...
2708,1 Bedroom PURPOSE BUILT first floor flat with ...
2709,Situated a short distance from Wrexham town ce...
2710,An excellent opportunity to purchase a prime b...


In [33]:
wales.to_csv(r"C:\Users\Lawrence\Desktop\NLP-Project\Wales_csv", index=False)

In [34]:
scotland

Unnamed: 0,Summary
0,SUPERB SHEDDOCKSLEY! Click the Virtual Tour li...
1,"Exceptional Care Home, Retirement Living and R..."
2,Spectacular mansion house set on a south facin...
3,Prime residential development opportunity on t...
4,A magnificent residence set in about 3.47 acre...
...,...
2998,A traditional main door period flat in the hea...
2999,"A deceptively spacious three-bedroom, end-terr..."
3000,New to the Market. NO ONWARD CHAIN! This Semi-...
3001,This traditional one bedroom ground floor flat...


In [40]:
len(scotland["Summary"][0].split())

48

In [65]:
wales1 = pd.read_csv("wales_csv")

In [63]:
def number_of_words(city):
    word_count = 0
    for sentence_num in range(len(wales1["Summary"])):
        #print(sentence_num)
        word_count += len(wales1["Summary"][sentence_num].split())
    return word_count    

In [66]:
number_of_words(scotland)

107328

In [60]:
scotland1["Summary"][1005]

'Nestled around the Water of Leith this two bedroom ground floor apartment offers stylish open plan living, kitchen, dining area with private terrace, family bathroom and principal en suite.'

In [68]:
city_list_england_1 = ["Bath",
"Birmingham",
"Bradford",
"Brighton",
"Bristol",
]

In [69]:
england_1 = create_full_table(city_list_england_1)

Bath is loaded
Birmingham is loaded
Bradford is loaded
Brighton is loaded
Bristol is loaded


In [70]:
england_1.to_csv(r"C:\Users\Lawrence\Desktop\NLP-Project\England_1_csv", index=False)

In [71]:
england_1_csv = pd.read_csv("England_1_csv")

In [72]:
england_1_csv

Unnamed: 0,Summary
0,This modern detached bungalow offers well prop...
1,A substantial Grade I Listed five storey terra...
2,A pre-eminent example of classical Georgian ar...
3,Spectacular third floor apartment offering fle...
4,Sion Hill Place is an absolute hidden gem of p...
...,...
4444,"An exceedingly spacious and light filled, 3 do..."
4445,Matthews Estates are pleased to offer to the o...
4446,Situated in Rathbone Close is this three bedro...
4447,A delightful 2 bedroom apartment with open pla...


In [73]:
city_list_england_2 = [
    "Cambridge",
    "Canterbury",
    "Carlisle",
    "Chelmsford",
    "Chester",
    "Chichester",
    "Colchester",
    "Coventry",
]

In [74]:
england_2 = create_full_table(city_list_england_2)

Cambridge is loaded
Canterbury is loaded
Carlisle is loaded
Chelmsford is loaded
Chester is loaded
Chichester is loaded
Colchester is loaded
Coventry is loaded


In [75]:
england_2.to_csv(r"C:\Users\Lawrence\Desktop\NLP-Project\England_2_csv", index=False)

In [77]:
england_2_csv = pd.read_csv("England_2_csv")

In [78]:
england_2_csv

Unnamed: 0,Summary
0,A substantial and elegant Victorian townhouse ...
1,Substantial property with annexe & walled gard...
2,Stunning substantial home on popular road in t...
3,"This multi-award-winning, high specification e..."
4,Detached mid-century family house in need of r...
...,...
4913,Reeds Rains Coventry are pleased to offer with...
4914,Viewing is highly recommended for this decepti...
4915,Reeds Rains are pleased to offer this THREE BE...
4916,"*Offers in Excess of £200,000 - £220,000 +* Wo..."


In [79]:
city_list_england_3 = [
    "Derby",
    "Doncaster",
    "Durham",
    "Ely",
    "Exeter",
    "Gloucester",
    "Hereford",
    "Hull",
]

In [80]:
england_3 = create_full_table(city_list_england_3)

Derby is loaded
Doncaster is loaded
Durham is loaded
Ely is loaded
Exeter is loaded
Gloucester is loaded
Hereford is loaded
Hull is loaded


In [81]:
england_3.to_csv(r"C:\Users\Lawrence\Desktop\NLP-Project\England_3_csv", index=False)

In [82]:
england_3_csv = pd.read_csv("England_3_csv")

In [83]:
england_3_csv

Unnamed: 0,Summary
0,EXCLUSIVE LOCATION & GENEROUS GARDEN PLOT - A ...
1,BEAUTIFUL HOME SET IN LARGE GARDENS - Fine 192...
2,***NEW OFFER - Reserve a plot at Stephenson Ga...
3,***NEW OFFER - Reserve a plot at Stephenson Ga...
4,OFFERING EXCELLENT POTENTIAL FOR FURTHER REDEV...
...,...
5267,"Calling all investors, this is the perfect inv..."
5268,"INVITING OFFERS BETWEEN £100,000 TO £110,000 A..."
5269,"***Guide Price £100,000 - £110,000 plus Reserv..."
5270,A fantastic two bedroom semi-detached property...


In [84]:
city_list_england_4 = [
    "Lancaster",
    "Leeds",
    "Leicester",
    "Lichfield",
    "Lincoln",
    "Liverpool",
    "London",
]

In [85]:
england_4 = create_full_table(city_list_england_4)

Lancaster is loaded
Leeds is loaded
Leicester is loaded
Lichfield is loaded
Lincoln is loaded
Liverpool is loaded
London is loaded


In [86]:
england_4.to_csv(r"C:\Users\Lawrence\Desktop\NLP-Project\England_4_csv", index=False)

In [87]:
england_4_csv = pd.read_csv("England_4_csv")

In [88]:
england_4_csv

Unnamed: 0,Summary
0,Representing a brilliant investment or first t...
1,"Built in 1978, this imaginatively designed, co..."
2,Formerly the coach house for the adjacent coun...
3,NO CHAIN ***** EXTENDED TERRACED HOUSE *** 5/6...
4,Forming part of the historical and charismatic...
...,...
5481,A rarely available seven bedroom detached fami...
5482,An exceptionally rare opportunity to deliver t...
5483,As you enter Queens Gate Gardens through the o...
5484,A five bedroom freehold house spanning at almo...


In [90]:
city_list_england_5 = [
    "Manchester",
    "Milton-Keynes",
    "Newcastle-upon-Tyne",
    "Norwich",
    "Nottingham",
    "Oxford",
    "Peterborough",
    "Plymouth",
    "Portsmouth",
    "Preston",
]

In [91]:
england_5 = create_full_table(city_list_england_5)

Manchester is loaded
Milton-Keynes is loaded
Newcastle-upon-Tyne is loaded
Norwich is loaded
Nottingham is loaded
Oxford is loaded
Peterborough is loaded
Plymouth is loaded
Portsmouth is loaded
Preston is loaded


In [92]:
england_5.to_csv(r"C:\Users\Lawrence\Desktop\NLP-Project\England_5_csv", index=False)

In [93]:
england_5_csv = pd.read_csv("England_5_csv")

In [94]:
england_5_csv

Unnamed: 0,Summary
0,Residential buy to let opportunity for any inv...
1,"OFF MARKET BUILDINGS AVAILABLE, PRICES FROM £4..."
2,This exquisite 3 bed duplex apartment situated...
3,Wow.......Dunelm.........It truly is a hidden ...
4,A beautifully presented block of four apartmen...
...,...
9252,We are pleased to offer to the auction this on...
9253,Modern one bedroom first floor apartment boast...
9254,**SOLD STC**SIMILAR PROPERTIES REQUIRED** Haze...
9255,Ben Rose Estate Agents are pleased to present ...


In [95]:
city_list_england_6 = [
    "Ripon",
    "Salford",
    "Salisbury",
    "Sheffield",
    "Southampton",
    "Southend-on-Sea",
    "St-Albans",
    "Stoke-on-Trent",
    "Sunderland",
]

In [96]:
england_6 = create_full_table(city_list_england_6)

Ripon is loaded
Salford is loaded
Salisbury is loaded
Sheffield is loaded
Southampton is loaded
Southend-on-Sea is loaded
St-Albans is loaded
Stoke-on-Trent is loaded
Sunderland is loaded


In [97]:
england_6.to_csv(r"C:\Users\Lawrence\Desktop\NLP-Project\England_6_csv", index=False)

In [98]:
england_6_csv = pd.read_csv("England_6_csv")

In [99]:
england_6_csv

Unnamed: 0,Summary
0,"Close to local schools ideal for families, Rip..."
1,A rare development/investment opportunity comp...
2,A stunning recently updated detached four-bedr...
3,A stunning period family home with a self-cont...
4,A building plot overlooking open countryside w...
...,...
6288,*FANTASTIC CASH INVESTMENT OPPORTUNITY* CIRCA ...
6289,INVESTOR POTENTIAL ONE BEDROOM MID TERRACE HOU...
6290,FOR SALE VIA I AM SOLD AUCTION T&C's apply. Th...
6291,Freehold House Arranged as Two Flats Vacant Po...


In [100]:
city_list_england_7 = [
    "Truro",
    "Wakefield",
    "Wells",
    "Winchester",
    "Wolverhampton",
    "Worcester",
    "York",
]

In [101]:
england_7 = create_full_table(city_list_england_7)

Truro is loaded
Wakefield is loaded
Wells is loaded
Winchester is loaded
Wolverhampton is loaded
Worcester is loaded
York is loaded


In [None]:
england_7.to_csv(r"C:\Users\Lawrence\Desktop\NLP-Project\England_7_csv", index=False)

In [None]:
england_7_csv = pd.read_csv("England_7_csv")

In [None]:
england_7_csv