<a href="https://colab.research.google.com/github/juliewang2020/FreeRealEstate/blob/master/parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Data


In [0]:
import pandas as pd
import io
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['data.csv']))


In [0]:
############## Cleaning the HTML

#get rid of ID and CreatedDate
df = df.loc[df['Country_USA'] == 1]
df = df.loc[df['Tag_Company'] == 1]

features = df.iloc[:, 1:]
features.drop('CreatedDate', axis = 1, inplace=True)
common_words = ['said', 'office','new','million','costar', 'square', 'building', 'market', 'space', 'year', 'estate', 'real', 'align', 'development', 'feet', 'property', 'percent', 'company','foot', 'investment', 'based', 'retail', 'years', 'according']
import re

def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext

for i in range(0, 80):
  body_text = features.iloc[i , 2]
  body_text = cleanhtml(body_text)
  body_text = body_text.replace('\n', ' ')
  body_text  = [word for word in re.split("\W+",body_text) if word.lower() not in common_words]
  body_text = ' '.join(body_text)
  features.loc[i, 'Body'] = body_text


In [0]:
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

In [51]:
import nltk
from sklearn.feature_extraction import text
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

eng_contractions = ["ain't", "amn't", "aren't", "can't", "could've", "couldn't",
                    "daresn't", "didn't", "doesn't", "don't", "gonna", "gotta", 
                    "hadn't", "hasn't", "haven't", "he'd", "he'll", "he's", "how'd",
                    "how'll", "how's", "I'd", "I'll", "I'm", "I've", "isn't", "it'd",
                    "it'll", "it's", "let's", "mayn't", "may've", "mightn't", 
                    "might've", "mustn't", "must've", "needn't", "o'clock", "ol'",
                    "oughtn't", "shan't", "she'd", "she'll", "she's", "should've",
                    "shouldn't", "somebody's", "someone's", "something's", "that'll",
                    "that're", "that's", "that'd", "there'd", "there're", "there's", 
                    "these're", "they'd", "they'll", "they're", "they've", "this's",
                    "those're", "tis", "twas", "twasn't", "wasn't", "we'd", "we'd've",
                    "we'll", "we're", "we've", "weren't", "what'd", "what'll", 
                    "what're", "what's", "what've", "when's", "where'd", "where're",
                    "where's", "where've", "which's", "who'd", "who'd've", "who'll",
                    "who're", "who's", "who've", "why'd", "why're", "why's", "won't",
                    "would've", "wouldn't", "y'all", "you'd", "you'll", "you're", 
                    "you've", "'s", "s"
                     ]

nltk.download('stopwords')
nltk.download('punkt')

custom_stopwords = text.ENGLISH_STOP_WORDS.union(eng_contractions)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
import nltk
import re

def tokenize_and_stem(text, do_stem=True):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    # stem filtered tokens
    stems = [stemmer.stem(t) for t in filtered_tokens]
    
    if do_stem:
        return stems
    else:
        return filtered_tokens

In [0]:
# not super pythonic, no, not at all.
# use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in range(0, 80):
    body_text = features.iloc[i , 2]
    allwords_stemmed = tokenize_and_stem(body_text)
    totalvocab_stemmed.extend(allwords_stemmed)
    
    allwords_tokenized = tokenize_and_stem(body_text, False)
    totalvocab_tokenized.extend(allwords_tokenized)  
  
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words=custom_stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(features['Body']) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names()

  'stop_words.' % sorted(inconsistent))


(148, 240)


In [60]:
from sklearn.cluster import KMeans
import math

num_clusters = int(math.sqrt(features.shape[0] / 2) * 1.5)

km = KMeans(n_clusters=num_clusters)

km.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=12, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [0]:
clusters = km.labels_.tolist()

features['cluster'] = clusters

In [62]:
print("Top terms per cluster:")
print()

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    print()
    for title in features[features['cluster'] == i]['Title'].values.tolist():
        print(' - %s' % title)
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: region, managers, u, high, asset, operate,

Cluster 0 titles:
 - nan
 - nan


Cluster 1 words: br, br, stores, retailer, said, companies,

Cluster 1 titles:
 - Scrapped Rite Aid-Albertsons Merger, Mattress Firm’s Financial Struggles Could Lead to Thousands of Vacancies
 - Cosmetics Retailers Ramp Up Plans to Open Stores, Distribution Hubs as Amazon Sparks Battle for Beauty
 - TJX Companies Plans to Add 2,000 More Stores
 - Lowe’s to Close All 99 Orchard Hardware Stores
 - A Moving Target: Smaller Stores, New Fulfillment Options Power Strong Performance
 - Ross Shoots for 3,000 Stores
 - Authentic Brands Group Makes $35 Million Bid for Brookstone Ahead of Auction
 - Tiffany & Co. to Renovate Flagship New York Store, Remodel Outlets Across the Country
 - American Eagle Joins Retailers Opening More Stores, Many in New Markets
 - Coca-Cola to Buy UK's Costa for $5.1 Billion to Add Global Coffee Brand
 - Landlords Could Seize Opportunity If Papa John