In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy
from spacy.matcher import Matcher, PhraseMatcher

In [127]:
keep = ['id','ask', 'bedrooms', 'numRooms', 'title','original_title',
'latitude','longitude','zip_muni','studio','one_bedroom','two_bedroom', 'three_bedroom', 
'four_bedroom','five_bedroom','six_bedroom','seven_bedroom','eight_bedroom',
'periodblt', 'roomrent', 'sublet','shortterm', 'shared']

In [128]:
df_full = pd.read_csv('listings_unique_NOT-FULL_2018_20191028.csv')


Columns (0,17,25,26,50) have mixed types. Specify dtype option on import or set low_memory=False.



In [129]:
df = df_full[keep].copy()

In [130]:
#Only Craiglist
df_a = df.iloc[15196:139214]
df_b = df.iloc[147617:273821]
df_c = df.iloc[284924:405947]
df_d = df.iloc[408712:]
df = pd.concat([df_a,df_b,df_c,df_d])

# PhaseMatcher - Keywords

### Compiling list of keywords from different data sources

In [131]:
words = pd.read_excel('Cambridge_listings_2018_flagged.xlsx', sheet_name='Auto-Skip Phrases')

In [132]:
words = words['ROOM AVAILABLE IN'].tolist()

In [133]:
words_2 = pd.read_excel('FullListings_2018_Misclassifications.xlsx', sheet_name='keywords')

In [134]:
#Extarct keywords from the ones previously used

words_mis_camb = []

words_c1 = words_2['roomrent'].tolist()
words_c2 = words_2['sublet'][:3].tolist()
words_c3 = words_2['shortterm'][:1].tolist()
words_c4 = words_2['shared'][:3].tolist()

all_words = [words_c1,words_c2,words_c3,words_c4]

for w in all_words:
    words_mis_camb = words_mis_camb+w
     

In [135]:
#Cleaning keywords
words_mis_camb[3] = 'ONE ROOM IN'
words_mis_camb[4] = 'ONE BEDROOM IN'
words_mis_camb[5] = 'ONE BEDROOM AVAILABLE IN'
words_mis_camb[12] = 'ROOMS AVAILABLE IN'
words_mis_camb[15] = 'ROOM IN'
words_mis_camb[21] = 'BEDROOMS AVAILABLE IN'
words_mis_camb[23] = 'BEDROOMS OPEN IN'

In [136]:
words2 = pd.DataFrame(words_mis_camb, columns=['words'])

In [114]:
# creating a bool series from isin() 
isnotindf = words2[~words2["words"].isin(words)]
isnotindf = isnotindf['words'].tolist()
phrase_list = isnotindf + words
    

### Completed compiling list of keywords

In [139]:
# Current list of keywords
phrase_list 

['ROOM RENT',
 'ROOMMATE',
 'ROOMIE',
 'ONE ROOM IN',
 'ONE BEDROOM IN',
 'ROOM AVAILABLE IN',
 'ONE BEDROOM IN',
 'ONE ROOM IN',
 'ROOMMATES NEEDED',
 'ROOM IN',
 'ROOM IN',
 'SUBLET',
 'SHORT TERM',
 'SHARED',
 'SHARE',
 'ONE BEDROOM AVAILABLE IN',
 'PRIVATE ROOM',
 'ONE ROOM AVAILABLE',
 'FURNISHED BEDROOM',
 'APARTMENT SHARING',
 'ROOMS AVAILABLE IN',
 'ROOMMATES',
 'PRIVATE BEDROOM',
 'RENTING ROOM',
 'MASTER BEDROOM IN',
 'BEDROOMS AVAILABLE IN',
 'ONE BEDROOM OPEN IN',
 'BEDROOMS OPEN IN',
 'SUBLEASING',
 'SUBLEASE']

In [137]:
def filter_listing(dataframe, keywords):
    
    '''
    Input Parameters: 1) pandas dataframe from which the index 
                         and title will be extracted 
                      2) keywords as a list - already compiled a list for reference
                      
    Output: Returns 1) index of flaggled listing
                    2) Keyword found in listing
    '''
    #Spacy Phraser Object
    matcher = PhraseMatcher(nlp.vocab)
    
    #convert each phrase to a Doc object:
    phrase_patterns = [nlp(text) for text in phrase_list]

    # Pass each Doc object into matcher (note the use of the asterisk!):
    matcher.add('Cambridge_keywords', None, *phrase_patterns)
    
    all_matches = []
    idx_matches = []

    # Looping through dataset and instantiating Spacy docuemnt
    for x in df.itertuples():
        title = (x.title)
        idx = x.Index
        title = nlp(title)
        matches = matcher(title)

        #For title object, finding the keywords
        for match_id, start, end in matches:
            string_id = nlp.vocab.strings[match_id]  
            #span = keyword that was matched on
            span = title[start:end]  

            #Index that should be dropped - flagged
            idx_matches.append(idx)
            #Keyword that was flagged for that listing
            all_matches.append(span.text)
            
    return idx_matches, all_matches  

# Entity Recognition : 


In [32]:
df['title'].values

array([' STUDIO IN GREAT LOCATION GREAT PRICE ',
       ' HARVARD PORTER STUDIO RENOVATED HEAT INCLUDED ',
       ' STUDIO WENDELL STREET NO FEE PARKING RENT FITNESS CENTER ', ...,
       ' AMAZING EIGHT BEDROOM THREE BATH NEAR GALLAGHER PARK ',
       ' NINE BEDROOM IN ALLSTON ',
       ' HUGE EIGHT BEDROOM THREE BATHROOM OPEN FLOOR PLAN IN BIGHTON '],
      dtype=object)

In [33]:
sample = df.sample(frac= 0.0005)

for title in sample['title'].values:
    title = nlp(title)
    for ent in title.ents:
        if ent.label_ != 'CARDINAL':
            print(f'{ent.text:{50}} {ent.start_char:{2}} {ent.end_char:{2}} {ent.label_:{1}} {spacy.explain(ent.label_)}')
            #displacy.render(sentence,style='ent', jupyter=True)''

    

THIRD                                              13 18 ORDINAL "first", "second", etc.
BATHROOM                                           37 45 GPE Countries, cities, states
LUXE SEAPORT                                       26 38 GPE Countries, cities, states
CAMBRIDGE                                           1 10 GPE Countries, cities, states
1499                                               13 17 DATE Absolute or relative dates or periods
SEPTEMBER                                           1 10 DATE Absolute or relative dates or periods
NORTH END                                          26 35 ORG Companies, agencies, institutions, etc.
WEST FOUR BEDROOM                                  21 38 FAC Buildings, airports, highways, bridges, etc.
SOMETHING YEAR                                     36 50 EVENT Named hurricanes, battles, wars, sports events, etc.
AVAILABLE                                           1 10 GPE Countries, cities, states
JUNE                                    

## Plan of Action for Entity Recognition/Amentities: 
### keywords would be better to locate custom entities and find variation in location & price